diff --git a/python/pyabacus/CONTRIBUTING.md b/python/pyabacus/CONTRIBUTING.md index b5d7728eae..dedd981e44 100644 --- a/python/pyabacus/CONTRIBUTING.md +++ b/python/pyabacus/CONTRIBUTING.md @@ -190,8 +190,10 @@ list(APPEND _diago ${HSOLVER_PATH}/diag_const_nums.cpp ${HSOLVER_PATH}/diago_iter_assist.cpp ${HSOLVER_PATH}/kernels/dngvd_op.cpp + ${HSOLVER_PATH}/kernels/bpcg_kernel_op.cpp ${BASE_PATH}/kernels/math_kernel_op.cpp - ${BASE_PATH}/kernels/math_op.cpp + ${BASE_PATH}/kernels/math_kernel_op_vec.cpp + ${BASE_PATH}/kernels/math_ylm_op.cpp ${BASE_PATH}/module_device/device.cpp ${BASE_PATH}/module_device/memory_op.cpp ${PSI_PATH}/psi.cpp diff --git a/python/pyabacus/src/ModuleBase/CMakeLists.txt b/python/pyabacus/src/ModuleBase/CMakeLists.txt index 1c2d9a728b..e89a065147 100644 --- a/python/pyabacus/src/ModuleBase/CMakeLists.txt +++ b/python/pyabacus/src/ModuleBase/CMakeLists.txt @@ -1,7 +1,8 @@ list(APPEND pymodule_base ${PROJECT_SOURCE_DIR}/src/ModuleBase/py_base_math.cpp - ${BASE_PATH}/kernels/math_op.cpp + ${BASE_PATH}/kernels/math_ylm_op.cpp ${BASE_PATH}/kernels/math_kernel_op.cpp + ${BASE_PATH}/kernels/math_kernel_op_vec.cpp ${BASE_PATH}/module_device/memory_op.cpp ${BASE_PATH}/module_device/device.cpp ) diff --git a/python/pyabacus/src/ModuleNAO/CMakeLists.txt b/python/pyabacus/src/ModuleNAO/CMakeLists.txt index 5e86604adc..b9b7fe9935 100644 --- a/python/pyabacus/src/ModuleNAO/CMakeLists.txt +++ b/python/pyabacus/src/ModuleNAO/CMakeLists.txt @@ -13,8 +13,9 @@ list(APPEND _naos ${NAO_PATH}/two_center_integrator.cpp ${NAO_PATH}/two_center_table.cpp # dependency - ${ABACUS_SOURCE_DIR}/module_base/kernels/math_op.cpp + ${ABACUS_SOURCE_DIR}/module_base/kernels/math_ylm_op.cpp ${ABACUS_SOURCE_DIR}/module_base/kernels/math_kernel_op.cpp + ${ABACUS_SOURCE_DIR}/module_base/kernels/math_kernel_op_vec.cpp # ${ABACUS_SOURCE_DIR}/module_psi/kernels/psi_memory_op.cpp ${ABACUS_SOURCE_DIR}/module_base/module_device/memory_op.cpp ${ABACUS_SOURCE_DIR}/module_base/module_device/device.cpp diff --git a/python/pyabacus/src/hsolver/CMakeLists.txt b/python/pyabacus/src/hsolver/CMakeLists.txt index 4bd0153b48..7c9f216870 100644 --- a/python/pyabacus/src/hsolver/CMakeLists.txt +++ b/python/pyabacus/src/hsolver/CMakeLists.txt @@ -10,9 +10,11 @@ list(APPEND _diago ${HSOLVER_PATH}/kernels/dngvd_op.cpp + ${HSOLVER_PATH}/kernels/bpcg_kernel_op.cpp # dependency ${BASE_PATH}/kernels/math_kernel_op.cpp - ${BASE_PATH}/kernels/math_op.cpp + ${BASE_PATH}/kernels/math_kernel_op_vec.cpp + ${BASE_PATH}/kernels/math_ylm_op.cpp ${BASE_PATH}/module_device/device.cpp ${BASE_PATH}/module_device/memory_op.cpp diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 5acce9103f..43e8fcdc16 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -36,6 +36,7 @@ list(APPEND device_srcs module_hamilt_pw/hamilt_stodft/kernels/hpsi_norm_op.cpp module_basis/module_pw/kernels/pw_op.cpp module_hsolver/kernels/dngvd_op.cpp + module_hsolver/kernels/bpcg_kernel_op.cpp module_elecstate/kernels/elecstate_op.cpp # module_psi/kernels/psi_memory_op.cpp @@ -44,13 +45,14 @@ list(APPEND device_srcs module_base/module_device/device.cpp module_base/module_device/memory_op.cpp module_base/kernels/math_kernel_op.cpp + module_base/kernels/math_kernel_op_vec.cpp module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.cpp module_hamilt_pw/hamilt_pwdft/kernels/wf_op.cpp module_hamilt_pw/hamilt_pwdft/kernels/vnl_op.cpp - 
module_base/kernels/math_op.cpp + module_base/kernels/math_ylm_op.cpp module_hamilt_general/module_xc/kernels/xc_functional_op.cpp ) @@ -64,6 +66,7 @@ if(USE_CUDA) module_hamilt_pw/hamilt_pwdft/kernels/cuda/onsite_op.cu module_basis/module_pw/kernels/cuda/pw_op.cu module_hsolver/kernels/cuda/dngvd_op.cu + module_hsolver/kernels/cuda/bpcg_kernel_op.cu module_elecstate/kernels/cuda/elecstate_op.cu # module_psi/kernels/cuda/memory_op.cu @@ -73,8 +76,9 @@ if(USE_CUDA) module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu module_hamilt_pw/hamilt_pwdft/kernels/cuda/wf_op.cu module_hamilt_pw/hamilt_pwdft/kernels/cuda/vnl_op.cu - module_base/kernels/cuda/math_op.cu + module_base/kernels/cuda/math_ylm_op.cu module_base/kernels/cuda/math_kernel_op.cu + module_base/kernels/cuda/math_kernel_op_vec.cu module_hamilt_general/module_xc/kernels/cuda/xc_functional_op.cu ) endif() @@ -89,6 +93,7 @@ if(USE_ROCM) module_hamilt_pw/hamilt_stodft/kernels/rocm/hpsi_norm_op.hip.cu module_basis/module_pw/kernels/rocm/pw_op.hip.cu module_hsolver/kernels/rocm/dngvd_op.hip.cu + module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu module_elecstate/kernels/rocm/elecstate_op.hip.cu # module_psi/kernels/rocm/memory_op.hip.cu @@ -99,7 +104,8 @@ if(USE_ROCM) module_hamilt_pw/hamilt_pwdft/kernels/rocm/wf_op.hip.cu module_hamilt_pw/hamilt_pwdft/kernels/rocm/vnl_op.hip.cu module_base/kernels/rocm/math_kernel_op.hip.cu - module_base/kernels/rocm/math_op.hip.cu + module_base/kernels/rocm/math_kernel_op_vec.hip.cu + module_base/kernels/rocm/math_ylm_op.hip.cu module_hamilt_general/module_xc/kernels/rocm/xc_functional_op.hip.cu ) endif() diff --git a/source/Makefile.Objects b/source/Makefile.Objects index f15bde444f..ffb49d2c6d 100644 --- a/source/Makefile.Objects +++ b/source/Makefile.Objects @@ -146,8 +146,9 @@ OBJS_BASE=abfs-vector3_order.o\ math_ylmreal.o\ math_bspline.o\ math_chebyshev.o\ - math_op.o\ + math_ylm_op.o\ math_kernel_op.o\ + math_kernel_op_vec.o\ mathzone_add1.o\ matrix.o\ matrix3.o\ @@ -349,6 +350,7 @@ OBJS_HSOLVER=diago_cg.o\ hsolver_pw_sdft.o\ diago_iter_assist.o\ dngvd_op.o\ + bpcg_kernel_op.o\ diag_const_nums.o\ diag_hs_para.o\ diago_pxxxgvx.o\ diff --git a/source/module_base/kernels/cuda/math_kernel_op.cu b/source/module_base/kernels/cuda/math_kernel_op.cu index 7227df630d..3a42767925 100644 --- a/source/module_base/kernels/cuda/math_kernel_op.cu +++ b/source/module_base/kernels/cuda/math_kernel_op.cu @@ -8,22 +8,6 @@ #include #include #include - -namespace ModuleBase -{ -const int warp_size = 32; -// const unsigned int full_mask = 0xffffffff; -const int thread_per_block = 256; -} - -template <> -struct GetTypeReal> { - using type = float; /**< The return type specialization for std::complex. */ -}; -template <> -struct GetTypeReal> { - using type = double; /**< The return type specialization for std::complex. 
*/ -}; namespace ModuleBase { template struct GetTypeThrust { @@ -42,12 +26,10 @@ struct GetTypeThrust> { static cublasHandle_t cublas_handle = nullptr; -static inline void xdot_wrapper(const int &n, const float * x, const int &incx, const float * y, const int &incy, float &result) { cublasErrcheck(cublasSdot(cublas_handle, n, x, incx, y, incy, &result)); } -static inline void xdot_wrapper(const int &n, const double * x, const int &incx, const double * y, const int &incy, double &result) { cublasErrcheck(cublasDdot(cublas_handle, n, x, incx, y, incy, &result)); } @@ -70,310 +52,57 @@ void destoryBLAShandle(){ // for (int offset = 16; offset > 0; offset >>= 1) // val += __shfl_down_sync(full_mask, val, offset); // } - -template -__global__ void line_minimize_with_block( - thrust::complex* grad, - thrust::complex* hgrad, - thrust::complex* psi, - thrust::complex* hpsi, - const int n_basis, - const int n_basis_max) +template <> +void scal_op::operator()(const int& N, + const std::complex* alpha, + std::complex* X, + const int& incx) { - int band_idx = blockIdx.x; // band_idx - int tid = threadIdx.x; // basis_idx - int item = 0; - Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0; - Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0; - __shared__ Real data[thread_per_block * 3]; - - data[tid] = 0; - - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { - item = band_idx * n_basis_max + basis_idx; - data[tid] += (grad[item] * thrust::conj(grad[item])).real(); - } - __syncthreads(); - // just do some parallel reduction in shared memory - for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - } - __syncthreads(); - } - - // For threads in the same warp, it is better that they process the same work - // Also, __syncwarp() should be used instead of __syncthreads() - // Therefore we unroll the loop and ensure that the threads does the same work - if (tid < warp_size) { - data[tid] += data[tid + 32]; __syncwarp(); - data[tid] += data[tid + 16]; __syncwarp(); - data[tid] += data[tid + 8]; __syncwarp(); - data[tid] += data[tid + 4]; __syncwarp(); - data[tid] += data[tid + 2]; __syncwarp(); - data[tid] += data[tid + 1]; __syncwarp(); - } - - __syncthreads(); - - Real norm = 1.0 / sqrt(data[0]); - __syncthreads(); - - data[tid] = 0; - data[thread_per_block + tid] = 0; - data[2 * thread_per_block + tid] = 0; - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { - item = band_idx * n_basis_max + basis_idx; - grad[item] *= norm; - hgrad[item] *= norm; - data[tid] += (hpsi[item] * thrust::conj(psi[item])).real(); - data[thread_per_block + tid] += (grad[item] * thrust::conj(hpsi[item])).real(); - data[2 * thread_per_block + tid] += (grad[item] * thrust::conj(hgrad[item])).real(); - } - __syncthreads(); - - // just do some parallel reduction in shared memory - for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - data[thread_per_block + tid] += data[thread_per_block + tid + ii]; - data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + ii]; - } - __syncthreads(); - } - if (tid < warp_size) { - data[tid] += data[tid + 32]; __syncwarp(); - data[tid] += data[tid + 16]; __syncwarp(); - data[tid] += data[tid + 8]; __syncwarp(); - data[tid] += data[tid + 4]; __syncwarp(); - data[tid] += data[tid + 2]; __syncwarp(); - data[tid] += data[tid + 1]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 
32]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 16]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 8]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 4]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 2]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 1]; __syncwarp(); - data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 32]; __syncwarp(); - data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 16]; __syncwarp(); - data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 8]; __syncwarp(); - data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 4]; __syncwarp(); - data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 2]; __syncwarp(); - data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 1]; __syncwarp(); - } - __syncthreads(); - epsilo_0 = data[0]; - epsilo_1 = data[thread_per_block]; - epsilo_2 = data[2 * thread_per_block]; - - theta = 0.5 * abs(atan(2 * epsilo_1/(epsilo_0 - epsilo_2))); - cos_theta = cos(theta); - sin_theta = sin(theta); - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { - item = band_idx * n_basis_max + basis_idx; - psi [item] = psi [item] * cos_theta + grad [item] * sin_theta; - hpsi[item] = hpsi[item] * cos_theta + hgrad[item] * sin_theta; - } + cublasErrcheck(cublasCscal(cublas_handle, N, (float2*)alpha, (float2*)X, incx)); } -template -__global__ void calc_grad_with_block( - const Real* prec, - Real* err, - Real* beta, - thrust::complex* psi, - thrust::complex* hpsi, - thrust::complex* grad, - thrust::complex* grad_old, - const int n_basis, - const int n_basis_max) +template <> +void scal_op::operator()(const int& N, + const std::complex* alpha, + std::complex* X, + const int& incx) { - int band_idx = blockIdx.x; // band_idx - int tid = threadIdx.x; // basis_idx - int item = 0; - Real err_st = 0.0; - Real beta_st = 0.0; - Real epsilo = 0.0; - Real grad_2 = 0.0; - thrust::complex grad_1 = {0, 0}; - __shared__ Real data[thread_per_block * 2]; - - // Init shared memory - data[tid] = 0; - - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { - item = band_idx * n_basis_max + basis_idx; - data[tid] += (psi[item] * thrust::conj(psi[item])).real(); - } - __syncthreads(); - // just do some parallel reduction in shared memory - for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - } - __syncthreads(); - } - - if (tid < warp_size) { - data[tid] += data[tid + 32]; __syncwarp(); - data[tid] += data[tid + 16]; __syncwarp(); - data[tid] += data[tid + 8]; __syncwarp(); - data[tid] += data[tid + 4]; __syncwarp(); - data[tid] += data[tid + 2]; __syncwarp(); - data[tid] += data[tid + 1]; __syncwarp(); - } - - __syncthreads(); - - Real norm = 1.0 / sqrt(data[0]); - __syncthreads(); - - data[tid] = 0; - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { - item = band_idx * n_basis_max + basis_idx; - psi[item] *= norm; - hpsi[item] *= norm; - data[tid] += (hpsi[item] * thrust::conj(psi[item])).real(); - } - __syncthreads(); - - // just do some parallel reduction in shared memory - for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - } - __syncthreads(); - } - - if (tid < warp_size) { - data[tid] += data[tid + 32]; 
__syncwarp(); - data[tid] += data[tid + 16]; __syncwarp(); - data[tid] += data[tid + 8]; __syncwarp(); - data[tid] += data[tid + 4]; __syncwarp(); - data[tid] += data[tid + 2]; __syncwarp(); - data[tid] += data[tid + 1]; __syncwarp(); - } - - __syncthreads(); - epsilo = data[0]; - __syncthreads(); - - data[tid] = 0; - data[thread_per_block + tid] = 0; - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { - item = band_idx * n_basis_max + basis_idx; - grad_1 = hpsi[item] - epsilo * psi[item]; - grad_2 = thrust::norm(grad_1); - data[tid] += grad_2; - data[thread_per_block + tid] += grad_2 / prec[basis_idx]; - } - __syncthreads(); - - // just do some parallel reduction in shared memory - for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - data[thread_per_block + tid] += data[thread_per_block + tid + ii]; - } - __syncthreads(); - } - - if (tid < warp_size) { - data[tid] += data[tid + 32]; __syncwarp(); - data[tid] += data[tid + 16]; __syncwarp(); - data[tid] += data[tid + 8]; __syncwarp(); - data[tid] += data[tid + 4]; __syncwarp(); - data[tid] += data[tid + 2]; __syncwarp(); - data[tid] += data[tid + 1]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 32]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 16]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 8]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 4]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 2]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 1]; __syncwarp(); - } - - __syncthreads(); - err_st = data[0]; - beta_st = data[thread_per_block]; - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { - item = band_idx * n_basis_max + basis_idx; - grad_1 = hpsi[item] - epsilo * psi[item]; - grad[item] = -grad_1 / prec[basis_idx] + beta_st / beta[band_idx] * grad_old[item]; - } - - __syncthreads(); - if (tid == 0) { - beta[band_idx] = beta_st; - err[band_idx] = sqrt(err_st); - } + cublasErrcheck(cublasZscal(cublas_handle, N, (double2*)alpha, (double2*)X, incx)); } -// Define the CUDA kernel: -template -__global__ void vector_div_constant_kernel( - const int size, - T* result, - const T* vector, - const typename GetTypeReal::type constant) +template <> +void axpy_op::operator()(const int& N, + const double* alpha, + const double* X, + const int& incX, + double* Y, + const int& incY) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - result[i] = vector[i] / constant; - } + cublasErrcheck(cublasDaxpy(cublas_handle, N, alpha, X, incX, Y, incY)); } -template -__global__ void vector_mul_vector_kernel(const int size, - T* result, - const T* vector1, - const typename GetTypeReal::type* vector2, - const bool add) +template <> +void axpy_op, base_device::DEVICE_GPU>::operator()(const int& N, + const std::complex* alpha, + const std::complex* X, + const int& incX, + std::complex* Y, + const int& incY) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - if (add) - { - result[i] += vector1[i] * vector2[i]; - } - else - { - result[i] = vector1[i] * vector2[i]; - } - } + cublasErrcheck(cublasCaxpy(cublas_handle, N, (float2*)alpha, (float2*)X, incX, (float2*)Y, incY)); } -template -__global__ void vector_div_vector_kernel( - const int size, - T* result, - const T* vector1, - const typename GetTypeReal::type* 
vector2) +template <> +void axpy_op, base_device::DEVICE_GPU>::operator()(const int& N, + const std::complex* alpha, + const std::complex* X, + const int& incX, + std::complex* Y, + const int& incY) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - result[i] = vector1[i] / vector2[i]; - } + cublasErrcheck(cublasZaxpy(cublas_handle, N, (double2*)alpha, (double2*)X, incX, (double2*)Y, incY)); } -template -__global__ void constantvector_addORsub_constantVector_kernel( - const int size, - T* result, - const T* vector1, - const Real constant1, - const T* vector2, - const Real constant2) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - result[i] = vector1[i] * constant1 + vector2[i] * constant2; - } -} template __global__ void matrix_transpose_kernel( @@ -404,303 +133,6 @@ __global__ void matrix_copy_kernel(const int n1, const int n2, const T* A, const } } -template -void line_minimize_with_block_op::operator()(T* grad_out, - T* hgrad_out, - T* psi_out, - T* hpsi_out, - const int& n_basis, - const int& n_basis_max, - const int& n_band) -{ - auto A = reinterpret_cast*>(grad_out); - auto B = reinterpret_cast*>(hgrad_out); - auto C = reinterpret_cast*>(psi_out); - auto D = reinterpret_cast*>(hpsi_out); - - line_minimize_with_block<<>>( - A, B, C, D, - n_basis, n_basis_max); - - cudaCheckOnDebug(); -} - -template -void calc_grad_with_block_op::operator()(const Real* prec_in, - Real* err_out, - Real* beta_out, - T* psi_out, - T* hpsi_out, - T* grad_out, - T* grad_old_out, - const int& n_basis, - const int& n_basis_max, - const int& n_band) -{ - auto A = reinterpret_cast*>(psi_out); - auto B = reinterpret_cast*>(hpsi_out); - auto C = reinterpret_cast*>(grad_out); - auto D = reinterpret_cast*>(grad_old_out); - - calc_grad_with_block<<>>( - prec_in, err_out, beta_out, - A, B, C, D, - n_basis, n_basis_max); - - cudaCheckOnDebug(); -} - -template <> -double dot_real_op::operator()(const int& dim, - const double* psi_L, - const double* psi_R, - const bool reduce) -{ - double result = 0.0; - xdot_wrapper(dim, psi_L, 1, psi_R, 1, result); - if (reduce) { - Parallel_Reduce::reduce_pool(result); - } - return result; -} -// for this implementation, please check -// https://thrust.github.io/doc/group__transformed__reductions_ga321192d85c5f510e52300ae762c7e995.html denghui modify -// 2022-10-03 Note that ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) ) GPU specialization of actual computation. 
-template -inline FPTYPE dot_complex_wrapper(const int& dim, - const std::complex* psi_L, - const std::complex* psi_R, - const bool reduce) -{ - //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - // denghui modify 2022-10-07 - // Note that ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) ) - const FPTYPE* pL = reinterpret_cast(psi_L); - const FPTYPE* pR = reinterpret_cast(psi_R); - FPTYPE result = 0.0; - xdot_wrapper(dim * 2, pL, 1, pR, 1, result); - if (reduce) { - Parallel_Reduce::reduce_pool(result); - } - return result; -} - -template <> -float dot_real_op, base_device::DEVICE_GPU>::operator()(const int& dim, - const std::complex* psi_L, - const std::complex* psi_R, - const bool reduce) -{ - return dot_complex_wrapper(dim, psi_L, psi_R, reduce); -} -template <> -double dot_real_op, base_device::DEVICE_GPU>::operator()(const int& dim, - const std::complex* psi_L, - const std::complex* psi_R, - const bool reduce) -{ - return dot_complex_wrapper(dim, psi_L, psi_R, reduce); -} - -// vector operator: result[i] = vector[i] / constant -template <> -void vector_div_constant_op::operator()(const int dim, - double* result, - const double* vector, - const double constant) -{ - // In small cases, 1024 threads per block will only utilize 17 blocks, much less than 40 - int thread = thread_per_block; - int block = (dim + thread - 1) / thread; - vector_div_constant_kernel <<>> (dim, result, vector, constant); - - cudaCheckOnDebug(); -} - -// vector operator: result[i] = vector[i] / constant -template -inline void vector_div_constant_complex_wrapper(const int dim, - std::complex* result, - const std::complex* vector, - const FPTYPE constant) -{ - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector_tmp = reinterpret_cast*>(vector); - - int thread = thread_per_block; - int block = (dim + thread - 1) / thread; - vector_div_constant_kernel> <<>> (dim, result_tmp, vector_tmp, constant); - - cudaCheckOnDebug(); -} -template <> -void vector_div_constant_op, base_device::DEVICE_GPU>::operator()(const int dim, - std::complex* result, - const std::complex* vector, - const float constant) -{ - vector_div_constant_complex_wrapper(dim, result, vector, constant); -} -template <> -void vector_div_constant_op, base_device::DEVICE_GPU>::operator()( - const int dim, - std::complex* result, - const std::complex* vector, - const double constant) -{ - vector_div_constant_complex_wrapper(dim, result, vector, constant); -} -// vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex) -template <> -void vector_mul_vector_op::operator()(const int& dim, - double* result, - const double* vector1, - const double* vector2, - const bool& add) -{ - int thread = thread_per_block; - int block = (dim + thread - 1) / thread; - vector_mul_vector_kernel <<>> (dim, result, vector1, vector2, add); - - cudaCheckOnDebug(); -} -// vector operator: result[i] = vector1[i](complex) * vector2[i](not complex) -template -inline void vector_mul_vector_complex_wrapper(const int& dim, - std::complex* result, - const std::complex* vector1, - const FPTYPE* vector2, - const bool& add) -{ - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); - int thread = thread_per_block; - int block = (dim + thread - 1) / thread; - vector_mul_vector_kernel> <<>> (dim, result_tmp, vector1_tmp, vector2, add); - - cudaCheckOnDebug(); -} -template <> -void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()(const 
int& dim, - std::complex* result, - const std::complex* vector1, - const float* vector2, - const bool& add) -{ - vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); -} -template <> -void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()( - const int& dim, - std::complex* result, - const std::complex* vector1, - const double* vector2, - const bool& add) -{ - vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); -} - -// vector operator: result[i] = vector1[i](not complex) / vector2[i](not complex) -template <> -void vector_div_vector_op::operator()(const int& dim, - double* result, - const double* vector1, - const double* vector2) -{ - int thread = thread_per_block; - int block = (dim + thread - 1) / thread; - vector_div_vector_kernel <<>> (dim, result, vector1, vector2); - - cudaCheckOnDebug(); -} -// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) -template -inline void vector_div_vector_complex_wrapper(const int& dim, - std::complex* result, - const std::complex* vector1, - const FPTYPE* vector2) -{ - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); - int thread = thread_per_block; - int block = (dim + thread - 1) / thread; - vector_div_vector_kernel> <<>> (dim, result_tmp, vector1_tmp, vector2); - - cudaCheckOnDebug(); -} -template <> -void vector_div_vector_op, base_device::DEVICE_GPU>::operator()(const int& dim, - std::complex* result, - const std::complex* vector1, - const float* vector2) -{ - vector_div_vector_complex_wrapper(dim, result, vector1, vector2); -} -template <> -void vector_div_vector_op, base_device::DEVICE_GPU>::operator()( - const int& dim, - std::complex* result, - const std::complex* vector1, - const double* vector2) -{ - vector_div_vector_complex_wrapper(dim, result, vector1, vector2); -} -// vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2 -template -void constantvector_addORsub_constantVector_op::operator()(const int& dim, - T* result, - const T* vector1, - const Real constant1, - const T* vector2, - const Real constant2) -{ - using Type = typename GetTypeThrust::type; - using Real = typename GetTypeReal::type; - - auto result_tmp = reinterpret_cast(result); - auto vector1_tmp = reinterpret_cast(vector1); - auto vector2_tmp = reinterpret_cast(vector2); - - int thread = thread_per_block; - int block = (dim + thread - 1) / thread; - constantvector_addORsub_constantVector_kernel <<>>(dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2); - - cudaCheckOnDebug(); -} - -template <> -void axpy_op::operator()(const int& N, - const double* alpha, - const double* X, - const int& incX, - double* Y, - const int& incY) -{ - cublasErrcheck(cublasDaxpy(cublas_handle, N, alpha, X, incX, Y, incY)); -} - -template <> -void axpy_op, base_device::DEVICE_GPU>::operator()(const int& N, - const std::complex* alpha, - const std::complex* X, - const int& incX, - std::complex* Y, - const int& incY) -{ - cublasErrcheck(cublasCaxpy(cublas_handle, N, (float2*)alpha, (float2*)X, incX, (float2*)Y, incY)); -} - -template <> -void axpy_op, base_device::DEVICE_GPU>::operator()(const int& N, - const std::complex* alpha, - const std::complex* X, - const int& incX, - std::complex* Y, - const int& incY) -{ - cublasErrcheck(cublasZaxpy(cublas_handle, N, (double2*)alpha, (double2*)X, incX, (double2*)Y, incY)); -} - cublasOperation_t judge_trans_op(bool is_complex, const char& trans, const char* name) { if (trans == 
'N') @@ -778,24 +210,6 @@ void gemv_op, base_device::DEVICE_GPU>::operator()(const ch cublasErrcheck(cublasZgemv(cublas_handle, cutrans, m, n, &alpha, (cuDoubleComplex*)A, lda, (cuDoubleComplex*)X, incx, &beta, (cuDoubleComplex*)Y, incx)); } -template <> -void scal_op::operator()(const int& N, - const std::complex* alpha, - std::complex* X, - const int& incx) -{ - cublasErrcheck(cublasCscal(cublas_handle, N, (float2*)alpha, (float2*)X, incx)); -} - -template <> -void scal_op::operator()(const int& N, - const std::complex* alpha, - std::complex* X, - const int& incx) -{ - cublasErrcheck(cublasZscal(cublas_handle, N, (double2*)alpha, (double2*)X, incx)); -} - template <> void gemm_op::operator()(const char& transa, const char& transb, @@ -1026,32 +440,8 @@ void matrixCopy, base_device::DEVICE_GPU>::operator()(const // Explicitly instantiate functors for the types of functor registered. -template struct dot_real_op, base_device::DEVICE_GPU>; -template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; -template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; -template struct vector_div_constant_op, base_device::DEVICE_GPU>; -template struct vector_mul_vector_op; -template struct vector_mul_vector_op, base_device::DEVICE_GPU>; -template struct vector_div_vector_op, base_device::DEVICE_GPU>; -template struct constantvector_addORsub_constantVector_op; -template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; -template struct matrixCopy, base_device::DEVICE_GPU>; -template struct dot_real_op, base_device::DEVICE_GPU>; -template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; -template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; -template struct vector_div_constant_op, base_device::DEVICE_GPU>; -template struct vector_mul_vector_op; -template struct vector_mul_vector_op, base_device::DEVICE_GPU>; -template struct vector_div_vector_op, base_device::DEVICE_GPU>; -template struct constantvector_addORsub_constantVector_op; -template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; +template struct matrixCopy, base_device::DEVICE_GPU>; template struct matrixCopy; template struct matrixCopy, base_device::DEVICE_GPU>; - -#ifdef __LCAO -template struct dot_real_op; -template struct vector_div_constant_op; -template struct vector_div_vector_op; -#endif } // namespace ModuleBase diff --git a/source/module_base/kernels/cuda/math_kernel_op_vec.cu b/source/module_base/kernels/cuda/math_kernel_op_vec.cu new file mode 100644 index 0000000000..ecc5e8b898 --- /dev/null +++ b/source/module_base/kernels/cuda/math_kernel_op_vec.cu @@ -0,0 +1,325 @@ +#include "module_base/kernels/math_kernel_op.h" + +#include +#include + +template <> +struct GetTypeReal> { + using type = float; /**< The return type specialization for std::complex. */ +}; +template <> +struct GetTypeReal> { + using type = double; /**< The return type specialization for std::complex. 
*/ +}; +namespace ModuleBase +{ +const int thread_per_block = 256; +void xdot_wrapper(const int &n, const float * x, const int &incx, const float * y, const int &incy, float &result); +void xdot_wrapper(const int &n, const double * x, const int &incx, const double * y, const int &incy, double &result); + +// Define the CUDA kernel: +template +__global__ void vector_mul_real_kernel(const int size, + T* result, + const T* vector, + const typename GetTypeReal::type constant) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) + { + result[i] = vector[i] * constant; + } +} + +template +__global__ void vector_mul_vector_kernel(const int size, + T* result, + const T* vector1, + const typename GetTypeReal::type* vector2, + const bool add) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) + { + if (add) + { + result[i] += vector1[i] * vector2[i]; + } + else + { + result[i] = vector1[i] * vector2[i]; + } + } +} + +template +__global__ void vector_div_vector_kernel(const int size, + T* result, + const T* vector1, + const typename GetTypeReal::type* vector2) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) + { + result[i] = vector1[i] / vector2[i]; + } +} + +template +__global__ void constantvector_addORsub_constantVector_kernel(const int size, + T* result, + const T* vector1, + const Real constant1, + const T* vector2, + const Real constant2) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) + { + result[i] = vector1[i] * constant1 + vector2[i] * constant2; + } +} + +// vector operator: result[i] = vector[i] * constant +template <> +void vector_mul_real_op::operator()(const int dim, + double* result, + const double* vector, + const double constant) +{ + // In small cases, 1024 threads per block will only utilize 17 blocks, much less than 40 + int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + vector_mul_real_kernel<<>>(dim, result, vector, constant); + + cudaCheckOnDebug(); +} + +template +inline void vector_mul_real_wrapper(const int dim, + std::complex* result, + const std::complex* vector, + const FPTYPE constant) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vector_tmp = reinterpret_cast*>(vector); + + int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + vector_mul_real_kernel><<>>(dim, result_tmp, vector_tmp, constant); + + cudaCheckOnDebug(); +} +template <> +void vector_mul_real_op, base_device::DEVICE_GPU>::operator()(const int dim, + std::complex* result, + const std::complex* vector, + const float constant) +{ + vector_mul_real_wrapper(dim, result, vector, constant); +} +template <> +void vector_mul_real_op, base_device::DEVICE_GPU>::operator()(const int dim, + std::complex* result, + const std::complex* vector, + const double constant) +{ + vector_mul_real_wrapper(dim, result, vector, constant); +} + +// vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex) +template <> +void vector_mul_vector_op::operator()(const int& dim, + double* result, + const double* vector1, + const double* vector2, + const bool& add) +{ + int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + vector_mul_vector_kernel<<>>(dim, result, vector1, vector2, add); + + cudaCheckOnDebug(); +} +// vector operator: result[i] = vector1[i](complex) * vector2[i](not complex) +template +inline void vector_mul_vector_complex_wrapper(const int& dim, + std::complex* result, + const std::complex* vector1, + const FPTYPE* 
vector2, + const bool& add) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); + int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + vector_mul_vector_kernel><<>>(dim, result_tmp, vector1_tmp, vector2, add); + + cudaCheckOnDebug(); +} +template <> +void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()(const int& dim, + std::complex* result, + const std::complex* vector1, + const float* vector2, + const bool& add) +{ + vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); +} +template <> +void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()( + const int& dim, + std::complex* result, + const std::complex* vector1, + const double* vector2, + const bool& add) +{ + vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); +} + +// vector operator: result[i] = vector1[i](not complex) / vector2[i](not complex) +template <> +void vector_div_vector_op::operator()(const int& dim, + double* result, + const double* vector1, + const double* vector2) +{ + int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + vector_div_vector_kernel<<>>(dim, result, vector1, vector2); + + cudaCheckOnDebug(); +} +// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) +template +inline void vector_div_vector_complex_wrapper(const int& dim, + std::complex* result, + const std::complex* vector1, + const FPTYPE* vector2) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); + int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + vector_div_vector_kernel><<>>(dim, result_tmp, vector1_tmp, vector2); + + cudaCheckOnDebug(); +} +template <> +void vector_div_vector_op, base_device::DEVICE_GPU>::operator()(const int& dim, + std::complex* result, + const std::complex* vector1, + const float* vector2) +{ + vector_div_vector_complex_wrapper(dim, result, vector1, vector2); +} +template <> +void vector_div_vector_op, base_device::DEVICE_GPU>::operator()( + const int& dim, + std::complex* result, + const std::complex* vector1, + const double* vector2) +{ + vector_div_vector_complex_wrapper(dim, result, vector1, vector2); +} + +// vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2 +template +void constantvector_addORsub_constantVector_op::operator()(const int& dim, + T* result, + const T* vector1, + const Real constant1, + const T* vector2, + const Real constant2) +{ + using Type = typename GetTypeThrust::type; + using Real = typename GetTypeReal::type; + + auto result_tmp = reinterpret_cast(result); + auto vector1_tmp = reinterpret_cast(vector1); + auto vector2_tmp = reinterpret_cast(vector2); + + int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + constantvector_addORsub_constantVector_kernel + <<>>(dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2); + + cudaCheckOnDebug(); +} + +template <> +double dot_real_op::operator()(const int& dim, + const double* psi_L, + const double* psi_R, + const bool reduce) +{ + double result = 0.0; + xdot_wrapper(dim, psi_L, 1, psi_R, 1, result); + if (reduce) + { + Parallel_Reduce::reduce_pool(result); + } + return result; +} +// for this implementation, please check +// https://thrust.github.io/doc/group__transformed__reductions_ga321192d85c5f510e52300ae762c7e995.html denghui modify +// 2022-10-03 Note that ddot_(2*dim,a,1,b,1) = REAL( 
zdotc_(dim,a,1,b,1) ) GPU specialization of actual computation. +template +inline FPTYPE dot_complex_wrapper(const int& dim, + const std::complex* psi_L, + const std::complex* psi_R, + const bool reduce) +{ + //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + // denghui modify 2022-10-07 + // Note that ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) ) + const FPTYPE* pL = reinterpret_cast(psi_L); + const FPTYPE* pR = reinterpret_cast(psi_R); + FPTYPE result = 0.0; + xdot_wrapper(dim * 2, pL, 1, pR, 1, result); + if (reduce) + { + Parallel_Reduce::reduce_pool(result); + } + return result; +} + +template <> +float dot_real_op, base_device::DEVICE_GPU>::operator()(const int& dim, + const std::complex* psi_L, + const std::complex* psi_R, + const bool reduce) +{ + return dot_complex_wrapper(dim, psi_L, psi_R, reduce); +} +template <> +double dot_real_op, base_device::DEVICE_GPU>::operator()(const int& dim, + const std::complex* psi_L, + const std::complex* psi_R, + const bool reduce) +{ + return dot_complex_wrapper(dim, psi_L, psi_R, reduce); +} + +// Explicitly instantiate functors for the types of functor registered. +template struct vector_mul_real_op, base_device::DEVICE_GPU>; +template struct vector_mul_real_op; +template struct vector_mul_real_op, base_device::DEVICE_GPU>; + +template struct vector_mul_vector_op; +template struct vector_mul_vector_op, base_device::DEVICE_GPU>; +template struct vector_mul_vector_op; +template struct vector_mul_vector_op, base_device::DEVICE_GPU>; +template struct vector_div_vector_op, base_device::DEVICE_GPU>; +template struct vector_div_vector_op; +template struct vector_div_vector_op, base_device::DEVICE_GPU>; + +template struct constantvector_addORsub_constantVector_op; +template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; +template struct constantvector_addORsub_constantVector_op; +template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; + +template struct dot_real_op, base_device::DEVICE_GPU>; +template struct dot_real_op; +template struct dot_real_op, base_device::DEVICE_GPU>; +} // namespace ModuleBase \ No newline at end of file diff --git a/source/module_base/kernels/cuda/math_op.cu b/source/module_base/kernels/cuda/math_ylm_op.cu similarity index 99% rename from source/module_base/kernels/cuda/math_op.cu rename to source/module_base/kernels/cuda/math_ylm_op.cu index cef3c04f3f..390f22b93b 100644 --- a/source/module_base/kernels/cuda/math_op.cu +++ b/source/module_base/kernels/cuda/math_ylm_op.cu @@ -1,5 +1,5 @@ #include -#include "module_base/kernels/math_op.h" +#include "module_base/kernels/math_ylm_op.h" #include diff --git a/source/module_base/kernels/math_kernel_op.cpp b/source/module_base/kernels/math_kernel_op.cpp index 9420d7c4b7..9c4ccdc42b 100644 --- a/source/module_base/kernels/math_kernel_op.cpp +++ b/source/module_base/kernels/math_kernel_op.cpp @@ -6,241 +6,6 @@ namespace ModuleBase { -template -struct line_minimize_with_block_op -{ - using Real = typename GetTypeReal::type; - void operator()(T* grad_out, - T* hgrad_out, - T* psi_out, - T* hpsi_out, - const int& n_basis, - const int& n_basis_max, - const int& n_band) - { - for (int band_idx = 0; band_idx < n_band; band_idx++) - { - Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0; - Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0; - auto A = reinterpret_cast(grad_out + band_idx * n_basis_max); - Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); - Parallel_Reduce::reduce_pool(norm); - 
norm = 1.0 / sqrt(norm); - for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) - { - auto item = band_idx * n_basis_max + basis_idx; - grad_out[item] *= norm; - hgrad_out[item] *= norm; - epsilo_0 += std::real(hpsi_out[item] * std::conj(psi_out[item])); - epsilo_1 += std::real(grad_out[item] * std::conj(hpsi_out[item])); - epsilo_2 += std::real(grad_out[item] * std::conj(hgrad_out[item])); - } - Parallel_Reduce::reduce_pool(epsilo_0); - Parallel_Reduce::reduce_pool(epsilo_1); - Parallel_Reduce::reduce_pool(epsilo_2); - theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2))); - cos_theta = std::cos(theta); - sin_theta = std::sin(theta); - for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) - { - auto item = band_idx * n_basis_max + basis_idx; - psi_out[item] = psi_out[item] * cos_theta + grad_out[item] * sin_theta; - hpsi_out[item] = hpsi_out[item] * cos_theta + hgrad_out[item] * sin_theta; - } - } - } -}; - -template -struct calc_grad_with_block_op -{ - using Real = typename GetTypeReal::type; - void operator()(const Real* prec_in, - Real* err_out, - Real* beta_out, - T* psi_out, - T* hpsi_out, - T* grad_out, - T* grad_old_out, - const int& n_basis, - const int& n_basis_max, - const int& n_band) - { - for (int band_idx = 0; band_idx < n_band; band_idx++) - { - Real err = 0.0; - Real beta = 0.0; - Real epsilo = 0.0; - Real grad_2 = {0.0}; - T grad_1 = {0.0, 0.0}; - auto A = reinterpret_cast(psi_out + band_idx * n_basis_max); - Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); - Parallel_Reduce::reduce_pool(norm); - norm = 1.0 / sqrt(norm); - for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) - { - auto item = band_idx * n_basis_max + basis_idx; - psi_out[item] *= norm; - hpsi_out[item] *= norm; - epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item])); - } - Parallel_Reduce::reduce_pool(epsilo); - for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) - { - auto item = band_idx * n_basis_max + basis_idx; - grad_1 = hpsi_out[item] - epsilo * psi_out[item]; - grad_2 = std::norm(grad_1); - err += grad_2; - beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec? - } - Parallel_Reduce::reduce_pool(err); - Parallel_Reduce::reduce_pool(beta); - for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) - { - auto item = band_idx * n_basis_max + basis_idx; - grad_1 = hpsi_out[item] - epsilo * psi_out[item]; - grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_out[band_idx] * grad_old_out[item]; - } - beta_out[band_idx] = beta; - err_out[band_idx] = sqrt(err); - } - } -}; - -template -struct dot_real_op -{ - FPTYPE operator()(const int& dim, - const FPTYPE* psi_L, - const FPTYPE* psi_R, - const bool reduce) - { - FPTYPE result = BlasConnector::dot(dim, psi_L, 1, psi_R, 1); - if (reduce) - { - Parallel_Reduce::reduce_pool(result); - } - return result; - } -}; - -// CPU specialization of actual computation. 
-template -struct dot_real_op, base_device::DEVICE_CPU> -{ - FPTYPE operator()(const int& dim, - const std::complex* psi_L, - const std::complex* psi_R, - const bool reduce) - { - //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - // qianrui modify 2021-3-14 - // Note that ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) ) - const FPTYPE* pL = reinterpret_cast(psi_L); - const FPTYPE* pR = reinterpret_cast(psi_R); - FPTYPE result = BlasConnector::dot(2 * dim, pL, 1, pR, 1); - if (reduce) - { - Parallel_Reduce::reduce_pool(result); - } - return result; - } -}; - -template -struct vector_div_constant_op -{ - using Real = typename GetTypeReal::type; - void operator()(const int dim, T* result, const T* vector, const Real constant) - { -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] = vector[i] / constant; - } - } -}; - -template -struct vector_mul_vector_op -{ - using Real = typename GetTypeReal::type; - void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add) - { - if (add) - { -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] += vector1[i] * vector2[i]; - } - } - else - { -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] = vector1[i] * vector2[i]; - } - } - } -}; - -template -struct vector_div_vector_op -{ - using Real = typename GetTypeReal::type; - void operator()(const int& dim, T* result, const T* vector1, const Real* vector2) - { -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] = vector1[i] / vector2[i]; - } - } -}; - -template -struct constantvector_addORsub_constantVector_op -{ - using Real = typename GetTypeReal::type; - void operator()(const int& dim, - T* result, - const T* vector1, - const Real constant1, - const T* vector2, - const Real constant2) - { -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 8192 / sizeof(T)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] = vector1[i] * constant1 + vector2[i] * constant2; - } - } -}; - -template -struct scal_op -{ - void operator()(const int& N, - const std::complex* alpha, - std::complex* X, - const int& incx) - { - BlasConnector::scal(N, *alpha, X, incx); - } -}; - template struct gemv_op { @@ -260,20 +25,6 @@ struct gemv_op } }; -template -struct axpy_op -{ - void operator()(const int& dim, - const T* alpha, - const T* X, - const int& incX, - T* Y, - const int& incY) - { - BlasConnector::axpy(dim, *alpha, X, incX, Y, incY); - } -}; - template struct gemm_op { @@ -367,48 +118,23 @@ struct matrixCopy } }; -// Explicitly instantiate functors for the types of functor registered. 
-template struct scal_op; -template struct axpy_op, base_device::DEVICE_CPU>; template struct gemv_op, base_device::DEVICE_CPU>; template struct gemv_op; template struct gemm_op, base_device::DEVICE_CPU>; template struct gemm_op; -template struct dot_real_op, base_device::DEVICE_CPU>; -template struct vector_div_constant_op, base_device::DEVICE_CPU>; -template struct vector_mul_vector_op, base_device::DEVICE_CPU>; -template struct vector_div_vector_op, base_device::DEVICE_CPU>; -template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_CPU>; template struct matrixTranspose_op, base_device::DEVICE_CPU>; template struct matrixCopy, base_device::DEVICE_CPU>; -template struct calc_grad_with_block_op, base_device::DEVICE_CPU>; -template struct line_minimize_with_block_op, base_device::DEVICE_CPU>; -template struct scal_op; -template struct axpy_op, base_device::DEVICE_CPU>; -template struct axpy_op; template struct gemv_op, base_device::DEVICE_CPU>; template struct gemv_op; template struct gemm_op, base_device::DEVICE_CPU>; template struct gemm_op; -template struct dot_real_op, base_device::DEVICE_CPU>; -template struct dot_real_op; -template struct vector_div_constant_op, base_device::DEVICE_CPU>; -template struct vector_mul_vector_op, base_device::DEVICE_CPU>; -template struct vector_div_vector_op, base_device::DEVICE_CPU>; -template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_CPU>; template struct matrixTranspose_op, base_device::DEVICE_CPU>; template struct matrixCopy; template struct matrixCopy, base_device::DEVICE_CPU>; -template struct calc_grad_with_block_op, base_device::DEVICE_CPU>; -template struct line_minimize_with_block_op, base_device::DEVICE_CPU>; #ifdef __LCAO -template struct vector_mul_vector_op; -template struct vector_div_constant_op; -template struct vector_div_vector_op; template struct matrixTranspose_op; -template struct constantvector_addORsub_constantVector_op; #endif #ifdef __DSP template struct gemm_op_mt, base_device::DEVICE_CPU>; diff --git a/source/module_base/kernels/math_kernel_op.h b/source/module_base/kernels/math_kernel_op.h index daca2ac548..fdb6d24457 100644 --- a/source/module_base/kernels/math_kernel_op.h +++ b/source/module_base/kernels/math_kernel_op.h @@ -19,6 +19,9 @@ namespace ModuleBase { +//--------------------------------------------------------------------------------- +//-----------------------------0. Tool Functions----------------------------------- +//--------------------------------------------------------------------------------- inline std::complex set_real_tocomplex(const std::complex &x) { return {x.real(), 0.0}; } @@ -43,68 +46,30 @@ inline double get_conj(const double &x) { return x; } inline float get_conj(const float &x) { return x; } -template struct line_minimize_with_block_op { - /// @brief dot_real_op computes the dot product of the given complex - /// arrays(treated as float arrays). And there's may have MPI communications - /// while enabling planewave parallization strategy. 
- /// - /// Input Parameters - /// \param dev : the type of computing device - /// \param A : input array arr - /// \param dim : size of A - /// \param lda : leading dimention of A - /// \param batch : batch size, the size of the result array res - /// - /// \return res : the result vector - /// T : dot product result - void operator()(T *grad_out, T *hgrad_out, T *psi_out, T *hpsi_out, - const int &n_basis, const int &n_basis_max, - const int &n_band); -}; - -template struct calc_grad_with_block_op { - /// @brief dot_real_op computes the dot product of the given complex - /// arrays(treated as float arrays). And there's may have MPI communications - /// while enabling planewave parallization strategy. - /// - /// Input Parameters - /// \param dev : the type of computing device - /// \param A : input array arr - /// \param dim : size of A - /// \param lda : leading dimention of A - /// \param batch : batch size, the size of the result array res - /// - /// \return res : the result vector - /// T : dot product result - using Real = typename GetTypeReal::type; - void operator()(const Real *prec_in, Real *err_out, Real *beta_out, - T *psi_out, T *hpsi_out, T *grad_out, T *grad_old_out, - const int &n_basis, const int &n_basis_max, - const int &n_band); -}; -template struct dot_real_op { - using Real = typename GetTypeReal::type; - /// @brief dot_real_op computes the dot product of the given complex - /// arrays(treated as float arrays). And there's may have MPI communications - /// while enabling planewave parallization strategy. +//--------------------------------------------------------------------------------- +//-----------------------------1. Vector Operations-------------------------------- +//--------------------------------------------------------------------------------- +template struct scal_op { + /// @brief x = alpha * x, where alpha and x are complex numbers /// /// Input Parameters - /// \param dim : array size - /// \param psi_L : input array A - /// \param psi_R : input array B - /// \param reduce : flag to control whether to perform the MPI communications + /// \param N : array size + /// \param alpha : input constant + /// \param X : input array + /// \param incx : computing strip of array X /// - /// \return - /// FPTYPE : dot product result - Real operator()(const int &dim, const T *psi_L, - const T *psi_R, const bool reduce = true); + /// Output Parameters + /// \param X : output array + void operator()(const int &N, + const std::complex *alpha, std::complex *X, + const int &incx); }; -// vector operator: result[i] = vector[i] / constant -template struct vector_div_constant_op { +template struct vector_mul_real_op { using Real = typename GetTypeReal::type; - /// @brief result[i] = vector[i] / constant + /// @brief result[i] = vector[i] * constant, where vector is a complex array and constant is a real number. + /// It is different from scal_op, which scales a complex array by a complex constant.
/// /// Input Parameters /// \param dim : array size @@ -113,25 +78,8 @@ template struct vector_div_constant_op { /// /// Output Parameters /// \param result : output array - void operator()(const int dim, T *result, const T *vector, - const Real constant); -}; - -// replace vector_div_constant_op : x = alpha * x -template struct scal_op { - /// @brief x = alpha * x - /// - /// Input Parameters - /// \param N : array size - /// \param alpha : input constant - /// \param X : input array - /// \param incx : computing strip of array X - /// - /// Output Parameters - /// \param X : output array - void operator()(const int &N, - const std::complex *alpha, std::complex *X, - const int &incx); + /// \note Multiplication is used instead of division, since it is faster. + void operator()(const int dim, T* result, const T* vector, const Real constant); }; // vector operator: result[i] = vector1[i](complex) * vector2[i](not complex) @@ -166,6 +114,24 @@ template struct vector_div_vector_op { const Real *vector2); }; +// compute Y = alpha * X + Y +template struct axpy_op { + /// @brief Y = alpha * X + Y + /// + /// Input Parameters + /// \param N : array size + /// \param alpha : input constant alpha + /// \param X : input array X + /// \param incX : computing strip of X + /// \param Y : input array Y + /// \param incY : computing strip of Y + /// + /// Output Parameters + /// \param Y : output array Y + void operator()(const int &N, const T *alpha, const T *X, + const int &incX, T *Y, const int &incY); +}; + // vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2 template struct constantvector_addORsub_constantVector_op { @@ -185,24 +151,29 @@ struct constantvector_addORsub_constantVector_op { const Real constant1, const T *vector2, const Real constant2); }; -// compute Y = alpha * X + Y -template struct axpy_op { - /// @brief Y = alpha * X + Y +template struct dot_real_op { + using Real = typename GetTypeReal::type; + /// @brief dot_real_op computes the dot product of the given complex + /// arrays (treated as float arrays). MPI communication may occur + /// when the plane-wave parallelization strategy is enabled. /// /// Input Parameters - /// \param N : array size - /// \param alpha : input constant alpha - /// \param X : input array X - /// \param incX : computing strip of X - /// \param Y : computing strip of Y - /// \param incY : computing strip of Y + /// \param dim : array size + /// \param psi_L : input array A + /// \param psi_R : input array B + /// \param reduce : flag to control whether to perform the MPI communications /// - /// Output Parameters - /// \param Y : output array Y - void operator()(const int &N, const T *alpha, const T *X, - const int &incX, T *Y, const int &incY); + /// \return + /// FPTYPE : dot product result + Real operator()(const int &dim, const T *psi_L, + const T *psi_R, const bool reduce = true); }; + +//--------------------------------------------------------------------------------- +//-----------------------------2. 
Matrix Operations-------------------------------- +//--------------------------------------------------------------------------------- + // compute y = alpha * op(A) * x + beta * y template struct gemv_op { /// @brief y = alpha * op(A) * x + beta * y @@ -314,24 +285,6 @@ template struct matrixCopy { }; #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM - -template -struct line_minimize_with_block_op { - using Real = typename GetTypeReal::type; - void operator()(T *grad_out, T *hgrad_out, T *psi_out, T *hpsi_out, - const int &n_basis, const int &n_basis_max, - const int &n_band); -}; - -template -struct calc_grad_with_block_op { - using Real = typename GetTypeReal::type; - void operator()(const Real *prec_in, Real *err_out, Real *beta_out, - T *psi_out, T *hpsi_out, T *grad_out, T *grad_old_out, - const int &n_basis, const int &n_basis_max, - const int &n_band); -}; - // Partially specialize functor for base_device::GpuDevice. template struct dot_real_op { using Real = typename GetTypeReal::type; @@ -341,10 +294,10 @@ template struct dot_real_op { // vector operator: result[i] = vector[i] / constant template -struct vector_div_constant_op { +struct vector_mul_real_op +{ using Real = typename GetTypeReal::type; - void operator()(const int dim, T *result, - const T *vector, const Real constant); + void operator()(const int dim, T* result, const T* vector, const Real constant); }; // vector operator: result[i] = vector1[i](complex) * vector2[i](not complex) diff --git a/source/module_base/kernels/math_kernel_op_vec.cpp b/source/module_base/kernels/math_kernel_op_vec.cpp new file mode 100644 index 0000000000..747857c580 --- /dev/null +++ b/source/module_base/kernels/math_kernel_op_vec.cpp @@ -0,0 +1,177 @@ +#include "module_base/kernels/math_kernel_op.h" + +namespace ModuleBase +{ + +template +struct scal_op +{ + void operator()(const int& N, + const std::complex* alpha, + std::complex* X, + const int& incx) + { + BlasConnector::scal(N, *alpha, X, incx); + } +}; + +template +struct vector_mul_real_op +{ + using Real = typename GetTypeReal::type; + void operator()(const int dim, T* result, const T* vector, const Real constant) + { +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector[i] * constant; + } + } +}; + +template +struct vector_mul_vector_op +{ + using Real = typename GetTypeReal::type; + void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add) + { + if (add) + { +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] += vector1[i] * vector2[i]; + } + } + else + { +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector1[i] * vector2[i]; + } + } + } +}; + +template +struct vector_div_vector_op +{ + using Real = typename GetTypeReal::type; + void operator()(const int& dim, T* result, const T* vector1, const Real* vector2) + { +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector1[i] / vector2[i]; + } + } +}; + +template +struct axpy_op +{ + void operator()(const int& dim, + const T* alpha, + const T* X, + const int& incX, + T* Y, + const int& incY) + { + BlasConnector::axpy(dim, *alpha, X, incX, Y, incY); + } +}; + + +template +struct constantvector_addORsub_constantVector_op +{ + using Real = 
typename GetTypeReal::type; + void operator()(const int& dim, + T* result, + const T* vector1, + const Real constant1, + const T* vector2, + const Real constant2) + { +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 8192 / sizeof(T)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector1[i] * constant1 + vector2[i] * constant2; + } + } +}; + + + +template +struct dot_real_op +{ + FPTYPE operator()(const int& dim, const FPTYPE* psi_L, const FPTYPE* psi_R, const bool reduce) + { + FPTYPE result = BlasConnector::dot(dim, psi_L, 1, psi_R, 1); + if (reduce) + { + Parallel_Reduce::reduce_pool(result); + } + return result; + } +}; + +template +struct dot_real_op, base_device::DEVICE_CPU> +{ + FPTYPE operator()(const int& dim, + const std::complex* psi_L, + const std::complex* psi_R, + const bool reduce) + { + // Note that ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) ) + const FPTYPE* pL = reinterpret_cast(psi_L); + const FPTYPE* pR = reinterpret_cast(psi_R); + FPTYPE result = BlasConnector::dot(2 * dim, pL, 1, pR, 1); + if (reduce) + { + Parallel_Reduce::reduce_pool(result); + } + return result; + } +}; + +template struct scal_op; +template struct scal_op; + +template struct vector_mul_real_op, base_device::DEVICE_CPU>; +template struct vector_mul_real_op; +template struct vector_mul_real_op, base_device::DEVICE_CPU>; + +template struct vector_mul_vector_op, base_device::DEVICE_CPU>; +template struct vector_mul_vector_op; +template struct vector_mul_vector_op, base_device::DEVICE_CPU>; + +template struct vector_div_vector_op, base_device::DEVICE_CPU>; +template struct vector_div_vector_op; +template struct vector_div_vector_op, base_device::DEVICE_CPU>; + +template struct axpy_op, base_device::DEVICE_CPU>; +template struct axpy_op, base_device::DEVICE_CPU>; +template struct axpy_op; + +template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_CPU>; +template struct constantvector_addORsub_constantVector_op; +template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_CPU>; + +template struct dot_real_op, base_device::DEVICE_CPU>; +template struct dot_real_op, base_device::DEVICE_CPU>; +template struct dot_real_op; +} // namespace ModuleBase diff --git a/source/module_base/kernels/math_op.cpp b/source/module_base/kernels/math_ylm_op.cpp similarity index 99% rename from source/module_base/kernels/math_op.cpp rename to source/module_base/kernels/math_ylm_op.cpp index 1abf464df1..0846801797 100644 --- a/source/module_base/kernels/math_op.cpp +++ b/source/module_base/kernels/math_ylm_op.cpp @@ -1,4 +1,4 @@ -#include "module_base/kernels/math_op.h" +#include "module_base/kernels/math_ylm_op.h" #include "module_base/libm/libm.h" namespace ModuleBase { diff --git a/source/module_base/kernels/math_op.h b/source/module_base/kernels/math_ylm_op.h similarity index 100% rename from source/module_base/kernels/math_op.h rename to source/module_base/kernels/math_ylm_op.h diff --git a/source/module_base/kernels/rocm/math_kernel_op.hip.cu b/source/module_base/kernels/rocm/math_kernel_op.hip.cu index a6a787faf8..1211b76b6d 100644 --- a/source/module_base/kernels/rocm/math_kernel_op.hip.cu +++ b/source/module_base/kernels/rocm/math_kernel_op.hip.cu @@ -7,10 +7,6 @@ #include #include #include - -#define WARP_SIZE 32 -#define FULL_MASK 0xffffffff -#define THREAD_PER_BLOCK 256 template <> struct GetTypeReal> { using type = float; /**< The return type specialization for std::complex. 
*/ @@ -39,12 +35,10 @@ struct GetTypeThrust> { static hipblasHandle_t cublas_handle = nullptr; -static inline void xdot_wrapper(const int &n, const float * x, const int &incx, const float * y, const int &incy, float &result) { hipblasErrcheck(hipblasSdot(cublas_handle, n, x, incx, y, incy, &result)); } -static inline void xdot_wrapper(const int &n, const double * x, const int &incx, const double * y, const int &incy, double &result) { hipblasErrcheck(hipblasDdot(cublas_handle, n, x, incx, y, incy, &result)); } @@ -62,239 +56,62 @@ void destoryBLAShandle(){ } } -template -__global__ void line_minimize_with_block( - thrust::complex* grad, - thrust::complex* hgrad, - thrust::complex* psi, - thrust::complex* hpsi, - const int n_basis, - const int n_basis_max) -{ - int band_idx = blockIdx.x; // band_idx - int tid = threadIdx.x; // basis_idx - int item = 0; - Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0; - Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0; - __shared__ Real data[THREAD_PER_BLOCK * 3]; - - data[tid] = 0; - - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) { - item = band_idx * n_basis_max + basis_idx; - data[tid] += (grad[item] * thrust::conj(grad[item])).real(); - } - __syncthreads(); - // just do some parallel reduction in shared memory - for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - } - __syncthreads(); - } - - Real norm = 1.0 / sqrt(data[0]); - __syncthreads(); - - data[tid] = 0; - data[THREAD_PER_BLOCK + tid] = 0; - data[2 * THREAD_PER_BLOCK + tid] = 0; - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) { - item = band_idx * n_basis_max + basis_idx; - grad[item] *= norm; - hgrad[item] *= norm; - data[tid] += (hpsi[item] * thrust::conj(psi[item])).real(); - data[THREAD_PER_BLOCK + tid] += (grad[item] * thrust::conj(hpsi[item])).real(); - data[2 * THREAD_PER_BLOCK + tid] += (grad[item] * thrust::conj(hgrad[item])).real(); - } - __syncthreads(); - - // just do some parallel reduction in shared memory - for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - data[THREAD_PER_BLOCK + tid] += data[THREAD_PER_BLOCK + tid + ii]; - data[2 * THREAD_PER_BLOCK + tid] += data[2 * THREAD_PER_BLOCK + tid + ii]; - } - __syncthreads(); - } - epsilo_0 = data[0]; - epsilo_1 = data[THREAD_PER_BLOCK]; - epsilo_2 = data[2 * THREAD_PER_BLOCK]; - - theta = 0.5 * abs(atan(2 * epsilo_1/(epsilo_0 - epsilo_2))); - cos_theta = cos(theta); - sin_theta = sin(theta); - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) { - item = band_idx * n_basis_max + basis_idx; - psi [item] = psi [item] * cos_theta + grad [item] * sin_theta; - hpsi[item] = hpsi[item] * cos_theta + hgrad[item] * sin_theta; - } -} - -template -__global__ void calc_grad_with_block( - const Real* prec, - Real* err, - Real* beta, - thrust::complex* psi, - thrust::complex* hpsi, - thrust::complex* grad, - thrust::complex* grad_old, - const int n_basis, - const int n_basis_max) +template <> +void scal_op::operator()(const int& N, + const std::complex* alpha, + std::complex* X, + const int& incx) { - int band_idx = blockIdx.x; // band_idx - int tid = threadIdx.x; // basis_idx - int item = 0; - Real err_st = 0.0; - Real beta_st = 0.0; - Real epsilo = 0.0; - Real grad_2 = 0.0; - thrust::complex grad_1 = {0, 0}; - __shared__ Real data[THREAD_PER_BLOCK * 2]; - - // Init shared memory - data[tid] = 0; - - for (int basis_idx = tid; basis_idx < 
n_basis; basis_idx += THREAD_PER_BLOCK) { - item = band_idx * n_basis_max + basis_idx; - data[tid] += (psi[item] * thrust::conj(psi[item])).real(); - } - __syncthreads(); - // just do some parallel reduction in shared memory - for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - } - __syncthreads(); - } - - Real norm = 1.0 / sqrt(data[0]); - __syncthreads(); - - data[tid] = 0; - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) { - item = band_idx * n_basis_max + basis_idx; - psi[item] *= norm; - hpsi[item] *= norm; - data[tid] += (hpsi[item] * thrust::conj(psi[item])).real(); - } - __syncthreads(); - - // just do some parallel reduction in shared memory - for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - } - __syncthreads(); - } - epsilo = data[0]; - __syncthreads(); - - data[tid] = 0; - data[THREAD_PER_BLOCK + tid] = 0; - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) { - item = band_idx * n_basis_max + basis_idx; - grad_1 = hpsi[item] - epsilo * psi[item]; - grad_2 = thrust::norm(grad_1); - data[tid] += grad_2; - data[THREAD_PER_BLOCK + tid] += grad_2 / prec[basis_idx]; - } - __syncthreads(); - - // just do some parallel reduction in shared memory - for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - data[THREAD_PER_BLOCK + tid] += data[THREAD_PER_BLOCK + tid + ii]; - } - __syncthreads(); - } - err_st = data[0]; - beta_st = data[THREAD_PER_BLOCK]; - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) { - item = band_idx * n_basis_max + basis_idx; - grad_1 = hpsi[item] - epsilo * psi[item]; - grad[item] = -grad_1 / prec[basis_idx] + beta_st / beta[band_idx] * grad_old[item]; - } - - __syncthreads(); - if (tid == 0) { - beta[band_idx] = beta_st; - err[band_idx] = sqrt(err_st); - } + hipblasErrcheck(hipblasCscal(cublas_handle, N, (hipblasComplex*)alpha, (hipblasComplex*)X, incx)); } -// Define the CUDA kernel: -template -__launch_bounds__(1024) -__global__ void vector_div_constant_kernel( - const int size, - T* result, - const T* vector, - const typename GetTypeReal::type constant) +template <> +void scal_op::operator()(const int& N, + const std::complex* alpha, + std::complex* X, + const int& incx) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - result[i] = vector[i] / constant; - } + hipblasErrcheck(hipblasZscal(cublas_handle, N, (hipblasDoubleComplex*)alpha, (hipblasDoubleComplex*)X, incx)); } -template -__launch_bounds__(1024) -__global__ void vector_mul_vector_kernel( - const int size, - T* result, - const T* vector1, - const typename GetTypeReal::type* vector2, - const bool add) +template <> +void axpy_op::operator()(const int& N, + const double* alpha, + const double* X, + const int& incX, + double* Y, + const int& incY) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - if (add) - { - result[i] += vector1[i] * vector2[i]; - } - else - { - result[i] = vector1[i] * vector2[i]; - } - } + hipblasErrcheck(hipblasDaxpy(cublas_handle, N, alpha, X, incX, Y, incY)); } -template -__launch_bounds__(1024) -__global__ void vector_div_vector_kernel( - const int size, - T* result, - const T* vector1, - const typename GetTypeReal::type* vector2) +template <> +void axpy_op, base_device::DEVICE_GPU>::operator()(const int& N, + const std::complex* alpha, + const std::complex* X, + const int& incX, + 
std::complex* Y, + const int& incY) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - result[i] = vector1[i] / vector2[i]; - } + hipblasErrcheck( + hipblasCaxpy(cublas_handle, N, (hipblasComplex*)alpha, (hipblasComplex*)X, incX, (hipblasComplex*)Y, incY)); } -template -__launch_bounds__(1024) -__global__ void constantvector_addORsub_constantVector_kernel( - const int size, - T* result, - const T* vector1, - const Real constant1, - const T* vector2, - const Real constant2) +template <> +void axpy_op, base_device::DEVICE_GPU>::operator()(const int& N, + const std::complex* alpha, + const std::complex* X, + const int& incX, + std::complex* Y, + const int& incY) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - result[i] = vector1[i] * constant1 + vector2[i] * constant2; - } + hipblasErrcheck(hipblasZaxpy(cublas_handle, + N, + (hipblasDoubleComplex*)alpha, + (hipblasDoubleComplex*)X, + incX, + (hipblasDoubleComplex*)Y, + incY)); } template @@ -328,304 +145,6 @@ __launch_bounds__(1024) __global__ } } -template -void line_minimize_with_block_op::operator()(T* grad_out, - T* hgrad_out, - T* psi_out, - T* hpsi_out, - const int& n_basis, - const int& n_basis_max, - const int& n_band) -{ - auto A = reinterpret_cast*>(grad_out); - auto B = reinterpret_cast*>(hgrad_out); - auto C = reinterpret_cast*>(psi_out); - auto D = reinterpret_cast*>(hpsi_out); - - line_minimize_with_block<<>>( - A, B, C, D, - n_basis, n_basis_max); - - hipCheckOnDebug(); -} - -template -void calc_grad_with_block_op::operator()(const Real* prec_in, - Real* err_out, - Real* beta_out, - T* psi_out, - T* hpsi_out, - T* grad_out, - T* grad_old_out, - const int& n_basis, - const int& n_basis_max, - const int& n_band) -{ - auto A = reinterpret_cast*>(psi_out); - auto B = reinterpret_cast*>(hpsi_out); - auto C = reinterpret_cast*>(grad_out); - auto D = reinterpret_cast*>(grad_old_out); - - calc_grad_with_block<<>>( - prec_in, err_out, beta_out, - A, B, C, D, - n_basis, n_basis_max); - - hipCheckOnDebug(); -} - -template <> -double dot_real_op::operator()(const int& dim, - const double* psi_L, - const double* psi_R, - const bool reduce) -{ - double result = 0.0; - xdot_wrapper(dim, psi_L, 1, psi_R, 1, result); - if (reduce) { - Parallel_Reduce::reduce_pool(result); - } - return result; -} - -// for this implementation, please check -// https://thrust.github.io/doc/group__transformed__reductions_ga321192d85c5f510e52300ae762c7e995.html denghui modify -// 2022-10-03 Note that ddot_(2*dim,a,1, b,1) = REAL( zdotc_(dim,a,1,b,1) ) GPU specialization of actual computation. 
-template -inline FPTYPE dot_complex_wrapper(const int& dim, - const std::complex* psi_L, - const std::complex* psi_R, - const bool reduce) -{ - //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - // denghui modify 2022-10-07 - // Note that ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) ) - const FPTYPE* pL = reinterpret_cast(psi_L); - const FPTYPE* pR = reinterpret_cast(psi_R); - FPTYPE result = 0.0; - xdot_wrapper(dim * 2, pL, 1, pR, 1, result); - if (reduce) { - Parallel_Reduce::reduce_pool(result); - } - return result; -} -template <> -float dot_real_op, base_device::DEVICE_GPU>::operator()(const int& dim, - const std::complex* psi_L, - const std::complex* psi_R, - const bool reduce) -{ - return dot_complex_wrapper(dim, psi_L, psi_R, reduce); -} -template <> -double dot_real_op, base_device::DEVICE_GPU>::operator()(const int& dim, - const std::complex* psi_L, - const std::complex* psi_R, - const bool reduce) -{ - return dot_complex_wrapper(dim, psi_L, psi_R, reduce); -} - -template <> -void vector_div_constant_op::operator()(const int dim, - double* result, - const double* vector, - const double constant) -{ - int thread = 1024; - int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel), dim3(block), dim3(thread), 0, 0, dim, result, vector, constant); - - hipCheckOnDebug(); -} -// vector operator: result[i] = vector[i] / constant -template -inline void vector_div_constant_complex_wrapper(const int dim, - std::complex* result, - const std::complex* vector, - const FPTYPE constant) -{ - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector_tmp = reinterpret_cast*>(vector); - int thread = 1024; - int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector_tmp, constant); - - hipCheckOnDebug(); -} -template <> -void vector_div_constant_op, base_device::DEVICE_GPU>::operator()(const int dim, - std::complex* result, - const std::complex* vector, - const float constant) -{ - vector_div_constant_complex_wrapper(dim, result, vector, constant); - - hipCheckOnDebug(); -} -template <> -void vector_div_constant_op, base_device::DEVICE_GPU>::operator()( - const int dim, - std::complex* result, - const std::complex* vector, - const double constant) -{ - vector_div_constant_complex_wrapper(dim, result, vector, constant); - - hipCheckOnDebug(); -} -// vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex) -template <> -void vector_mul_vector_op::operator()(const int& dim, - double* result, - const double* vector1, - const double* vector2, - const bool& add) -{ - int thread = 1024; - int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2, add); - - hipCheckOnDebug(); -} - -// vector operator: result[i] = vector1[i](complex) * vector2[i](not complex) -template -inline void vector_mul_vector_complex_wrapper(const int& dim, - std::complex* result, - const std::complex* vector1, - const FPTYPE* vector2, - const bool& add) -{ - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); - int thread = 1024; - int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2, add); - - 
hipCheckOnDebug(); -} -template <> -void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()(const int& dim, - std::complex* result, - const std::complex* vector1, - const float* vector2, - const bool& add) -{ - vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); -} -template <> -void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()( - const int& dim, - std::complex* result, - const std::complex* vector1, - const double* vector2, - const bool& add) -{ - vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); -} -// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) -template <> -void vector_div_vector_op::operator()(const int& dim, - double* result, - const double* vector1, - const double* vector2) -{ - int thread = 1024; - int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_vector_kernel), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2); - - hipCheckOnDebug(); -} -// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) -template -inline void vector_div_vector_op_complex_wrapper(const int& dim, - std::complex* result, - const std::complex* vector1, - const FPTYPE* vector2) -{ - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); - int thread = 1024; - int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_vector_kernel>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2); - - hipCheckOnDebug(); -} -template <> -void vector_div_vector_op, base_device::DEVICE_GPU>::operator()(const int& dim, - std::complex* result, - const std::complex* vector1, - const float* vector2) -{ - vector_div_vector_op_complex_wrapper(dim, result, vector1, vector2); -} -template <> -void vector_div_vector_op, base_device::DEVICE_GPU>::operator()( - const int& dim, - std::complex* result, - const std::complex* vector1, - const double* vector2) -{ - vector_div_vector_op_complex_wrapper(dim, result, vector1, vector2); -} - -// vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2 -template -void constantvector_addORsub_constantVector_op::operator()(const int& dim, - T* result, - const T* vector1, - const Real constant1, - const T* vector2, - const Real constant2) -{ - using Type = typename GetTypeThrust::type; - using Real = typename GetTypeReal::type; - - auto result_tmp = reinterpret_cast(result); - auto vector1_tmp = reinterpret_cast(vector1); - auto vector2_tmp = reinterpret_cast(vector2); - - int thread = 1024; - int block = (dim + thread - 1) / thread; - constantvector_addORsub_constantVector_kernel <<>>(dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2); - - hipCheckOnDebug(); -} - -template <> -void axpy_op::operator()(const int& N, - const double* alpha, - const double* X, - const int& incX, - double* Y, - const int& incY) -{ - hipblasErrcheck(hipblasDaxpy(cublas_handle, N, alpha, X, incX, Y, incY)); -} - -template <> -void axpy_op, base_device::DEVICE_GPU>::operator()(const int& N, - const std::complex* alpha, - const std::complex* X, - const int& incX, - std::complex* Y, - const int& incY) -{ - hipblasErrcheck(hipblasCaxpy(cublas_handle, N, (hipblasComplex*)alpha, (hipblasComplex*)X, incX, (hipblasComplex*)Y, incY)); -} - -template <> -void axpy_op, base_device::DEVICE_GPU>::operator()(const int& N, - const std::complex* alpha, - const std::complex* X, - const int& incX, - std::complex* Y, - const 
int& incY) -{ - hipblasErrcheck(hipblasZaxpy(cublas_handle, N, (hipblasDoubleComplex*)alpha, (hipblasDoubleComplex*)X, incX, (hipblasDoubleComplex*)Y, incY)); -} - hipblasOperation_t judge_trans_op(bool is_complex, const char& trans, const char* name) { if (trans == 'N') @@ -697,24 +216,6 @@ void gemv_op, base_device::DEVICE_GPU>::operator()(const ch hipblasErrcheck(hipblasZgemv(cublas_handle, cutrans, m, n, (hipblasDoubleComplex*)alpha, (hipblasDoubleComplex*)A, lda, (hipblasDoubleComplex*)X, incx, (hipblasDoubleComplex*)beta, (hipblasDoubleComplex*)Y, incx)); } -template <> -void scal_op::operator()(const int& N, - const std::complex* alpha, - std::complex* X, - const int& incx) -{ - hipblasErrcheck(hipblasCscal(cublas_handle, N, (hipblasComplex*)alpha, (hipblasComplex*)X, incx)); -} - -template <> -void scal_op::operator()(const int& N, - const std::complex* alpha, - std::complex* X, - const int& incx) -{ - hipblasErrcheck(hipblasZscal(cublas_handle, N, (hipblasDoubleComplex*)alpha, (hipblasDoubleComplex*)X, incx)); -} - template <> void gemm_op::operator()(const char& transa, const char& transb, @@ -939,31 +440,7 @@ void matrixCopy, base_device::DEVICE_GPU>::operator()(const // Explicitly instantiate functors for the types of functor registered. -template struct dot_real_op, base_device::DEVICE_GPU>; -template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; -template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; -template struct vector_div_constant_op, base_device::DEVICE_GPU>; -template struct vector_mul_vector_op; -template struct vector_mul_vector_op, base_device::DEVICE_GPU>; -template struct vector_div_vector_op, base_device::DEVICE_GPU>; -template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; +template struct matrixCopy; template struct matrixCopy, base_device::DEVICE_GPU>; - -template struct dot_real_op, base_device::DEVICE_GPU>; -template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; -template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; -template struct vector_div_constant_op, base_device::DEVICE_GPU>; -template struct vector_mul_vector_op; -template struct vector_mul_vector_op, base_device::DEVICE_GPU>; -template struct vector_div_vector_op, base_device::DEVICE_GPU>; -template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; template struct matrixCopy, base_device::DEVICE_GPU>; - -#ifdef __LCAO -template struct dot_real_op; -template struct vector_div_constant_op; -template struct vector_div_vector_op; -template struct matrixCopy; -template struct constantvector_addORsub_constantVector_op; -#endif } // namespace ModuleBase diff --git a/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu b/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu new file mode 100644 index 0000000000..4ba8ae8925 --- /dev/null +++ b/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu @@ -0,0 +1,376 @@ +#include "module_base/kernels/math_kernel_op.h" + +#include +#include +template <> +struct GetTypeReal> { + using type = float; /**< The return type specialization for std::complex. */ +}; +template <> +struct GetTypeReal> { + using type = double; /**< The return type specialization for std::complex. 
*/
+};
+namespace ModuleBase
+{
+void xdot_wrapper(const int &n, const float * x, const int &incx, const float * y, const int &incy, float &result);
+void xdot_wrapper(const int &n, const double * x, const int &incx, const double * y, const int &incy, double &result);
+
+// Define the CUDA kernel:
+template
+__launch_bounds__(1024) __global__ void vector_mul_real_kernel(const int size,
+                                                               T* result,
+                                                               const T* vector,
+                                                               const typename GetTypeReal::type constant)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < size)
+    {
+        result[i] = vector[i] * constant;
+    }
+}
+
+template
+__launch_bounds__(1024) __global__ void vector_mul_vector_kernel(const int size,
+                                                                 T* result,
+                                                                 const T* vector1,
+                                                                 const typename GetTypeReal::type* vector2,
+                                                                 const bool add)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < size)
+    {
+        if (add)
+        {
+            result[i] += vector1[i] * vector2[i];
+        }
+        else
+        {
+            result[i] = vector1[i] * vector2[i];
+        }
+    }
+}
+
+template
+__launch_bounds__(1024) __global__ void vector_div_vector_kernel(const int size,
+                                                                 T* result,
+                                                                 const T* vector1,
+                                                                 const typename GetTypeReal::type* vector2)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < size)
+    {
+        result[i] = vector1[i] / vector2[i];
+    }
+}
+
+template
+__launch_bounds__(1024) __global__ void constantvector_addORsub_constantVector_kernel(const int size,
+                                                                                      T* result,
+                                                                                      const T* vector1,
+                                                                                      const Real constant1,
+                                                                                      const T* vector2,
+                                                                                      const Real constant2)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < size)
+    {
+        result[i] = vector1[i] * constant1 + vector2[i] * constant2;
+    }
+}
+
+// vector operator: result[i] = vector[i] * constant
+template <>
+void vector_mul_real_op::operator()(const int dim,
+                                    double* result,
+                                    const double* vector,
+                                    const double constant)
+{
+    int thread = 1024;
+    int block = (dim + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_real_kernel),
+                       dim3(block),
+                       dim3(thread),
+                       0,
+                       0,
+                       dim,
+                       result,
+                       vector,
+                       constant);
+
+    hipCheckOnDebug();
+}
+template
+inline void vector_mul_real_wrapper(const int dim,
+                                    std::complex* result,
+                                    const std::complex* vector,
+                                    const FPTYPE constant)
+{
+    thrust::complex* result_tmp = reinterpret_cast*>(result);
+    const thrust::complex* vector_tmp = reinterpret_cast*>(vector);
+    int thread = 1024;
+    int block = (dim + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_real_kernel>),
+                       dim3(block),
+                       dim3(thread),
+                       0,
+                       0,
+                       dim,
+                       result_tmp,
+                       vector_tmp,
+                       constant);
+
+    hipCheckOnDebug();
+}
+template <>
+void vector_mul_real_op, base_device::DEVICE_GPU>::operator()(const int dim,
+                                                              std::complex* result,
+                                                              const std::complex* vector,
+                                                              const float constant)
+{
+    vector_mul_real_wrapper(dim, result, vector, constant);
+
+    hipCheckOnDebug();
+}
+template <>
+void vector_mul_real_op, base_device::DEVICE_GPU>::operator()(const int dim,
+                                                              std::complex* result,
+                                                              const std::complex* vector,
+                                                              const double constant)
+{
+    vector_mul_real_wrapper(dim, result, vector, constant);
+
+    hipCheckOnDebug();
+}
+
+// vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex)
+template <>
+void vector_mul_vector_op::operator()(const int& dim,
+                                      double* result,
+                                      const double* vector1,
+                                      const double* vector2,
+                                      const bool& add)
+{
+    int thread = 1024;
+    int block = (dim + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel),
+                       dim3(block),
+                       dim3(thread),
+                       0,
+                       0,
+                       dim,
+                       result,
+                       vector1,
+                       vector2,
+                       add);
+
+    hipCheckOnDebug();
+}
+
+// 
vector operator: result[i] = vector1[i](complex) * vector2[i](not complex) +template +inline void vector_mul_vector_complex_wrapper(const int& dim, + std::complex* result, + const std::complex* vector1, + const FPTYPE* vector2, + const bool& add) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); + int thread = 1024; + int block = (dim + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel>), + dim3(block), + dim3(thread), + 0, + 0, + dim, + result_tmp, + vector1_tmp, + vector2, + add); + + hipCheckOnDebug(); +} +template <> +void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()(const int& dim, + std::complex* result, + const std::complex* vector1, + const float* vector2, + const bool& add) +{ + vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); +} +template <> +void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()( + const int& dim, + std::complex* result, + const std::complex* vector1, + const double* vector2, + const bool& add) +{ + vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); +} + +// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) +template <> +void vector_div_vector_op::operator()(const int& dim, + double* result, + const double* vector1, + const double* vector2) +{ + int thread = 1024; + int block = (dim + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_vector_kernel), + dim3(block), + dim3(thread), + 0, + 0, + dim, + result, + vector1, + vector2); + + hipCheckOnDebug(); +} +// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) +template +inline void vector_div_vector_op_complex_wrapper(const int& dim, + std::complex* result, + const std::complex* vector1, + const FPTYPE* vector2) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); + int thread = 1024; + int block = (dim + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_vector_kernel>), + dim3(block), + dim3(thread), + 0, + 0, + dim, + result_tmp, + vector1_tmp, + vector2); + + hipCheckOnDebug(); +} +template <> +void vector_div_vector_op, base_device::DEVICE_GPU>::operator()(const int& dim, + std::complex* result, + const std::complex* vector1, + const float* vector2) +{ + vector_div_vector_op_complex_wrapper(dim, result, vector1, vector2); +} +template <> +void vector_div_vector_op, base_device::DEVICE_GPU>::operator()( + const int& dim, + std::complex* result, + const std::complex* vector1, + const double* vector2) +{ + vector_div_vector_op_complex_wrapper(dim, result, vector1, vector2); +} + +// vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2 +template +void constantvector_addORsub_constantVector_op::operator()(const int& dim, + T* result, + const T* vector1, + const Real constant1, + const T* vector2, + const Real constant2) +{ + using Type = typename GetTypeThrust::type; + using Real = typename GetTypeReal::type; + + auto result_tmp = reinterpret_cast(result); + auto vector1_tmp = reinterpret_cast(vector1); + auto vector2_tmp = reinterpret_cast(vector2); + + int thread = 1024; + int block = (dim + thread - 1) / thread; + constantvector_addORsub_constantVector_kernel + <<>>(dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2); + + hipCheckOnDebug(); +} + +template <> +double dot_real_op::operator()(const int& dim, + const double* psi_L, + 
const double* psi_R, + const bool reduce) +{ + double result = 0.0; + xdot_wrapper(dim, psi_L, 1, psi_R, 1, result); + if (reduce) + { + Parallel_Reduce::reduce_pool(result); + } + return result; +} + +// for this implementation, please check +// https://thrust.github.io/doc/group__transformed__reductions_ga321192d85c5f510e52300ae762c7e995.html denghui modify +// 2022-10-03 Note that ddot_(2*dim,a,1, b,1) = REAL( zdotc_(dim,a,1,b,1) ) GPU specialization of actual +// computation. +template +inline FPTYPE dot_complex_wrapper(const int& dim, + const std::complex* psi_L, + const std::complex* psi_R, + const bool reduce) +{ + //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + // denghui modify 2022-10-07 + // Note that ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) ) + const FPTYPE* pL = reinterpret_cast(psi_L); + const FPTYPE* pR = reinterpret_cast(psi_R); + FPTYPE result = 0.0; + xdot_wrapper(dim * 2, pL, 1, pR, 1, result); + if (reduce) + { + Parallel_Reduce::reduce_pool(result); + } + return result; +} +template <> +float dot_real_op, base_device::DEVICE_GPU>::operator()(const int& dim, + const std::complex* psi_L, + const std::complex* psi_R, + const bool reduce) +{ + return dot_complex_wrapper(dim, psi_L, psi_R, reduce); +} +template <> +double dot_real_op, base_device::DEVICE_GPU>::operator()(const int& dim, + const std::complex* psi_L, + const std::complex* psi_R, + const bool reduce) +{ + return dot_complex_wrapper(dim, psi_L, psi_R, reduce); +} + +// Explicitly instantiate functors for the types of functor registered. +template struct vector_mul_real_op, base_device::DEVICE_GPU>; +template struct vector_mul_real_op; +template struct vector_mul_real_op, base_device::DEVICE_GPU>; + +template struct vector_mul_vector_op; +template struct vector_mul_vector_op, base_device::DEVICE_GPU>; +template struct vector_mul_vector_op; +template struct vector_mul_vector_op, base_device::DEVICE_GPU>; +template struct vector_div_vector_op, base_device::DEVICE_GPU>; +template struct vector_div_vector_op; +template struct vector_div_vector_op, base_device::DEVICE_GPU>; + +template struct constantvector_addORsub_constantVector_op; +template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; +template struct constantvector_addORsub_constantVector_op; +template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; + +template struct dot_real_op, base_device::DEVICE_GPU>; +template struct dot_real_op; +template struct dot_real_op, base_device::DEVICE_GPU>; +} // namespace ModuleBase \ No newline at end of file diff --git a/source/module_base/kernels/rocm/math_op.hip.cu b/source/module_base/kernels/rocm/math_ylm_op.hip.cu similarity index 99% rename from source/module_base/kernels/rocm/math_op.hip.cu rename to source/module_base/kernels/rocm/math_ylm_op.hip.cu index 12e43bdde4..8cfdac4a39 100644 --- a/source/module_base/kernels/rocm/math_op.hip.cu +++ b/source/module_base/kernels/rocm/math_ylm_op.hip.cu @@ -1,4 +1,4 @@ -#include "module_base/kernels/math_op.h" +#include "module_base/kernels/math_ylm_op.h" #include #include diff --git a/source/module_base/kernels/test/CMakeLists.txt b/source/module_base/kernels/test/CMakeLists.txt index 1453545d14..c1cbf6771f 100644 --- a/source/module_base/kernels/test/CMakeLists.txt +++ b/source/module_base/kernels/test/CMakeLists.txt @@ -3,5 +3,5 @@ remove_definitions(-D__MPI) AddTest( TARGET Base_Kernels_UTs LIBS parameter ${math_libs} base device - SOURCES math_op_test.cpp math_kernel_test.cpp + 
SOURCES math_ylm_op_test.cpp math_kernel_test.cpp ) diff --git a/source/module_base/kernels/test/math_kernel_test.cpp b/source/module_base/kernels/test/math_kernel_test.cpp index 9490563743..b8cfb68387 100644 --- a/source/module_base/kernels/test/math_kernel_test.cpp +++ b/source/module_base/kernels/test/math_kernel_test.cpp @@ -72,7 +72,7 @@ class TestModuleHsolverMathKernel : public ::testing::Test // haozhihan add // cpu operator - using vector_div_constant_op_cpu = ModuleBase::vector_div_constant_op, base_device::DEVICE_CPU>; + using vector_mul_real_op_cpu = ModuleBase::vector_mul_real_op, base_device::DEVICE_CPU>; using vector_mul_vector_op_cpu = ModuleBase::vector_mul_vector_op, base_device::DEVICE_CPU>; using vector_div_vector_op_cpu = ModuleBase::vector_div_vector_op, base_device::DEVICE_CPU>; using constantvector_addORsub_constantVector_op_cpu @@ -81,7 +81,7 @@ class TestModuleHsolverMathKernel : public ::testing::Test using scal_op_cpu = ModuleBase::scal_op; using gemv_op_cpu = ModuleBase::gemv_op, base_device::DEVICE_CPU>; // gpu operator - using vector_div_constant_op_gpu = ModuleBase::vector_div_constant_op, base_device::DEVICE_GPU>; + using vector_mul_real_op_gpu = ModuleBase::vector_mul_real_op, base_device::DEVICE_GPU>; using vector_mul_vector_op_gpu = ModuleBase::vector_mul_vector_op, base_device::DEVICE_GPU>; using vector_div_vector_op_gpu = ModuleBase::vector_div_vector_op, base_device::DEVICE_GPU>; using constantvector_addORsub_constantVector_op_gpu @@ -117,10 +117,10 @@ class TestModuleHsolverMathKernel : public ::testing::Test {1.41257916e+00, 5.45282609e-01}, {-1.29333636e-01, -5.04228492e-03}}; - // (1) for test vector_div_constant_op + // (1) for test vector_mul_real_op const std::vector> input = L; const double constant = 5.5; - const std::vector> output_vector_div_constant_op = {{-0.11893203, -0.13492526}, + const std::vector> output_vector_mul_real_op = {{-0.11893203, -0.13492526}, {-0.40314756, 0.07734553}, {0.61158728, -0.45754102}, {-0.54274745, -0.09682102}, @@ -261,14 +261,14 @@ TEST_F(TestModuleHsolverMathKernel, zdot_real_op_cpu) EXPECT_LT(fabs(result - expected_result), 1e-12); } -TEST_F(TestModuleHsolverMathKernel, vector_div_constant_op_cpu) +TEST_F(TestModuleHsolverMathKernel, vector_mul_real_op_cpu) { std::vector> output(input.size()); - vector_div_constant_op_cpu()(dim, output.data(), input.data(), constant); + vector_mul_real_op_cpu()(dim, output.data(), input.data(), 1.0 / constant); for (int i = 0; i < input.size(); i++) { - EXPECT_LT(fabs(output[i].imag() - output_vector_div_constant_op[i].imag()), 1e-8); - EXPECT_LT(fabs(output[i].real() - output_vector_div_constant_op[i].real()), 1e-8); + EXPECT_LT(fabs(output[i].imag() - output_vector_mul_real_op[i].imag()), 1e-8); + EXPECT_LT(fabs(output[i].real() - output_vector_mul_real_op[i].real()), 1e-8); } } @@ -381,7 +381,7 @@ TEST_F(TestModuleHsolverMathKernel, zdot_real_op_gpu) delete_memory_op()(psi_R_dev); } -TEST_F(TestModuleHsolverMathKernel, vector_div_constant_op_gpu) +TEST_F(TestModuleHsolverMathKernel, vector_mul_real_op_gpu) { // in CPU std::vector> output(input.size()); @@ -393,14 +393,14 @@ TEST_F(TestModuleHsolverMathKernel, vector_div_constant_op_gpu) // syn the input data in CPU to GPU synchronize_memory_op()(input_dev, input.data(), input.size()); // run - vector_div_constant_op_gpu()(dim, output_dev, input_dev, constant); + vector_mul_real_op_gpu()(dim, output_dev, input_dev, 1.0 / constant); // syn the output data in GPU to CPU synchronize_memory_op_gpu()(output.data(), output_dev, 
output.size()); for (int i = 0; i < input.size(); i++) { - EXPECT_LT(fabs(output[i].imag() - output_vector_div_constant_op[i].imag()), 1e-8); - EXPECT_LT(fabs(output[i].real() - output_vector_div_constant_op[i].real()), 1e-8); + EXPECT_LT(fabs(output[i].imag() - output_vector_mul_real_op[i].imag()), 1e-8); + EXPECT_LT(fabs(output[i].real() - output_vector_mul_real_op[i].real()), 1e-8); } delete_memory_op()(input_dev); delete_memory_op()(output_dev); diff --git a/source/module_base/kernels/test/math_op_test.cpp b/source/module_base/kernels/test/math_ylm_op_test.cpp similarity index 99% rename from source/module_base/kernels/test/math_op_test.cpp rename to source/module_base/kernels/test/math_ylm_op_test.cpp index cfdedb234e..5da30d8111 100644 --- a/source/module_base/kernels/test/math_op_test.cpp +++ b/source/module_base/kernels/test/math_ylm_op_test.cpp @@ -1,4 +1,4 @@ -#include "module_base/kernels/math_op.h" +#include "module_base/kernels/math_ylm_op.h" #include "module_base/module_device/memory_op.h" diff --git a/source/module_base/math_ylmreal.cpp b/source/module_base/math_ylmreal.cpp index fac76cf959..69c79dc089 100644 --- a/source/module_base/math_ylmreal.cpp +++ b/source/module_base/math_ylmreal.cpp @@ -1,7 +1,7 @@ #include "math_ylmreal.h" #include "constants.h" -#include "module_base/kernels/math_op.h" +#include "module_base/kernels/math_ylm_op.h" #include "module_base/libm/libm.h" #include "module_base/module_device/memory_op.h" #include "module_base/array_pool.h" diff --git a/source/module_hsolver/diago_bpcg.cpp b/source/module_hsolver/diago_bpcg.cpp index 2a03fe13b9..83914874e5 100644 --- a/source/module_hsolver/diago_bpcg.cpp +++ b/source/module_hsolver/diago_bpcg.cpp @@ -1,18 +1,17 @@ -#include - #include "module_hsolver/diago_bpcg.h" -#include -#include - -#include - #include "diago_iter_assist.h" #include "module_base/blas_connector.h" #include "module_base/global_function.h" #include "module_base/kernels/math_kernel_op.h" +#include "module_hsolver/kernels/bpcg_kernel_op.h" #include "para_linear_transform.h" +#include +#include +#include +#include + namespace hsolver { template @@ -100,7 +99,13 @@ void DiagoBPCG::line_minimize( ct::Tensor& psi_out, ct::Tensor& hpsi_out) { - line_minimize_with_block_op()(grad_in.data(), hgrad_in.data(), psi_out.data(), hpsi_out.data(), this->n_dim, this->n_basis, this->n_band_l); + line_minimize_with_block_op()(grad_in.data(), + hgrad_in.data(), + psi_out.data(), + hpsi_out.data(), + this->n_dim, + this->n_basis, + this->n_band_l); } @@ -138,17 +143,16 @@ void DiagoBPCG::calc_grad_with_block( ct::Tensor& grad_out, ct::Tensor& grad_old_out) { - calc_grad_with_block_op()( - prec_in.data(), - err_out.data(), - beta_out.data(), - psi_in.data(), - hpsi_in.data(), - grad_out.data(), - grad_old_out.data(), - this->n_dim, - this->n_basis, - this->n_band_l); + calc_grad_with_block_op()(prec_in.data(), + err_out.data(), + beta_out.data(), + psi_in.data(), + hpsi_in.data(), + grad_out.data(), + grad_old_out.data(), + this->n_dim, + this->n_basis, + this->n_band_l); } template diff --git a/source/module_hsolver/diago_bpcg.h b/source/module_hsolver/diago_bpcg.h index 243ac8b44f..27d8e6a0ae 100644 --- a/source/module_hsolver/diago_bpcg.h +++ b/source/module_hsolver/diago_bpcg.h @@ -349,8 +349,6 @@ class DiagoBPCG // note: these operators use template parameter base_device::Device_* // defined in module_base/module_device/types.h // different from ct_Device! 
- using calc_grad_with_block_op = ModuleBase::calc_grad_with_block_op; - using line_minimize_with_block_op = ModuleBase::line_minimize_with_block_op; using gemm_op = ModuleBase::gemm_op; }; diff --git a/source/module_hsolver/diago_cg.cpp b/source/module_hsolver/diago_cg.cpp index 08d5d914ba..9eba7c2bda 100644 --- a/source/module_hsolver/diago_cg.cpp +++ b/source/module_hsolver/diago_cg.cpp @@ -555,7 +555,7 @@ void DiagoCG::schmit_orth(const int& m, const ct::Tensor& psi, const // { // pphi_m[ig] /= psi_norm; // } - ModuleBase::vector_div_constant_op()(this->n_basis_, phi_m.data(), phi_m.data(), psi_norm); + ModuleBase::vector_mul_real_op()(this->n_basis_, phi_m.data(), phi_m.data(), Real(1.0 / psi_norm)); // ModuleBase::timer::tick("DiagoCG","schmit_orth"); } diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp index 09ff916917..9e8aa2f066 100644 --- a/source/module_hsolver/diago_dav_subspace.cpp +++ b/source/module_hsolver/diago_dav_subspace.cpp @@ -366,20 +366,18 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, } // "normalize!!!" in order to improve numerical stability of subspace diagonalization - std::vector psi_norm(notconv, 0.0); for (size_t i = 0; i < notconv; i++) { - psi_norm[i] = ModuleBase::dot_real_op()(this->dim, + Real psi_norm = ModuleBase::dot_real_op()(this->dim, psi_iter + (nbase + i) * this->dim, psi_iter + (nbase + i) * this->dim, true); - assert(psi_norm[i] > 0.0); - psi_norm[i] = sqrt(psi_norm[i]); - - ModuleBase::vector_div_constant_op()(this->dim, - psi_iter + (nbase + i) * this->dim, - psi_iter + (nbase + i) * this->dim, - psi_norm[i]); + assert(psi_norm > 0.0); + psi_norm = sqrt(psi_norm); + ModuleBase::vector_mul_real_op()(this->dim, + psi_iter + (nbase + i) * this->dim, + psi_iter + (nbase + i) * this->dim, + Real(1.0 / psi_norm)); } // update hpsi[:, nbase:nbase+notconv] diff --git a/source/module_hsolver/diago_david.cpp b/source/module_hsolver/diago_david.cpp index 504c3a2053..59c40c9fa4 100644 --- a/source/module_hsolver/diago_david.cpp +++ b/source/module_hsolver/diago_david.cpp @@ -966,7 +966,7 @@ void DiagoDavid::SchmidtOrth(const int& dim, else { // psi_m = psi_m / psi_norm - ModuleBase::vector_div_constant_op()(dim, psi_m, psi_m, psi_norm); + ModuleBase::vector_mul_real_op()(dim, psi_m, psi_m, Real(1.0 / psi_norm)); // for (int i = 0; i < npw; i++) // { // psi_m[i] /= psi_norm; diff --git a/source/module_hsolver/kernels/bpcg_kernel_op.cpp b/source/module_hsolver/kernels/bpcg_kernel_op.cpp new file mode 100644 index 0000000000..bf05ac94e1 --- /dev/null +++ b/source/module_hsolver/kernels/bpcg_kernel_op.cpp @@ -0,0 +1,113 @@ +#include "module_hsolver/kernels/bpcg_kernel_op.h" +#include "module_base/blas_connector.h" +#include "module_base/kernels/math_kernel_op.h" +#include "module_base/parallel_reduce.h" +namespace hsolver +{ + +template +struct line_minimize_with_block_op +{ + using Real = typename GetTypeReal::type; + void operator()(T* grad_out, + T* hgrad_out, + T* psi_out, + T* hpsi_out, + const int& n_basis, + const int& n_basis_max, + const int& n_band) + { + for (int band_idx = 0; band_idx < n_band; band_idx++) + { + Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0; + Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0; + auto A = reinterpret_cast(grad_out + band_idx * n_basis_max); + Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); + Parallel_Reduce::reduce_pool(norm); + norm = 1.0 / sqrt(norm); + for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) + { + auto item 
= band_idx * n_basis_max + basis_idx;
+                grad_out[item] *= norm;
+                hgrad_out[item] *= norm;
+                epsilo_0 += std::real(hpsi_out[item] * std::conj(psi_out[item]));
+                epsilo_1 += std::real(grad_out[item] * std::conj(hpsi_out[item]));
+                epsilo_2 += std::real(grad_out[item] * std::conj(hgrad_out[item]));
+            }
+            Parallel_Reduce::reduce_pool(epsilo_0);
+            Parallel_Reduce::reduce_pool(epsilo_1);
+            Parallel_Reduce::reduce_pool(epsilo_2);
+            theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2)));
+            cos_theta = std::cos(theta);
+            sin_theta = std::sin(theta);
+            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
+            {
+                auto item = band_idx * n_basis_max + basis_idx;
+                psi_out[item] = psi_out[item] * cos_theta + grad_out[item] * sin_theta;
+                hpsi_out[item] = hpsi_out[item] * cos_theta + hgrad_out[item] * sin_theta;
+            }
+        }
+    }
+};
+
+template
+struct calc_grad_with_block_op
+{
+    using Real = typename GetTypeReal::type;
+    void operator()(const Real* prec_in,
+                    Real* err_out,
+                    Real* beta_out,
+                    T* psi_out,
+                    T* hpsi_out,
+                    T* grad_out,
+                    T* grad_old_out,
+                    const int& n_basis,
+                    const int& n_basis_max,
+                    const int& n_band)
+    {
+        for (int band_idx = 0; band_idx < n_band; band_idx++)
+        {
+            Real err = 0.0;
+            Real beta = 0.0;
+            Real epsilo = 0.0;
+            Real grad_2 = {0.0};
+            T grad_1 = {0.0, 0.0};
+            auto A = reinterpret_cast(psi_out + band_idx * n_basis_max);
+            Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
+            Parallel_Reduce::reduce_pool(norm);
+            norm = 1.0 / sqrt(norm);
+            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
+            {
+                auto item = band_idx * n_basis_max + basis_idx;
+                psi_out[item] *= norm;
+                hpsi_out[item] *= norm;
+                epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item]));
+            }
+            Parallel_Reduce::reduce_pool(epsilo);
+            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
+            {
+                auto item = band_idx * n_basis_max + basis_idx;
+                grad_1 = hpsi_out[item] - epsilo * psi_out[item];
+                grad_2 = std::norm(grad_1);
+                err += grad_2;
+                beta += grad_2 / prec_in[basis_idx]; /// Note: check whether dividing by prec is intended here.
+            }
+            Parallel_Reduce::reduce_pool(err);
+            Parallel_Reduce::reduce_pool(beta);
+            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
+            {
+                auto item = band_idx * n_basis_max + basis_idx;
+                grad_1 = hpsi_out[item] - epsilo * psi_out[item];
+                grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_out[band_idx] * grad_old_out[item];
+            }
+            beta_out[band_idx] = beta;
+            err_out[band_idx] = sqrt(err);
+        }
+    }
+};
+
+template struct calc_grad_with_block_op, base_device::DEVICE_CPU>;
+template struct line_minimize_with_block_op, base_device::DEVICE_CPU>;
+template struct calc_grad_with_block_op, base_device::DEVICE_CPU>;
+template struct line_minimize_with_block_op, base_device::DEVICE_CPU>;
+} // namespace hsolver
\ No newline at end of file
diff --git a/source/module_hsolver/kernels/bpcg_kernel_op.h b/source/module_hsolver/kernels/bpcg_kernel_op.h
new file mode 100644
index 0000000000..fcf8f99267
--- /dev/null
+++ b/source/module_hsolver/kernels/bpcg_kernel_op.h
@@ -0,0 +1,84 @@
+#ifndef MODULE_HSOLVER_BPCG_KERNEL_H
+#define MODULE_HSOLVER_BPCG_KERNEL_H
+#include "module_base/macros.h"
+#include "module_base/module_device/types.h"
+namespace hsolver
+{
+
+template
+struct line_minimize_with_block_op
+{
+    /// @brief For each band, normalize the gradient and rotate psi and hpsi
+    /// along the search direction: psi = psi * cos(theta) + grad * sin(theta),
+    /// where theta is obtained from <psi|hpsi>, <grad|hpsi> and <grad|hgrad>.
+    /// MPI communication may occur when the plane-wave parallelization
+    /// strategy is enabled.
+    ///
+    /// Input Parameters
+    /// \param grad_out : gradient array (n_band x n_basis_max)
+    /// \param hgrad_out : H applied to the gradient
+    /// \param psi_out : wave functions
+    /// \param hpsi_out : H applied to the wave functions
+    /// \param n_basis : number of basis functions actually used
+    /// \param n_basis_max : leading dimension (maximum number of basis functions)
+    /// \param n_band : number of bands
+    ///
+    /// Output Parameters
+    /// \param psi_out, hpsi_out : updated in place along the search direction
+    void operator()(T* grad_out,
+                    T* hgrad_out,
+                    T* psi_out,
+                    T* hpsi_out,
+                    const int& n_basis,
+                    const int& n_basis_max,
+                    const int& n_band);
+};
+
+template
+struct calc_grad_with_block_op
+{
+    /// @brief For each band, normalize psi and hpsi, compute the band energy
+    /// epsilo = <psi|hpsi>, and build the new preconditioned gradient from the
+    /// residual hpsi - epsilo * psi and the previous gradient. The per-band
+    /// error and beta are returned as well. MPI communication may occur when
+    /// the plane-wave parallelization strategy is enabled.
+    ///
+    /// Input Parameters
+    /// \param prec_in : preconditioner array
+    /// \param psi_out : wave functions
+    /// \param hpsi_out : H applied to the wave functions
+    /// \param grad_old_out : gradient of the previous step
+    /// \param n_basis : number of basis functions actually used
+    /// \param n_basis_max : leading dimension (maximum number of basis functions)
+    /// \param n_band : number of bands
+    ///
+    /// Output Parameters
+    /// \param err_out : per-band residual norm
+    /// \param beta_out : per-band beta used to mix in the old gradient
+    /// \param grad_out : new preconditioned gradient
+    using Real = typename GetTypeReal::type;
+    void operator()(const Real* prec_in,
+                    Real* err_out,
+                    Real* beta_out,
+                    T* psi_out,
+                    T* hpsi_out,
+                    T* grad_out,
+                    T* grad_old_out,
+                    const int& n_basis,
+                    const int& n_basis_max,
+                    const int& n_band);
+};
+
+#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
+
+template
+struct line_minimize_with_block_op {
+    using Real = typename GetTypeReal::type;
+    void operator()(T *grad_out, T *hgrad_out, T *psi_out, T *hpsi_out,
+                    const int &n_basis, const int &n_basis_max,
+                    const int &n_band);
+};
+
+template
+struct calc_grad_with_block_op {
+    using Real = typename GetTypeReal::type;
+    void operator()(const Real *prec_in, Real *err_out, Real *beta_out,
+                    T *psi_out, T *hpsi_out, T *grad_out, T *grad_old_out,
+                    const int &n_basis, const int &n_basis_max,
+                    const int &n_band);
+};
+
+#endif
+} // namespace hsolver
+
+#endif
\ No newline at end of file
diff --git a/source/module_hsolver/kernels/cuda/bpcg_kernel_op.cu b/source/module_hsolver/kernels/cuda/bpcg_kernel_op.cu
new file mode 100644
index 0000000000..d851265a1b
--- /dev/null
+++ b/source/module_hsolver/kernels/cuda/bpcg_kernel_op.cu
@@ -0,0 +1,300 @@
+#include "module_base/kernels/math_kernel_op.h"
+#include "module_hsolver/kernels/bpcg_kernel_op.h"
+
+#include
+#include
+namespace hsolver
+{
+const int warp_size = 32;
+const int thread_per_block = 256;
+
+template
+__global__ void line_minimize_with_block(
+    thrust::complex* grad,
+    thrust::complex* hgrad,
+    thrust::complex* psi,
+    thrust::complex* hpsi,
+    const int n_basis,
+    const int n_basis_max)
+{
+    int band_idx = blockIdx.x; // band_idx
+    int tid = threadIdx.x; // basis_idx
+    int item = 0;
+    Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0;
+    Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0;
+    __shared__ Real data[thread_per_block * 3];
+
+    data[tid] = 0;
+
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) {
+        item = band_idx * n_basis_max + basis_idx;
+        data[tid] += (grad[item] * thrust::conj(grad[item])).real();
+    }
+    __syncthreads();
+    // just do some parallel reduction in shared memory
+    for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) {
+        if (tid < ii) {
+            data[tid] += data[tid + ii];
+        }
+        __syncthreads();
+    }
+
+    // For threads in the same warp, it is better that they process the same work
+    // Also, __syncwarp() should be used instead of __syncthreads()
+    // Therefore we unroll the loop and ensure that the threads do the same work
+    if (tid < warp_size) 
{ + data[tid] += data[tid + 32]; __syncwarp(); + data[tid] += data[tid + 16]; __syncwarp(); + data[tid] += data[tid + 8]; __syncwarp(); + data[tid] += data[tid + 4]; __syncwarp(); + data[tid] += data[tid + 2]; __syncwarp(); + data[tid] += data[tid + 1]; __syncwarp(); + } + + __syncthreads(); + + Real norm = 1.0 / sqrt(data[0]); + __syncthreads(); + + data[tid] = 0; + data[thread_per_block + tid] = 0; + data[2 * thread_per_block + tid] = 0; + for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { + item = band_idx * n_basis_max + basis_idx; + grad[item] *= norm; + hgrad[item] *= norm; + data[tid] += (hpsi[item] * thrust::conj(psi[item])).real(); + data[thread_per_block + tid] += (grad[item] * thrust::conj(hpsi[item])).real(); + data[2 * thread_per_block + tid] += (grad[item] * thrust::conj(hgrad[item])).real(); + } + __syncthreads(); + + // just do some parallel reduction in shared memory + for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { + if (tid < ii) { + data[tid] += data[tid + ii]; + data[thread_per_block + tid] += data[thread_per_block + tid + ii]; + data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + ii]; + } + __syncthreads(); + } + if (tid < warp_size) { + data[tid] += data[tid + 32]; __syncwarp(); + data[tid] += data[tid + 16]; __syncwarp(); + data[tid] += data[tid + 8]; __syncwarp(); + data[tid] += data[tid + 4]; __syncwarp(); + data[tid] += data[tid + 2]; __syncwarp(); + data[tid] += data[tid + 1]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 32]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 16]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 8]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 4]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 2]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 1]; __syncwarp(); + data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 32]; __syncwarp(); + data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 16]; __syncwarp(); + data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 8]; __syncwarp(); + data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 4]; __syncwarp(); + data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 2]; __syncwarp(); + data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 1]; __syncwarp(); + } + __syncthreads(); + epsilo_0 = data[0]; + epsilo_1 = data[thread_per_block]; + epsilo_2 = data[2 * thread_per_block]; + + theta = 0.5 * abs(atan(2 * epsilo_1/(epsilo_0 - epsilo_2))); + cos_theta = cos(theta); + sin_theta = sin(theta); + for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { + item = band_idx * n_basis_max + basis_idx; + psi [item] = psi [item] * cos_theta + grad [item] * sin_theta; + hpsi[item] = hpsi[item] * cos_theta + hgrad[item] * sin_theta; + } +} + +template +__global__ void calc_grad_with_block( + const Real* prec, + Real* err, + Real* beta, + thrust::complex* psi, + thrust::complex* hpsi, + thrust::complex* grad, + thrust::complex* grad_old, + const int n_basis, + const int n_basis_max) +{ + int band_idx = blockIdx.x; // band_idx + int tid = threadIdx.x; // basis_idx + int item = 0; + Real err_st = 0.0; + Real beta_st = 0.0; + Real epsilo = 0.0; + Real grad_2 = 0.0; + thrust::complex grad_1 = {0, 0}; + __shared__ Real data[thread_per_block * 
2]; + + // Init shared memory + data[tid] = 0; + + for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { + item = band_idx * n_basis_max + basis_idx; + data[tid] += (psi[item] * thrust::conj(psi[item])).real(); + } + __syncthreads(); + // just do some parallel reduction in shared memory + for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { + if (tid < ii) { + data[tid] += data[tid + ii]; + } + __syncthreads(); + } + + if (tid < warp_size) { + data[tid] += data[tid + 32]; __syncwarp(); + data[tid] += data[tid + 16]; __syncwarp(); + data[tid] += data[tid + 8]; __syncwarp(); + data[tid] += data[tid + 4]; __syncwarp(); + data[tid] += data[tid + 2]; __syncwarp(); + data[tid] += data[tid + 1]; __syncwarp(); + } + + __syncthreads(); + + Real norm = 1.0 / sqrt(data[0]); + __syncthreads(); + + data[tid] = 0; + for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { + item = band_idx * n_basis_max + basis_idx; + psi[item] *= norm; + hpsi[item] *= norm; + data[tid] += (hpsi[item] * thrust::conj(psi[item])).real(); + } + __syncthreads(); + + // just do some parallel reduction in shared memory + for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { + if (tid < ii) { + data[tid] += data[tid + ii]; + } + __syncthreads(); + } + + if (tid < warp_size) { + data[tid] += data[tid + 32]; __syncwarp(); + data[tid] += data[tid + 16]; __syncwarp(); + data[tid] += data[tid + 8]; __syncwarp(); + data[tid] += data[tid + 4]; __syncwarp(); + data[tid] += data[tid + 2]; __syncwarp(); + data[tid] += data[tid + 1]; __syncwarp(); + } + + __syncthreads(); + epsilo = data[0]; + __syncthreads(); + + data[tid] = 0; + data[thread_per_block + tid] = 0; + for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { + item = band_idx * n_basis_max + basis_idx; + grad_1 = hpsi[item] - epsilo * psi[item]; + grad_2 = thrust::norm(grad_1); + data[tid] += grad_2; + data[thread_per_block + tid] += grad_2 / prec[basis_idx]; + } + __syncthreads(); + + // just do some parallel reduction in shared memory + for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { + if (tid < ii) { + data[tid] += data[tid + ii]; + data[thread_per_block + tid] += data[thread_per_block + tid + ii]; + } + __syncthreads(); + } + + if (tid < warp_size) { + data[tid] += data[tid + 32]; __syncwarp(); + data[tid] += data[tid + 16]; __syncwarp(); + data[tid] += data[tid + 8]; __syncwarp(); + data[tid] += data[tid + 4]; __syncwarp(); + data[tid] += data[tid + 2]; __syncwarp(); + data[tid] += data[tid + 1]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 32]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 16]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 8]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 4]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 2]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 1]; __syncwarp(); + } + + __syncthreads(); + err_st = data[0]; + beta_st = data[thread_per_block]; + for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { + item = band_idx * n_basis_max + basis_idx; + grad_1 = hpsi[item] - epsilo * psi[item]; + grad[item] = -grad_1 / prec[basis_idx] + beta_st / beta[band_idx] * grad_old[item]; + } + + __syncthreads(); + if (tid == 0) { + beta[band_idx] = beta_st; + err[band_idx] = sqrt(err_st); + } +} + +template +void 
+template <typename T>
+void line_minimize_with_block_op<T, base_device::DEVICE_GPU>::operator()(T* grad_out,
+                                                                          T* hgrad_out,
+                                                                          T* psi_out,
+                                                                          T* hpsi_out,
+                                                                          const int& n_basis,
+                                                                          const int& n_basis_max,
+                                                                          const int& n_band)
+{
+    auto A = reinterpret_cast<thrust::complex<Real>*>(grad_out);
+    auto B = reinterpret_cast<thrust::complex<Real>*>(hgrad_out);
+    auto C = reinterpret_cast<thrust::complex<Real>*>(psi_out);
+    auto D = reinterpret_cast<thrust::complex<Real>*>(hpsi_out);
+
+    line_minimize_with_block<Real><<<n_band, thread_per_block>>>(
+        A, B, C, D,
+        n_basis, n_basis_max);
+
+    cudaCheckOnDebug();
+}
+
+template <typename T>
+void calc_grad_with_block_op<T, base_device::DEVICE_GPU>::operator()(const Real* prec_in,
+                                                                      Real* err_out,
+                                                                      Real* beta_out,
+                                                                      T* psi_out,
+                                                                      T* hpsi_out,
+                                                                      T* grad_out,
+                                                                      T* grad_old_out,
+                                                                      const int& n_basis,
+                                                                      const int& n_basis_max,
+                                                                      const int& n_band)
+{
+    auto A = reinterpret_cast<thrust::complex<Real>*>(psi_out);
+    auto B = reinterpret_cast<thrust::complex<Real>*>(hpsi_out);
+    auto C = reinterpret_cast<thrust::complex<Real>*>(grad_out);
+    auto D = reinterpret_cast<thrust::complex<Real>*>(grad_old_out);
+
+    calc_grad_with_block<Real><<<n_band, thread_per_block>>>(
+        prec_in, err_out, beta_out,
+        A, B, C, D,
+        n_basis, n_basis_max);
+
+    cudaCheckOnDebug();
+}
+
+template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
+
+}
\ No newline at end of file
diff --git a/source/module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu b/source/module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu
new file mode 100644
index 0000000000..850b48bf2e
--- /dev/null
+++ b/source/module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu
@@ -0,0 +1,224 @@
+#include "module_base/kernels/math_kernel_op.h"
+#include "module_hsolver/kernels/bpcg_kernel_op.h"
+
+#include <hip/hip_runtime.h>
+#include <thrust/complex.h>
+#define WARP_SIZE 32
+#define THREAD_PER_BLOCK 256
+namespace hsolver
+{
+template <typename Real>
+__global__ void line_minimize_with_block(
+    thrust::complex<Real>* grad,
+    thrust::complex<Real>* hgrad,
+    thrust::complex<Real>* psi,
+    thrust::complex<Real>* hpsi,
+    const int n_basis,
+    const int n_basis_max)
+{
+    int band_idx = blockIdx.x; // band_idx
+    int tid = threadIdx.x;     // basis_idx
+    int item = 0;
+    Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0;
+    Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0;
+    __shared__ Real data[THREAD_PER_BLOCK * 3];
+
+    data[tid] = 0;
+
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+        item = band_idx * n_basis_max + basis_idx;
+        data[tid] += (grad[item] * thrust::conj(grad[item])).real();
+    }
+    __syncthreads();
+    // just do some parallel reduction in shared memory
+    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+        if (tid < ii) {
+            data[tid] += data[tid + ii];
+        }
+        __syncthreads();
+    }
+
+    Real norm = 1.0 / sqrt(data[0]);
+    __syncthreads();
+
+    data[tid] = 0;
+    data[THREAD_PER_BLOCK + tid] = 0;
+    data[2 * THREAD_PER_BLOCK + tid] = 0;
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+        item = band_idx * n_basis_max + basis_idx;
+        grad[item] *= norm;
+        hgrad[item] *= norm;
+        data[tid] += (hpsi[item] * thrust::conj(psi[item])).real();
+        data[THREAD_PER_BLOCK + tid] += (grad[item] * thrust::conj(hpsi[item])).real();
+        data[2 * THREAD_PER_BLOCK + tid] += (grad[item] * thrust::conj(hgrad[item])).real();
+    }
+    __syncthreads();
+
+    // just do some parallel reduction in shared memory
+    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+        if (tid < ii) {
+            data[tid] += data[tid + ii];
+            data[THREAD_PER_BLOCK + tid] += data[THREAD_PER_BLOCK + tid + ii];
+            data[2 * THREAD_PER_BLOCK + tid] += data[2 * THREAD_PER_BLOCK + tid + ii];
+        }
+        __syncthreads();
+    }
+
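+    // data[0], data[THREAD_PER_BLOCK] and data[2 * THREAD_PER_BLOCK] now hold
+    // <psi|H|psi>, <grad|H|psi> and <grad|H|grad>, which fix the rotation angle theta.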
+    epsilo_0 = data[0];
+    epsilo_1 = data[THREAD_PER_BLOCK];
+    epsilo_2 = data[2 * THREAD_PER_BLOCK];
+
+    theta = 0.5 * abs(atan(2 * epsilo_1 / (epsilo_0 - epsilo_2)));
+    cos_theta = cos(theta);
+    sin_theta = sin(theta);
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+        item = band_idx * n_basis_max + basis_idx;
+        psi [item] = psi [item] * cos_theta + grad [item] * sin_theta;
+        hpsi[item] = hpsi[item] * cos_theta + hgrad[item] * sin_theta;
+    }
+}
+
+template <typename Real>
+__global__ void calc_grad_with_block(
+    const Real* prec,
+    Real* err,
+    Real* beta,
+    thrust::complex<Real>* psi,
+    thrust::complex<Real>* hpsi,
+    thrust::complex<Real>* grad,
+    thrust::complex<Real>* grad_old,
+    const int n_basis,
+    const int n_basis_max)
+{
+    int band_idx = blockIdx.x; // band_idx
+    int tid = threadIdx.x;     // basis_idx
+    int item = 0;
+    Real err_st = 0.0;
+    Real beta_st = 0.0;
+    Real epsilo = 0.0;
+    Real grad_2 = 0.0;
+    thrust::complex<Real> grad_1 = {0, 0};
+    __shared__ Real data[THREAD_PER_BLOCK * 2];
+
+    // Init shared memory
+    data[tid] = 0;
+
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+        item = band_idx * n_basis_max + basis_idx;
+        data[tid] += (psi[item] * thrust::conj(psi[item])).real();
+    }
+    __syncthreads();
+    // just do some parallel reduction in shared memory
+    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+        if (tid < ii) {
+            data[tid] += data[tid + ii];
+        }
+        __syncthreads();
+    }
+
+    Real norm = 1.0 / sqrt(data[0]);
+    __syncthreads();
+
+    data[tid] = 0;
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+        item = band_idx * n_basis_max + basis_idx;
+        psi[item] *= norm;
+        hpsi[item] *= norm;
+        data[tid] += (hpsi[item] * thrust::conj(psi[item])).real();
+    }
+    __syncthreads();
+
+    // just do some parallel reduction in shared memory
+    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+        if (tid < ii) {
+            data[tid] += data[tid + ii];
+        }
+        __syncthreads();
+    }
+    epsilo = data[0];
+    __syncthreads();
+
+    data[tid] = 0;
+    data[THREAD_PER_BLOCK + tid] = 0;
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+        item = band_idx * n_basis_max + basis_idx;
+        grad_1 = hpsi[item] - epsilo * psi[item];
+        grad_2 = thrust::norm(grad_1);
+        data[tid] += grad_2;
+        data[THREAD_PER_BLOCK + tid] += grad_2 / prec[basis_idx];
+    }
+    __syncthreads();
+
+    // just do some parallel reduction in shared memory
+    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+        if (tid < ii) {
+            data[tid] += data[tid + ii];
+            data[THREAD_PER_BLOCK + tid] += data[THREAD_PER_BLOCK + tid + ii];
+        }
+        __syncthreads();
+    }
+    err_st = data[0];
+    beta_st = data[THREAD_PER_BLOCK];
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+        item = band_idx * n_basis_max + basis_idx;
+        grad_1 = hpsi[item] - epsilo * psi[item];
+        grad[item] = -grad_1 / prec[basis_idx] + beta_st / beta[band_idx] * grad_old[item];
+    }
+
+    __syncthreads();
+    if (tid == 0) {
+        beta[band_idx] = beta_st;
+        err[band_idx] = sqrt(err_st);
+    }
+}
+
+template <typename T>
+void line_minimize_with_block_op<T, base_device::DEVICE_GPU>::operator()(T* grad_out,
+                                                                          T* hgrad_out,
+                                                                          T* psi_out,
+                                                                          T* hpsi_out,
+                                                                          const int& n_basis,
+                                                                          const int& n_basis_max,
+                                                                          const int& n_band)
+{
+    auto A = reinterpret_cast<thrust::complex<Real>*>(grad_out);
+    auto B = reinterpret_cast<thrust::complex<Real>*>(hgrad_out);
+    auto C = reinterpret_cast<thrust::complex<Real>*>(psi_out);
+    auto D = reinterpret_cast<thrust::complex<Real>*>(hpsi_out);
+
+    line_minimize_with_block<Real><<<n_band, THREAD_PER_BLOCK>>>(
+        A, B, C, D,
+        n_basis, n_basis_max);
+
+    hipCheckOnDebug();
+}
+
+template <typename T>
+void calc_grad_with_block_op<T, base_device::DEVICE_GPU>::operator()(const Real* prec_in,
+                                                                      Real* err_out,
+                                                                      Real* beta_out,
+                                                                      T* psi_out,
+                                                                      T* hpsi_out,
+                                                                      T* grad_out,
+                                                                      T* grad_old_out,
+                                                                      const int& n_basis,
+                                                                      const int& n_basis_max,
+                                                                      const int& n_band)
+{
+    auto A = reinterpret_cast<thrust::complex<Real>*>(psi_out);
+    auto B = reinterpret_cast<thrust::complex<Real>*>(hpsi_out);
+    auto C = reinterpret_cast<thrust::complex<Real>*>(grad_out);
+    auto D = reinterpret_cast<thrust::complex<Real>*>(grad_old_out);
+
+    calc_grad_with_block<Real><<<n_band, THREAD_PER_BLOCK>>>(
+        prec_in, err_out, beta_out,
+        A, B, C, D,
+        n_basis, n_basis_max);
+
+    hipCheckOnDebug();
+}
+
+template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
+}
\ No newline at end of file
diff --git a/source/module_hsolver/kernels/test/perf_math_kernel.cpp b/source/module_hsolver/kernels/test/perf_math_kernel.cpp
index bb69ca7ac7..f9f0228c1f 100644
--- a/source/module_hsolver/kernels/test/perf_math_kernel.cpp
+++ b/source/module_hsolver/kernels/test/perf_math_kernel.cpp
@@ -17,7 +17,7 @@
 /**
 * Tested function:
 * - zdot_real_cpu_op
-* - vector_div_constant_op_cpu
+* - vector_mul_real_op_cpu
 * - vector_mul_vector_op_cpu
 * - vector_div_vector_op_cpu
 * - constantvector_addORsub_constantVector_op_cpu
@@ -25,7 +25,7 @@
 * - scal_cpu
 *
 * - zdot_real_gpu_op
-* - vector_div_constant_op_gpu
+* - vector_mul_real_op_gpu
 * - vector_mul_vector_op_gpu
 * - vector_div_vector_op_gpu
 * - constantvector_addORsub_constantVector_op_gpu
@@ -134,7 +134,7 @@ class PerfModuleHsolverMathKernel : public benchmark::Fixture {
     // CPU operator
     using zdot_real_cpu_op = ModuleBase::dot_real_op<std::complex<double>, base_device::DEVICE_CPU>;
-    using vector_div_constant_op_cpu = ModuleBase::vector_div_constant_op<std::complex<double>, base_device::DEVICE_CPU>;
+    using vector_mul_real_op_cpu = ModuleBase::vector_mul_real_op<std::complex<double>, base_device::DEVICE_CPU>;
     using vector_mul_vector_op_cpu = ModuleBase::vector_mul_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
     using vector_div_vector_op_cpu = ModuleBase::vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
     using constantvector_addORsub_constantVector_op_cpu
@@ -148,7 +148,7 @@ class PerfModuleHsolverMathKernel : public benchmark::Fixture {
     // GPU operator
     using zdot_real_gpu_op = ModuleBase::dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
-    using vector_div_constant_op_gpu = ModuleBase::vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
+    using vector_mul_real_op_gpu = ModuleBase::vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>;
     using vector_mul_vector_op_gpu = ModuleBase::vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
     using vector_div_vector_op_gpu = ModuleBase::vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
     using constantvector_addORsub_constantVector_op_gpu
@@ -167,9 +167,9 @@ BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_zdot_real_cpu_op)(benchmark::
 }
 
-BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_vector_div_constant_op_cpu)(benchmark::State& state) {
+BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_vector_mul_real_op_cpu)(benchmark::State& state) {
     for (auto _ : state) {
-        vector_div_constant_op_cpu()(dim_vector, result_zvector, test_zvector_a, dconstant_a);
+        vector_mul_real_op_cpu()(dim_vector, result_zvector, test_zvector_a, dconstant_a);
     }
 }
@@ -205,7 +205,7 @@ BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_scal_op_cpu)(benchmark::State
 
 BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_zdot_real_cpu_op)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
-BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_div_constant_op_cpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
+BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_mul_real_op_cpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
 BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_mul_vector_op_cpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
 BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_div_vector_op_cpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
 BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_constantvector_addORsub_constantVector_op_cpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
@@ -236,9 +236,9 @@ BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_zdot_real_gpu_op)(benchmark::
     }
 }
 
-BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_vector_div_constant_op_gpu)(benchmark::State& state) {
+BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_vector_mul_real_op_gpu)(benchmark::State& state) {
     for (auto _ : state) {
-        vector_div_constant_op_gpu()(dim_vector, result_zvector_gpu, test_zvector_a_gpu, dconstant_a);
+        vector_mul_real_op_gpu()(dim_vector, result_zvector_gpu, test_zvector_a_gpu, dconstant_a);
     }
 }
@@ -276,7 +276,7 @@ BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_scal_op_gpu)(benchmark::State
 // BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_zdot_real_gpu_op)->RangeMultiplier(10)->Range(1,10e6)->UseManualTime()->Unit(benchmark::kMicrosecond);
 BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_zdot_real_gpu_op)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
-BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_div_constant_op_gpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
+BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_mul_real_op_gpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
 BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_mul_vector_op_gpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
 BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_div_vector_op_gpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
 BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_constantvector_addORsub_constantVector_op_gpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
diff --git a/tests/integrate/101_PW_upf201_uspp_NaCl/result.ref b/tests/integrate/101_PW_upf201_uspp_NaCl/result.ref
index 6896763a6c..f53a4b9f43 100644
--- a/tests/integrate/101_PW_upf201_uspp_NaCl/result.ref
+++ b/tests/integrate/101_PW_upf201_uspp_NaCl/result.ref
@@ -1,7 +1,7 @@
 etotref -1675.0621003323110472
 etotperatomref -837.5310501662
-totalforceref 9.445612
-totalstressref 1646.076135
+totalforceref 9.443968
+totalstressref 1645.827919
 pointgroupref C_2v
 spacegroupref C_2v
 nksibzref 5
diff --git a/tests/integrate/102_PW_PINT_UKS/result.ref b/tests/integrate/102_PW_PINT_UKS/result.ref
index 476cf3e367..9db5bb132a 100644
--- a/tests/integrate/102_PW_PINT_UKS/result.ref
+++ b/tests/integrate/102_PW_PINT_UKS/result.ref
@@ -1,3 +1,3 @@
-etotref -6147.553111898429
-etotperatomref -3073.7765559492
+etotref -6147.5531114
+etotperatomref -3073.7765557
 totaltimeref 14.32
diff --git a/tests/integrate/107_PW_outWfcR/result.ref b/tests/integrate/107_PW_outWfcR/result.ref
index 1880e707f7..c610d46c6f 100644
--- a/tests/integrate/107_PW_outWfcR/result.ref
+++ b/tests/integrate/107_PW_outWfcR/result.ref
@@ -1,8 +1,8 @@
 etotref -197.1405644417868
 etotperatomref -98.5702822209
 variance_wfc_r_0_0 0.31340
-variance_wfc_r_0_1 1.71055
-variance_wfc_r_0_2 2.39603
+variance_wfc_r_0_1 1.71056
+variance_wfc_r_0_2 2.39604
 variance_wfc_r_0_3 1.66607
 variance_wfc_r_0_4 1.05190
 variance_wfc_r_0_5 1.29386