diff --git a/source/module_base/kernels/cuda/math_kernel_op.cu b/source/module_base/kernels/cuda/math_kernel_op.cu index 39043daff1..7227df630d 100644 --- a/source/module_base/kernels/cuda/math_kernel_op.cu +++ b/source/module_base/kernels/cuda/math_kernel_op.cu @@ -325,16 +325,23 @@ __global__ void vector_div_constant_kernel( } template -__global__ void vector_mul_vector_kernel( - const int size, - T* result, - const T* vector1, - const typename GetTypeReal::type* vector2) +__global__ void vector_mul_vector_kernel(const int size, + T* result, + const T* vector1, + const typename GetTypeReal::type* vector2, + const bool add) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < size) { - result[i] = vector1[i] * vector2[i]; + if (add) + { + result[i] += vector1[i] * vector2[i]; + } + else + { + result[i] = vector1[i] * vector2[i]; + } } } @@ -548,11 +555,12 @@ template <> void vector_mul_vector_op::operator()(const int& dim, double* result, const double* vector1, - const double* vector2) + const double* vector2, + const bool& add) { int thread = thread_per_block; int block = (dim + thread - 1) / thread; - vector_mul_vector_kernel <<>> (dim, result, vector1, vector2); + vector_mul_vector_kernel <<>> (dim, result, vector1, vector2, add); cudaCheckOnDebug(); } @@ -561,13 +569,14 @@ template inline void vector_mul_vector_complex_wrapper(const int& dim, std::complex* result, const std::complex* vector1, - const FPTYPE* vector2) + const FPTYPE* vector2, + const bool& add) { thrust::complex* result_tmp = reinterpret_cast*>(result); const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); int thread = thread_per_block; int block = (dim + thread - 1) / thread; - vector_mul_vector_kernel> <<>> (dim, result_tmp, vector1_tmp, vector2); + vector_mul_vector_kernel> <<>> (dim, result_tmp, vector1_tmp, vector2, add); cudaCheckOnDebug(); } @@ -575,18 +584,20 @@ template <> void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()(const int& dim, std::complex* result, const std::complex* vector1, - const float* vector2) + const float* vector2, + const bool& add) { - vector_mul_vector_complex_wrapper(dim, result, vector1, vector2); + vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); } template <> void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()( const int& dim, std::complex* result, const std::complex* vector1, - const double* vector2) + const double* vector2, + const bool& add) { - vector_mul_vector_complex_wrapper(dim, result, vector1, vector2); + vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); } // vector operator: result[i] = vector1[i](not complex) / vector2[i](not complex) @@ -1019,6 +1030,7 @@ template struct dot_real_op, base_device::DEVICE_GPU>; template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; template struct vector_div_constant_op, base_device::DEVICE_GPU>; +template struct vector_mul_vector_op; template struct vector_mul_vector_op, base_device::DEVICE_GPU>; template struct vector_div_vector_op, base_device::DEVICE_GPU>; template struct constantvector_addORsub_constantVector_op; @@ -1029,6 +1041,7 @@ template struct dot_real_op, base_device::DEVICE_GPU>; template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; template struct vector_div_constant_op, base_device::DEVICE_GPU>; +template struct vector_mul_vector_op; template struct vector_mul_vector_op, base_device::DEVICE_GPU>; template struct vector_div_vector_op, base_device::DEVICE_GPU>; template struct constantvector_addORsub_constantVector_op; @@ -1039,7 +1052,6 @@ template struct matrixCopy, base_device::DEVICE_GPU>; #ifdef __LCAO template struct dot_real_op; template struct vector_div_constant_op; -template struct vector_mul_vector_op; template struct vector_div_vector_op; #endif } // namespace ModuleBase diff --git a/source/module_base/kernels/math_kernel_op.cpp b/source/module_base/kernels/math_kernel_op.cpp index bb0bb923a5..9420d7c4b7 100644 --- a/source/module_base/kernels/math_kernel_op.cpp +++ b/source/module_base/kernels/math_kernel_op.cpp @@ -167,14 +167,27 @@ template struct vector_mul_vector_op { using Real = typename GetTypeReal::type; - void operator()(const int& dim, T* result, const T* vector1, const Real* vector2) + void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add) { + if (add) + { #ifdef _OPENMP #pragma omp parallel for schedule(static, 4096 / sizeof(Real)) #endif - for (int i = 0; i < dim; i++) + for (int i = 0; i < dim; i++) + { + result[i] += vector1[i] * vector2[i]; + } + } + else { - result[i] = vector1[i] * vector2[i]; +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector1[i] * vector2[i]; + } } } }; diff --git a/source/module_base/kernels/math_kernel_op.h b/source/module_base/kernels/math_kernel_op.h index d94aeb6806..daca2ac548 100644 --- a/source/module_base/kernels/math_kernel_op.h +++ b/source/module_base/kernels/math_kernel_op.h @@ -143,11 +143,11 @@ template struct vector_mul_vector_op { /// \param dim : array size /// \param vector1 : input array A /// \param vector2 : input array B + /// \param add : flag to control whether to add the result to the output array /// /// Output Parameters /// \param result : output array - void operator()(const int &dim, T *result, const T *vector1, - const Real *vector2); + void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false); }; // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) @@ -350,8 +350,7 @@ struct vector_div_constant_op { // vector operator: result[i] = vector1[i](complex) * vector2[i](not complex) template struct vector_mul_vector_op { using Real = typename GetTypeReal::type; - void operator()(const int &dim, T *result, - const T *vector1, const Real *vector2); + void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false); }; // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) diff --git a/source/module_base/kernels/rocm/math_kernel_op.hip.cu b/source/module_base/kernels/rocm/math_kernel_op.hip.cu index 0279bfee75..a6a787faf8 100644 --- a/source/module_base/kernels/rocm/math_kernel_op.hip.cu +++ b/source/module_base/kernels/rocm/math_kernel_op.hip.cu @@ -248,12 +248,20 @@ __global__ void vector_mul_vector_kernel( const int size, T* result, const T* vector1, - const typename GetTypeReal::type* vector2) + const typename GetTypeReal::type* vector2, + const bool add) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < size) { - result[i] = vector1[i] * vector2[i]; + if (add) + { + result[i] += vector1[i] * vector2[i]; + } + else + { + result[i] = vector1[i] * vector2[i]; + } } } @@ -471,11 +479,12 @@ template <> void vector_mul_vector_op::operator()(const int& dim, double* result, const double* vector1, - const double* vector2) + const double* vector2, + const bool& add) { int thread = 1024; int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2); + hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2, add); hipCheckOnDebug(); } @@ -485,13 +494,14 @@ template inline void vector_mul_vector_complex_wrapper(const int& dim, std::complex* result, const std::complex* vector1, - const FPTYPE* vector2) + const FPTYPE* vector2, + const bool& add) { thrust::complex* result_tmp = reinterpret_cast*>(result); const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); int thread = 1024; int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2); + hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2, add); hipCheckOnDebug(); } @@ -499,18 +509,20 @@ template <> void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()(const int& dim, std::complex* result, const std::complex* vector1, - const float* vector2) + const float* vector2, + const bool& add) { - vector_mul_vector_complex_wrapper(dim, result, vector1, vector2); + vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); } template <> void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()( const int& dim, std::complex* result, const std::complex* vector1, - const double* vector2) + const double* vector2, + const bool& add) { - vector_mul_vector_complex_wrapper(dim, result, vector1, vector2); + vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); } // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) template <> @@ -931,6 +943,7 @@ template struct dot_real_op, base_device::DEVICE_GPU>; template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; template struct vector_div_constant_op, base_device::DEVICE_GPU>; +template struct vector_mul_vector_op; template struct vector_mul_vector_op, base_device::DEVICE_GPU>; template struct vector_div_vector_op, base_device::DEVICE_GPU>; template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; @@ -940,6 +953,7 @@ template struct dot_real_op, base_device::DEVICE_GPU>; template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; template struct vector_div_constant_op, base_device::DEVICE_GPU>; +template struct vector_mul_vector_op; template struct vector_mul_vector_op, base_device::DEVICE_GPU>; template struct vector_div_vector_op, base_device::DEVICE_GPU>; template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; @@ -948,7 +962,6 @@ template struct matrixCopy, base_device::DEVICE_GPU>; #ifdef __LCAO template struct dot_real_op; template struct vector_div_constant_op; -template struct vector_mul_vector_op; template struct vector_div_vector_op; template struct matrixCopy; template struct constantvector_addORsub_constantVector_op; diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp index 3bddfc2dfa..4626286801 100644 --- a/source/module_esolver/esolver_ks_pw.cpp +++ b/source/module_esolver/esolver_ks_pw.cpp @@ -943,7 +943,7 @@ void ESolver_KS_PW::after_all_runners(UnitCell& ucell) //! 7) Use Kubo-Greenwood method to compute conductivities if (PARAM.inp.cal_cond) { - EleCond elec_cond(&ucell, &this->kv, this->pelec, this->pw_wfc, this->psi, &this->ppcell); + EleCond elec_cond(&ucell, &this->kv, this->pelec, this->pw_wfc, this->kspw_psi, &this->ppcell); elec_cond.KG(PARAM.inp.cond_smear, PARAM.inp.cond_fwhm, PARAM.inp.cond_wcut, diff --git a/source/module_hamilt_pw/hamilt_pwdft/elecond.cpp b/source/module_hamilt_pw/hamilt_pwdft/elecond.cpp index 5f59858132..5997a139cf 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/elecond.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/elecond.cpp @@ -1,10 +1,12 @@ #include "elecond.h" -#include "module_parameter/parameter.h" #include "module_base/global_function.h" #include "module_base/global_variable.h" +#include "module_base/kernels/math_kernel_op.h" +#include "module_base/parallel_device.h" #include "module_elecstate/occupy.h" #include "module_io/binstream.h" +#include "module_parameter/parameter.h" #include @@ -24,9 +26,13 @@ #define TWOSQRT2LN2 2.354820045030949 // FWHM = 2sqrt(2ln2) * \sigma #define FACTOR 1.839939223835727e7 -EleCond::EleCond(UnitCell* p_ucell_in, K_Vectors* p_kv_in, elecstate::ElecState* p_elec_in, - ModulePW::PW_Basis_K* p_wfcpw_in, psi::Psi>* p_psi_in, - pseudopot_cell_vnl* p_ppcell_in) +template +EleCond::EleCond(UnitCell* p_ucell_in, + K_Vectors* p_kv_in, + elecstate::ElecState* p_elec_in, + ModulePW::PW_Basis_K* p_wfcpw_in, + psi::Psi, Device>* p_psi_in, + pseudopot_cell_vnl* p_ppcell_in) { this->p_ppcell = p_ppcell_in; this->p_ucell = p_ucell_in; @@ -36,8 +42,14 @@ EleCond::EleCond(UnitCell* p_ucell_in, K_Vectors* p_kv_in, elecstate::ElecState* this->p_psi = p_psi_in; } -void EleCond::KG(const int& smear_type, const double& fwhmin, const double& wcut, const double& dw_in, - const double& dt_in, const bool& nonlocal, ModuleBase::matrix& wg) +template +void EleCond::KG(const int& smear_type, + const double& fwhmin, + const double& wcut, + const double& dw_in, + const double& dt_in, + const bool& nonlocal, + ModuleBase::matrix& wg) { //----------------------------------------------------------- // KS conductivity @@ -72,7 +84,7 @@ void EleCond::KG(const int& smear_type, const double& fwhmin, const double& wcut std::vector ct12(nt, 0); std::vector ct22(nt, 0); - hamilt::Velocity velop(this->p_wfcpw, this->p_kv->isk.data(), this->p_ppcell, this->p_ucell, nonlocal); + hamilt::Velocity velop(this->p_wfcpw, this->p_kv->isk.data(), this->p_ppcell, this->p_ucell, nonlocal); double decut = (wcut + fwhmin) / ModuleBase::Ry_to_eV; std::cout << "Recommended dt: " << 0.25 * M_PI / decut << " a.u." << std::endl; for (int ik = 0; ik < nk; ++ik) @@ -94,44 +106,79 @@ void EleCond::KG(const int& smear_type, const double& fwhmin, const double& wcut } } -void EleCond::jjresponse_ks(const int ik, const int nt, const double dt, const double decut, ModuleBase::matrix& wg, - hamilt::Velocity& velop, double* ct11, double* ct12, double* ct22) +template +void EleCond::jjresponse_ks(const int ik, + const int nt, + const double dt, + const double decut, + ModuleBase::matrix& wg, + hamilt::Velocity& velop, + double* ct11, + double* ct12, + double* ct22) { const int nbands = PARAM.inp.nbands; - if (wg(ik, 0) - wg(ik, nbands - 1) < 1e-8 || nbands == 0) { + if (wg(ik, 0) - wg(ik, nbands - 1) < 1e-8 || nbands == 0) + { return; -} - const char transn = 'N'; - const char transc = 'C'; + } const int ndim = 3; - const int npwx = this->p_wfcpw->npwk_max; + const int npwk_max = this->p_wfcpw->npwk_max; const double ef = this->p_elec->eferm.ef; const int npw = this->p_kv->ngk[ik]; const int reducenb2 = (nbands - 1) * nbands / 2; const bool gamma_only = false; // ABACUS do not support gamma_only yet. - std::complex* levc = &(this->p_psi[0](ik, 0, 0)); - std::vector> prevc(ndim * npwx * nbands); - std::vector> pij(nbands * nbands); + std::complex* levc = &(this->p_psi[0](ik, 0, 0)); + psi::Psi, Device> v_psi(1, ndim * nbands, npwk_max, npw, true); + std::complex* pij_d = nullptr; + resmem_complex_op()(pij_d, nbands * nbands); + std::vector pij2(reducenb2, 0); // px|right> - velop.act(this->p_psi, nbands * PARAM.globalv.npol, levc, prevc.data()); + velop.act(this->p_psi, nbands * PARAM.globalv.npol, levc, v_psi.get_pointer()); + std::complex one = 1.0; + std::complex zero = 0.0; for (int id = 0; id < ndim; ++id) { + ModuleBase::gemm_op, Device>()('C', + 'N', + nbands, + nbands, + npw, + &one, + levc, + npwk_max, + v_psi.get_pointer() + id * npwk_max * nbands, + npwk_max, + &zero, + pij_d, + nbands); - zgemm_(&transc, &transn, &nbands, &nbands, &npw, &ModuleBase::ONE, levc, &npwx, - prevc.data() + id * npwx * nbands, &npwx, &ModuleBase::ZERO, pij.data(), &nbands); + std::complex* pij_c = nullptr; + if(std::is_same::value) + { + pij_c = pij_d; + } + else + { + std::vector> pij_h(nbands * nbands); + syncmem_complex_d2h_op()(pij_h.data(), pij_d, nbands * nbands); + pij_c = pij_h.data(); + } + #ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, pij.data(), nbands * nbands, MPI_DOUBLE_COMPLEX, MPI_SUM, POOL_WORLD); + Parallel_Common::reduce_data(pij_c, nbands * nbands, POOL_WORLD); #endif - if (!gamma_only) { + if (!gamma_only) + { for (int ib = 0, ijb = 0; ib < nbands; ++ib) { for (int jb = ib + 1; jb < nbands; ++jb, ++ijb) { - pij2[ijb] += norm(pij[ib * nbands + jb]); + pij2[ijb] += static_cast(std::norm(pij_c[ib * nbands + jb])); } } -} + } } if (GlobalV::RANK_IN_POOL == 0) @@ -185,11 +232,20 @@ void EleCond::jjresponse_ks(const int ik, const int nt, const double dt, const d ct12[it] += tmct12 / 2.0; ct22[it] += tmct22 / 2.0; } + delmem_complex_op()(pij_d); return; } -void EleCond::calcondw(const int nt, const double dt, const int& smear_type, const double fwhmin, const double wcut, - const double dw_in, double* ct11, double* ct12, double* ct22) +template +void EleCond::calcondw(const int nt, + const double dt, + const int& smear_type, + const double fwhmin, + const double wcut, + const double dw_in, + double* ct11, + double* ct12, + double* ct22) { double factor = FACTOR; const int ndim = 3; @@ -259,4 +315,11 @@ void EleCond::calcondw(const int nt, const double dt, const int& smear_type, con std::cout << std::setprecision(6) << "Thermal conductivity: " << kappa0 << " W(mK)^-1" << std::endl; std::cout << std::setprecision(6) << "Lorenz number: " << Lorent0 << " k_B^2/e^2" << std::endl; ofscond.close(); -} \ No newline at end of file +} + +template class EleCond; +template class EleCond; +#if ((defined __CUDA) || (defined __ROCM)) +template class EleCond; +template class EleCond; +#endif \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/elecond.h b/source/module_hamilt_pw/hamilt_pwdft/elecond.h index a8c99b22dc..8ccc0a1408 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/elecond.h +++ b/source/module_hamilt_pw/hamilt_pwdft/elecond.h @@ -9,11 +9,20 @@ #include "module_hamilt_pw/hamilt_pwdft/VNL_in_pw.h" #include "module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.h" +template class EleCond { public: - EleCond(UnitCell* p_ucell_in, K_Vectors* p_kv_in, elecstate::ElecState* p_elec_in, ModulePW::PW_Basis_K* p_wfcpw_in, - psi::Psi>* p_psi_in, pseudopot_cell_vnl* p_ppcell_in); + using resmem_complex_op = base_device::memory::resize_memory_op, Device>; + using delmem_complex_op = base_device::memory::delete_memory_op, Device>; + using syncmem_complex_d2h_op = base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, Device>; + public: + EleCond(UnitCell* p_ucell_in, + K_Vectors* p_kv_in, + elecstate::ElecState* p_elec_in, + ModulePW::PW_Basis_K* p_wfcpw_in, + psi::Psi, Device>* p_psi_in, + pseudopot_cell_vnl* p_ppcell_in); ~EleCond(){}; /** @@ -27,16 +36,21 @@ class EleCond * @param nonlocal whether to include the nonlocal potential corrections for velocity operator * @param wg wg(ik,ib) occupation for the ib-th band in the ik-th kpoint */ - void KG(const int& smear_type, const double& fwhmin, const double& wcut, const double& dw_in, const double& dt_in, - const bool& nonlocal, ModuleBase::matrix& wg); + void KG(const int& smear_type, + const double& fwhmin, + const double& wcut, + const double& dw_in, + const double& dt_in, + const bool& nonlocal, + ModuleBase::matrix& wg); protected: - pseudopot_cell_vnl* p_ppcell = nullptr; ///< pointer to the pseudopotential - UnitCell* p_ucell = nullptr; ///< pointer to the unit cell - ModulePW::PW_Basis_K* p_wfcpw = nullptr; ///< pointer to the plane wave basis - K_Vectors* p_kv = nullptr; ///< pointer to the k vectors - elecstate::ElecState* p_elec = nullptr; ///< pointer to the electronic state - psi::Psi>* p_psi = nullptr; ///< pointer to the wavefunction + pseudopot_cell_vnl* p_ppcell = nullptr; ///< pointer to the pseudopotential + UnitCell* p_ucell = nullptr; ///< pointer to the unit cell + ModulePW::PW_Basis_K* p_wfcpw = nullptr; ///< pointer to the plane wave basis + K_Vectors* p_kv = nullptr; ///< pointer to the k vectors + elecstate::ElecState* p_elec = nullptr; ///< pointer to the electronic state + psi::Psi, Device>* p_psi = nullptr; ///< pointer to the wavefunction protected: /** @@ -52,8 +66,15 @@ class EleCond * @param ct12 C12(t) * @param ct22 C22(t) */ - void jjresponse_ks(const int ik, const int nt, const double dt, const double decut, ModuleBase::matrix& wg, - hamilt::Velocity& velop, double* ct11, double* ct12, double* ct22); + void jjresponse_ks(const int ik, + const int nt, + const double dt, + const double decut, + ModuleBase::matrix& wg, + hamilt::Velocity& velop, + double* ct11, + double* ct12, + double* ct22); /** * @brief Calculate the conductivity using the response function * @@ -67,8 +88,15 @@ class EleCond * @param ct12 C12 component of the response function * @param ct22 C22 component of the response function */ - void calcondw(const int nt, const double dt, const int& smear_type, const double fwhmin, const double wcut, - const double dw_in, double* ct11, double* ct12, double* ct22); + void calcondw(const int nt, + const double dt, + const int& smear_type, + const double fwhmin, + const double wcut, + const double dw_in, + double* ct11, + double* ct12, + double* ct22); }; #endif // ELECOND_H \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.cpp index bc2ef072ea..3998ef9b85 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.cpp @@ -1,19 +1,19 @@ #include "velocity_pw.h" -#include "module_base/timer.h" + +#include "module_base/kernels/math_kernel_op.h" #include "module_base/parallel_reduce.h" +#include "module_base/timer.h" namespace hamilt { -Velocity::Velocity -( - const ModulePW::PW_Basis_K* wfcpw_in, - const int* isk_in, - pseudopot_cell_vnl* ppcell_in, - const UnitCell* ucell_in, - const bool nonlocal_in -) +template +Velocity::Velocity(const ModulePW::PW_Basis_K* wfcpw_in, + const int* isk_in, + pseudopot_cell_vnl* ppcell_in, + const UnitCell* ucell_in, + const bool nonlocal_in) { - if( wfcpw_in == nullptr || isk_in == nullptr || ppcell_in == nullptr || ucell_in == nullptr) + if (wfcpw_in == nullptr || isk_in == nullptr || ppcell_in == nullptr || ucell_in == nullptr) { ModuleBase::WARNING_QUIT("Velocity", "Constuctor of Operator::Velocity is failed, please check your code!"); } @@ -22,114 +22,199 @@ Velocity::Velocity this->ppcell = ppcell_in; this->ucell = ucell_in; this->nonlocal = nonlocal_in; - this->tpiba = ucell_in -> tpiba; - if(this->nonlocal) { this->ppcell->initgradq_vnl(*this->ucell); + this->tpiba = ucell_in->tpiba; + if (this->nonlocal) + { + this->ppcell->initgradq_vnl(*this->ucell); + } } + +template +Velocity::~Velocity() +{ + delmem_var_op()(this->gx_); + delmem_var_op()(this->gy_); + delmem_var_op()(this->gz_); + delmem_complex_op()(vkb_); + delmem_complex_op()(gradvkb_); } -void Velocity::init(const int ik_in) +template +void Velocity::init(const int ik_in) { this->ik = ik_in; + // init G+K + const int npw = this->wfcpw->npwk[ik_in]; + const int npwk_max = this->wfcpw->npwk_max; + std::vector gtmp(npw); + resmem_var_op()(gx_, npw); + resmem_var_op()(gy_, npw); + resmem_var_op()(gz_, npw); + std::vector gtmp_ptr = {this->gx_, this->gy_, this->gz_}; + for(int i=0; i<3; ++i) + { + for (int ig = 0; ig < npw; ++ig) + { + const ModuleBase::Vector3 tmpg = wfcpw->getgpluskcar(this->ik, ig); + gtmp[ig] = static_cast(tmpg[i] * tpiba); + } + syncmem_var_h2d_op()(gtmp_ptr[i], gtmp.data(), npw); + } + // Calculate nonlocal pseudopotential vkb - if(this->ppcell->nkb > 0 && this->nonlocal) - { - this->ppcell->getgradq_vnl(*this->ucell,ik_in); - } + if (this->ppcell->nkb > 0 && this->nonlocal) + { + this->ppcell->getgradq_vnl(*this->ucell, ik_in); + + // sync to device + if (std::is_same::value || std::is_same::value) + { + const int nkb = this->ppcell->nkb; + // vkb + resmem_complex_op()(vkb_, nkb * npwk_max); + castmem_complex_h2d_op()(vkb_, this->ppcell->vkb.c, nkb * npwk_max); + // gradvkb + resmem_complex_op()(gradvkb_, 3 * nkb * npwk_max); + castmem_complex_h2d_op()(gradvkb_, this->ppcell->gradvkb.ptr, 3 * nkb * npwk_max); + } + } } -void Velocity::act -( - const psi::Psi> *psi_in, - const int n_npwx, //nbands * NPOL - const std::complex* psi0, - std::complex* vpsi, - const bool add -) const +template +void Velocity::act(const psi::Psi, Device>* psi_in, + const int n_npwx, + const std::complex* psi0, + std::complex* vpsi, + const bool add) const { ModuleBase::timer::tick("Operator", "Velocity"); - const int npw = psi_in->get_current_nbas(); - - const int max_npw = psi_in->get_nbasis() / psi_in->get_npol(); + const int npw = this->wfcpw->npwk[this->ik]; + const int max_npw = this->wfcpw->npwk_max; const int npol = psi_in->get_npol(); - const std::complex* tmpsi_in = psi0; - std::complex* tmhpsi = vpsi; + + std::vector gtmp_ptr = {this->gx_, this->gy_, this->gz_}; // ------------- // p // ------------- - for (int ib = 0; ib < n_npwx; ++ib) + for (int id = 0; id < 3; ++id) { - for (int ig = 0; ig < npw; ++ig) + const Complex* tmpsi_in = psi0; + Complex* tmpvpsi = vpsi + id * n_npwx * max_npw; + for (int ib = 0; ib < n_npwx; ++ib) { - const ModuleBase::Vector3& tmpg = wfcpw->getgpluskcar(this->ik, ig); - if(add) - { - tmhpsi[ig] += tmpsi_in[ig] * tmpg.x * tpiba; - tmhpsi[ig + n_npwx * max_npw] += tmpsi_in[ig] * tmpg.y * tpiba; - tmhpsi[ig + 2 * n_npwx * max_npw]+= tmpsi_in[ig] * tmpg.z * tpiba; - } - else - { - tmhpsi[ig] = tmpsi_in[ig] * tmpg.x * tpiba; - tmhpsi[ig + n_npwx * max_npw] = tmpsi_in[ig] * tmpg.y * tpiba; - tmhpsi[ig + 2 * n_npwx * max_npw] = tmpsi_in[ig] * tmpg.z * tpiba; - } + ModuleBase::vector_mul_vector_op()(npw, tmpvpsi, tmpsi_in, gtmp_ptr[id], add); + tmpvpsi += max_npw; + tmpsi_in += max_npw; } - tmhpsi += max_npw; - tmpsi_in += max_npw; } // --------------------------------------------- - // i[V_NL, r] = (\nabla_q+\nabla_q')V_{NL}(q,q') + // i[V_NL, r] = (\nabla_q+\nabla_q')V_{NL}(q,q') // |\beta><\beta|\psi> // --------------------------------------------- - if (this->ppcell->nkb <= 0 || !this->nonlocal) + if (this->ppcell->nkb <= 0 || !this->nonlocal) { ModuleBase::timer::tick("Operator", "Velocity"); return; } - //1. <\beta|\psi> + // 1. <\beta|\psi> + Complex* becp1_ = nullptr; ///<[Device, n_npwx * nkb] <\beta|\psi> + Complex* becp2_ = nullptr; ///<[Device, n_npwx * 3*nkb] <\nabla\beta|\psi> + Complex* ps1_ = nullptr; ///<[Device, nkb * n_npwx] sum of becp1 + Complex* ps2_ = nullptr; ///<[Device, 3*nkb * n_npwx] sum of becp2 + resmem_complex_op()(ps1_, this->ppcell->nkb * n_npwx); + resmem_complex_op()(ps2_, 3 * this->ppcell->nkb * n_npwx); + resmem_complex_op()(becp1_, this->ppcell->nkb * n_npwx); + resmem_complex_op()(becp2_, 3 * this->ppcell->nkb * n_npwx); + const int nkb = this->ppcell->nkb; const int nkb3 = 3 * nkb; - ModuleBase::ComplexMatrix becp1(n_npwx, nkb, false); - ModuleBase::ComplexMatrix becp2(n_npwx, nkb3, false); - char transC = 'C'; - char transN = 'N'; - char transT = 'T'; - const int npm = n_npwx; + Complex one = 1.0; + Complex zero = 0.0; + + Complex* vkb_d = reinterpret_cast(this->ppcell->vkb.c); + Complex* gradvkb_d = reinterpret_cast(this->ppcell->gradvkb.ptr); + if (std::is_same::value || std::is_same::value) + { + vkb_d = vkb_; + gradvkb_d = gradvkb_; + } + if (n_npwx == 1) { int inc = 1; - zgemv_(&transC, &npw, &nkb, - &ModuleBase::ONE, this->ppcell->vkb.c, &max_npw, psi0, &inc, - &ModuleBase::ZERO, becp1.c, &inc); - zgemv_(&transC, &npw, &nkb3, - &ModuleBase::ONE, this->ppcell->gradvkb.ptr, &max_npw, psi0, &inc, - &ModuleBase::ZERO, becp2.c, &inc); + ModuleBase::gemv_op()('C', npw, nkb, &one, vkb_d, max_npw, psi0, inc, &zero, becp1_, inc); + ModuleBase::gemv_op()('C', npw, nkb3, &one, gradvkb_d, max_npw, psi0, inc, &zero, becp2_, inc); } else { - zgemm_(&transC, &transN, &nkb, &n_npwx, &npw, - &ModuleBase::ONE, this->ppcell->vkb.c, &max_npw, psi0, &max_npw, - &ModuleBase::ZERO, becp1.c, &nkb); - zgemm_(&transC, &transN, &nkb3, &n_npwx, &npw, - &ModuleBase::ONE, this->ppcell->gradvkb.ptr, &max_npw, psi0, &max_npw, - &ModuleBase::ZERO, becp2.c, &nkb3); + ModuleBase::gemm_op()('C', + 'N', + nkb, + n_npwx, + npw, + &one, + vkb_d, + max_npw, + psi0, + max_npw, + &zero, + becp1_, + nkb); + ModuleBase::gemm_op()('C', + 'N', + nkb3, + n_npwx, + npw, + &one, + gradvkb_d, + max_npw, + psi0, + max_npw, + &zero, + becp2_, + nkb3); } - Parallel_Reduce::reduce_pool(becp1.c, nkb * n_npwx); - Parallel_Reduce::reduce_pool(becp2.c, nkb3 * n_npwx); - //2. <\beta \psi> tmp_space1, tmp_space2; + if(std::is_same::value) + { + tmp_space1.resize(nkb * n_npwx + nkb3 * n_npwx); + becp1_cpu = tmp_space1.data(); + becp2_cpu = becp1_cpu + nkb * n_npwx; + syncmem_complex_d2h_op()(becp1_cpu, becp1_, nkb * n_npwx); + syncmem_complex_d2h_op()(becp2_cpu, becp2_, nkb3 * n_npwx); + + tmp_space2.resize(nkb * n_npwx + nkb3 * n_npwx, 0.0); + ps1_cpu = tmp_space2.data(); + ps2_cpu = ps1_cpu + nkb * n_npwx; + } + else + { + Parallel_Reduce::reduce_pool(becp1_, nkb * n_npwx); + Parallel_Reduce::reduce_pool(becp2_, nkb3 * n_npwx); + becp1_cpu = becp1_; + becp2_cpu = becp2_; + + setmem_complex_op()(ps1_, 0.0, nkb * n_npwx); + setmem_complex_op()(ps2_, 0.0, nkb3 * n_npwx); + ps1_cpu = ps1_; + ps2_cpu = ps2_; + } + // 2. <\beta \psi>isk[ik]; + const int current_spin = 0; for (int it = 0; it < this->ucell->ntype; it++) { const int nproj = this->ucell->atoms[it].ncpp.nh; @@ -141,13 +226,13 @@ void Velocity::act { for (int ib = 0; ib < n_npwx; ++ib) { - double dij = this->ppcell->deeq(current_spin, iat, ip, ip2); + FPTYPE dij = static_cast(this->ppcell->deeq(current_spin, iat, ip, ip2)); int sumip2 = sum + ip2; int sumip = sum + ip; - ps1(sumip2, ib) += dij * becp1(ib, sumip); - ps2(sumip2, ib) += dij * becp2(ib, sumip); - ps2(sumip2 + nkb, ib) += dij * becp2(ib, sumip + nkb); - ps2(sumip2 + 2*nkb, ib) += dij * becp2(ib , sumip + 2*nkb); + ps1_cpu[sumip2 * n_npwx + ib] += dij * becp1_cpu[ib * nkb + sumip]; + ps2_cpu[sumip2 * n_npwx + ib] += dij * becp2_cpu[ib * nkb3 + sumip]; + ps2_cpu[(sumip2 + nkb) * n_npwx + ib] += dij * becp2_cpu[ib * nkb3 + sumip + nkb]; + ps2_cpu[(sumip2 + 2 * nkb) * n_npwx + ib] += dij * becp2_cpu[ib * nkb3 + sumip + 2 * nkb]; } } } @@ -158,86 +243,95 @@ void Velocity::act } else { - for (int it = 0; it < this->ucell->ntype; it++) - { - const int nproj = this->ucell->atoms[it].ncpp.nh; - for (int ia = 0; ia < this->ucell->atoms[it].na; ia++) - { - for (int ip = 0; ip < nproj; ip++) - { - for (int ip2 = 0; ip2 < nproj; ip2++) - { - for (int ib = 0; ib < n_npwx; ib+=2) - { - int sumip2 = sum + ip2; - int sumip = sum + ip; - std::complex pol1becp1 = becp1(ib, sumip); - std::complex pol2becp1 = becp1(ib+1, sumip); - std::complex pol1becp2x = becp2(ib, sumip); - std::complex pol2becp2x = becp2(ib+1, sumip); - std::complex pol1becp2y = becp2(ib, sumip + nkb); - std::complex pol2becp2y = becp2(ib+1, sumip + nkb); - std::complex pol1becp2z = becp2(ib, sumip + 2 * nkb); - std::complex pol2becp2z = becp2(ib+1, sumip + 2 * nkb); - std::complex dij0 = this->ppcell->deeq_nc(0, iat, ip2, ip); - std::complex dij1 = this->ppcell->deeq_nc(1, iat, ip2, ip); - std::complex dij2 = this->ppcell->deeq_nc(2, iat, ip2, ip); - std::complex dij3 = this->ppcell->deeq_nc(3, iat, ip2, ip); - - ps1(sumip2, ib) += dij0 * pol1becp1 + dij1 * pol2becp1; - ps1(sumip2, ib+1) += dij2 * pol1becp1 + dij3 * pol2becp1; - ps2(sumip2, ib) += dij0 * pol1becp2x + dij1 * pol2becp2x; - ps2(sumip2, ib+1) += dij2 * pol1becp2x + dij3 * pol2becp2x; - ps2(sumip2 + nkb, ib) += dij0 * pol1becp2y + dij1 * pol2becp2y; - ps2(sumip2 + nkb, ib+1) += dij2 * pol1becp2y + dij3 * pol2becp2y; - ps2(sumip2 + 2*nkb, ib) += dij0 * pol1becp2z + dij1 * pol2becp2z; - ps2(sumip2 + 2*nkb, ib+1) += dij2 * pol1becp2z + dij3 * pol2becp2z; - } - } - } - sum += nproj; - ++iat; - } - } + ModuleBase::WARNING_QUIT("Velocity", "Velocity operator does not support the non-collinear case yet!"); + } + + if(std::is_same::value) + { + syncmem_complex_h2d_op()(ps1_, ps1_cpu, nkb * n_npwx); + syncmem_complex_h2d_op()(ps2_, ps2_cpu, nkb3 * n_npwx); } - if (n_npwx == 1) { int inc = 1; - for(int id = 0 ; id < 3 ; ++id) + for (int id = 0; id < 3; ++id) { int vkbshift = id * max_npw * nkb; int ps2shift = id * nkb; - int npwshift = id * max_npw ; - zgemv_(&transN, &npw, &nkb, - &ModuleBase::ONE, this->ppcell->gradvkb.ptr + vkbshift, &max_npw, ps1.c, &inc, - &ModuleBase::ONE, vpsi + npwshift, &inc); - zgemv_(&transN, &npw, &nkb, - &ModuleBase::ONE, this->ppcell->vkb.c, &max_npw, ps2.c + ps2shift, &inc, - &ModuleBase::ONE, vpsi + npwshift, &inc); + int npwshift = id * max_npw; + ModuleBase::gemv_op()('N', + npw, + nkb, + &one, + gradvkb_d + vkbshift, + max_npw, + ps1_, + inc, + &one, + vpsi + npwshift, + inc); + ModuleBase::gemv_op()('N', + npw, + nkb, + &one, + vkb_d, + max_npw, + ps2_ + ps2shift, + inc, + &one, + vpsi + npwshift, + inc); } } else { - for(int id = 0 ; id < 3 ; ++id) + for (int id = 0; id < 3; ++id) { int vkbshift = id * max_npw * nkb; int ps2shift = id * n_npwx * nkb; int npwshift = id * max_npw * n_npwx; - zgemm_(&transN, &transT, &npw, &npm, &nkb, - &ModuleBase::ONE, this->ppcell->gradvkb.ptr + vkbshift, &max_npw, ps1.c, &n_npwx, - &ModuleBase::ONE, vpsi + npwshift, &max_npw); - zgemm_(&transN, &transT, &npw, &npm, &nkb, - &ModuleBase::ONE, this->ppcell->vkb.c, &max_npw, ps2.c + ps2shift, &n_npwx, - &ModuleBase::ONE, vpsi + npwshift, &max_npw); + ModuleBase::gemm_op()('N', + 'T', + npw, + n_npwx, + nkb, + &one, + gradvkb_d + vkbshift, + max_npw, + ps1_, + n_npwx, + &one, + vpsi + npwshift, + max_npw); + ModuleBase::gemm_op()('N', + 'T', + npw, + n_npwx, + nkb, + &one, + vkb_d, + max_npw, + ps2_ + ps2shift, + n_npwx, + &one, + vpsi + npwshift, + max_npw); } } - - + delmem_complex_op()(ps1_); + delmem_complex_op()(ps2_); + delmem_complex_op()(becp1_); + delmem_complex_op()(becp2_); ModuleBase::timer::tick("Operator", "Velocity"); return; } +template class Velocity; +template class Velocity; +#if ((defined __CUDA) || (defined __ROCM)) +template class Velocity; +template class Velocity; +#endif -} \ No newline at end of file +} // namespace hamilt \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.h b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.h index a6752065b0..14ef0799e9 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.h +++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.h @@ -8,6 +8,7 @@ namespace hamilt { //velocity operator mv = im/\hbar * [H,r] = p + im/\hbar [V_NL, r] +template class Velocity { public: @@ -19,7 +20,7 @@ class Velocity const bool nonlocal_in = true ); - ~Velocity(){}; + ~Velocity(); void init(const int ik_in); @@ -27,16 +28,16 @@ class Velocity * @brief calculate \hat{v}|\psi> * * @param psi_in Psi class which contains some information - * @param n_npwx i = 1 : n_npwx + * @param n_npwx nbands * NPOL * @param tmpsi_in |\psi_i> size: n_npwx*npwx - * @param tmhpsi \hat{v}|\psi> size: 3*n_npwx*npwx - * @param add true : tmhpsi = tmhpsi + v|\psi> false: tmhpsi = v|\psi> + * @param tmvpsi \hat{v}|\psi> size: 3*n_npwx*npwx + * @param add true : tmvpsi = tmvpsi + v|\psi> false: tmvpsi = v|\psi> * */ - void act(const psi::Psi>* psi_in, + void act(const psi::Psi, Device>* psi_in, const int n_npwx, - const std::complex* tmpsi_in, - std::complex* tmhpsi, + const std::complex* tmpsi_in, + std::complex* tmvpsi, const bool add = false) const; bool nonlocal = true; @@ -53,6 +54,26 @@ class Velocity int ik=0; double tpiba=0.0; + + private: + FPTYPE* gx_ = nullptr; ///<[Device, npwx] x component of G+K + FPTYPE* gy_ = nullptr; ///<[Device, npwx] y component of G+K + FPTYPE* gz_ = nullptr; ///<[Device, npwx] z component of G+K + std::complex* vkb_ = nullptr; ///<[Device, nkb * npwk_max] nonlocal pseudopotential vkb + std::complex* gradvkb_ = nullptr; ///<[Device, 3*nkb * npwk_max] gradient of nonlocal pseudopotential gradvkb + FPTYPE* deeq_ = nullptr; ///<[Device] D matrix for nonlocal pseudopotential + + using Complex = std::complex; + using resmem_var_op = base_device::memory::resize_memory_op; + using delmem_var_op = base_device::memory::delete_memory_op; + using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op; + using castmem_var_h2d_op = base_device::memory::cast_memory_op; + using resmem_complex_op = base_device::memory::resize_memory_op, Device>; + using setmem_complex_op = base_device::memory::set_memory_op, Device>; + using delmem_complex_op = base_device::memory::delete_memory_op, Device>; + using castmem_complex_h2d_op = base_device::memory::cast_memory_op, std::complex, Device, base_device::DEVICE_CPU>; + using syncmem_complex_d2h_op = base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, Device>; + using syncmem_complex_h2d_op = base_device::memory::synchronize_memory_op, Device, base_device::DEVICE_CPU>; }; } #endif \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_elecond.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_elecond.cpp index fec8130f77..597c84041b 100644 --- a/source/module_hamilt_pw/hamilt_stodft/sto_elecond.cpp +++ b/source/module_hamilt_pw/hamilt_stodft/sto_elecond.cpp @@ -22,7 +22,7 @@ Sto_EleCond::Sto_EleCond(UnitCell* p_ucell_in, hamilt::Hamilt>* p_hamilt_in, StoChe& stoche, Stochastic_WF, base_device::DEVICE_CPU>* p_stowf_in) - : EleCond(p_ucell_in, p_kv_in, p_elec_in, p_wfcpw_in, p_psi_in, p_ppcell_in) + : EleCond(p_ucell_in, p_kv_in, p_elec_in, p_wfcpw_in, p_psi_in, p_ppcell_in) { this->p_hamilt = p_hamilt_in; this->p_hamilt_sto = static_cast>*>(p_hamilt_in); @@ -149,7 +149,7 @@ void Sto_EleCond::cal_jmatrix(const psi::Psi>& kspsi_all, const int& bsize_psi, std::vector>& j1, std::vector>& j2, - hamilt::Velocity& velop, + hamilt::Velocity& velop, const int& ik, const std::complex& factor, const int bandinfo[6]) @@ -580,7 +580,7 @@ void Sto_EleCond::sKG(const int& smear_type, // ik loop ModuleBase::timer::tick("Sto_EleCond", "kloop"); - hamilt::Velocity velop(p_wfcpw, p_kv->isk.data(), p_ppcell, p_ucell, nonlocal); + hamilt::Velocity velop(p_wfcpw, p_kv->isk.data(), p_ppcell, p_ucell, nonlocal); for (int ik = 0; ik < nk; ++ik) { velop.init(ik); diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_elecond.h b/source/module_hamilt_pw/hamilt_stodft/sto_elecond.h index 77dccff792..4cb55ae328 100644 --- a/source/module_hamilt_pw/hamilt_stodft/sto_elecond.h +++ b/source/module_hamilt_pw/hamilt_stodft/sto_elecond.h @@ -6,7 +6,7 @@ #include "module_hamilt_pw/hamilt_stodft/sto_wf.h" #include "module_hsolver/hsolver_pw_sdft.h" -class Sto_EleCond : protected EleCond +class Sto_EleCond : protected EleCond { public: Sto_EleCond(UnitCell* p_ucell_in, @@ -90,7 +90,7 @@ class Sto_EleCond : protected EleCond const int& bsize_psi, std::vector>& j1, std::vector>& j2, - hamilt::Velocity& velop, + hamilt::Velocity& velop, const int& ik, const std::complex& factor, const int bandinfo[6]); diff --git a/tests/integrate/186_PW_KG_100_GPU/INPUT b/tests/integrate/186_PW_KG_100_GPU/INPUT new file mode 100644 index 0000000000..1d34d5335d --- /dev/null +++ b/tests/integrate/186_PW_KG_100_GPU/INPUT @@ -0,0 +1,37 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf +esolver_type ksdft + +nbands 100 +pseudo_dir ../../PP_ORB +symmetry 1 +kpar 2 +device gpu + +#Parameters (2.Iteration) +ecutwfc 20 +scf_thr 1e-6 +scf_nmax 50 + + +#Parameters (3.Basis) +basis_type pw + +#Parameters (4.Smearing) +smearing_method fd +smearing_sigma 0.6 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.4 +mixing_gg0 0.0 + +cal_cond 1 +cond_fwhm 8 +cond_wcut 10 +cond_dw 1 +cond_dt 0.237464 +cond_dtbatch 1 +cond_nonlocal 1 diff --git a/tests/integrate/186_PW_KG_100_GPU/KPT b/tests/integrate/186_PW_KG_100_GPU/KPT new file mode 100644 index 0000000000..4fd38968a0 --- /dev/null +++ b/tests/integrate/186_PW_KG_100_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +2 2 1 0 0 0 diff --git a/tests/integrate/186_PW_KG_100_GPU/README b/tests/integrate/186_PW_KG_100_GPU/README new file mode 100644 index 0000000000..033d794f7f --- /dev/null +++ b/tests/integrate/186_PW_KG_100_GPU/README @@ -0,0 +1,9 @@ +This test for: +*KSDFT +*Al +*kpoints 2*2*1 +*100 KS +*kpar 2 +*bndpar 1 +*device gpu +*cal_cond 1 diff --git a/tests/integrate/186_PW_KG_100_GPU/STRU b/tests/integrate/186_PW_KG_100_GPU/STRU new file mode 100644 index 0000000000..0dbc74b343 --- /dev/null +++ b/tests/integrate/186_PW_KG_100_GPU/STRU @@ -0,0 +1,18 @@ +ATOMIC_SPECIES +Si 14 Si.pz-vbc.UPF + +LATTICE_CONSTANT +5 // add lattice constant + +LATTICE_VECTORS +1 0 0 +0 1 0 +0 0 1 + +ATOMIC_POSITIONS +Direct + +Si // Element type +0.0 // magnetism +1 +0.00 0.00 0.00 1 1 1 diff --git a/tests/integrate/186_PW_KG_100_GPU/refOnsager.txt b/tests/integrate/186_PW_KG_100_GPU/refOnsager.txt new file mode 100644 index 0000000000..4f850b65d2 --- /dev/null +++ b/tests/integrate/186_PW_KG_100_GPU/refOnsager.txt @@ -0,0 +1,11 @@ +## w(eV) sigma(Sm^-1) kappa(W(mK)^-1) L12/e(Am^-1) L22/e^2(Wm^-1) + 0.5 356691 626.131 -7.13544e+06 2.02056e+08 + 1.5 348656 606.514 -7.03014e+06 1.99209e+08 + 2.5 333032 569.396 -6.81899e+06 1.93563e+08 + 3.5 310656 518.557 -6.50176e+06 1.85201e+08 + 4.5 282683 458.689 -6.08031e+06 1.74236e+08 + 5.5 250512 394.607 -5.56125e+06 1.60839e+08 + 6.5 215751 330.615 -4.95931e+06 1.45316e+08 + 7.5 180190 270.122 -4.3001e+06 1.28208e+08 + 8.5 145738 215.497 -3.62025e+06 1.10345e+08 + 9.5 114241 168.083 -2.96321e+06 9.27832e+07 diff --git a/tests/integrate/186_PW_KG_100_GPU/result.ref b/tests/integrate/186_PW_KG_100_GPU/result.ref new file mode 100644 index 0000000000..13c9a5365f --- /dev/null +++ b/tests/integrate/186_PW_KG_100_GPU/result.ref @@ -0,0 +1,7 @@ +etotref -150.4802165653093 +etotperatomref -150.4802165653 +CompareH_Failed 0 +pointgroupref O_h +spacegroupref O_h +nksibzref 3 +totaltimeref 5.10 diff --git a/tests/integrate/CASES_GPU.txt b/tests/integrate/CASES_GPU.txt index c41f06d83b..b034e955e2 100644 --- a/tests/integrate/CASES_GPU.txt +++ b/tests/integrate/CASES_GPU.txt @@ -2,6 +2,7 @@ 102_PW_DA_davidson_GPU 102_PW_BPCG_GPU 107_PW_outWfcPw_GPU +186_PW_KG_100_GPU 187_PW_SDFT_ALL_GPU 187_PW_SDFT_MALL_GPU 187_PW_SDFT_MALL_BPCG_GPU