jieli-matrix
diff --git a/‎source/module_base/kernels/math_kernel_op.h‎
Lines changed: 62 additions & 0 deletions b/‎source/module_base/kernels/math_kernel_op.h‎
Lines changed: 62 additions & 0 deletions
diff --git a/‎source/module_base/module_device/device.cpp‎
Lines changed: 7 additions & 2 deletions b/‎source/module_base/module_device/device.cpp‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎source/module_esolver/esolver.cpp‎
Lines changed: 4 additions & 2 deletions b/‎source/module_esolver/esolver.cpp‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎source/module_esolver/esolver_ks_pw.cpp‎
Lines changed: 2 additions & 1 deletion b/‎source/module_esolver/esolver_ks_pw.cpp‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎source/module_hsolver/diago_dav_subspace.cpp‎
Lines changed: 67 additions & 51 deletions b/‎source/module_hsolver/diago_dav_subspace.cpp‎
Lines changed: 67 additions & 51 deletions
@@ -284,6 +284,48 @@ template <typename T, typename Device> struct matrixCopy {
   void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB);
 };
 
+template <typename T, typename Device> // T: element type of the vectors; Device: type used to select the implementation (a DEVICE_GPU specialization is declared later in this diff)
+struct apply_eigenvalues_op { // Functor that applies a list of eigenvalue factors to a block of vectors; only the declaration is visible in this hunk
+    using Real = typename GetTypeReal<T>::type; // presumably the real-valued scalar type matching T - TODO confirm against GetTypeReal
+
+    void operator()(const Device *d, const int &nbase, const int &nbase_x, const int &notconv, // NOTE(review): cal_grad in this diff calls this as (ctx, nbase, nbase_x, notconv, vcc, vcc, e_temp_hd) in place of a loop of vector_mul_vector_op over vcc + m * nbase_x for m < notconv
+                    T *result, const T *vectors, const Real *eigenvalues); // result and vectors are the same buffer at that call site - TODO confirm the implementation supports in-place use; the caller supplies notconv eigenvalue entries
+};
+
+template <typename T, typename Device> // T: element type of psi_iter; Device: type used to select the implementation
+struct precondition_op { // Functor that applies the preconditioner to a block of vectors; only the declaration is visible in this hunk
+    using Real = typename GetTypeReal<T>::type; // presumably the real-valued scalar type matching T - TODO confirm against GetTypeReal
+    void operator()(const Device* d, // NOTE(review): replaces the removed host loop in cal_grad that divided each vector by 0.5 * (1 + x + sqrt(1 + (x - 1)^2)), x = |precondition[i] - eigenvalue[m]| - confirm the implementation keeps that formula
+                   const int& dim, // length of one vector; the removed loop ran i < dim
+                   T* psi_iter, // base pointer; the removed loop updated psi_iter + (nbase + m) * dim in place
+                   const int& nbase, // index of the first vector to update in the removed loop
+                   const int& notconv, // number of vectors to update; the removed loop ran m < notconv
+                   const Real* precondition,  // the CPU call site passes this->precondition.data(), the GPU call site passes d_precondition
+                   const Real* eigenvalues); // the GPU call site uploads notconv values from *eigenvalue_iter before the call
+};
+
+template <typename T, typename Device> // T: element type of psi_iter; Device: type used to select the implementation
+struct normalize_op { // Functor that normalizes a block of vectors; only the declaration is visible in this hunk
+    using Real = typename GetTypeReal<T>::type; // presumably the real-valued scalar type matching T - TODO confirm against GetTypeReal
+    void operator()(const Device* d, // NOTE(review): replaces the removed per-vector sequence dot_real_op -> sqrt -> vector_mul_real_op(1 / norm) in cal_grad - confirm the removed assert(psi_norm > 0.0) is still enforced somewhere
+                   const int& dim, // length of one vector in the removed loop
+                   T* psi_iter, // base pointer; the removed loop scaled psi_iter + (nbase + i) * dim in place
+                   const int& nbase, // index of the first vector to normalize in the removed loop
+                   const int& notconv, // number of vectors to normalize; the removed loop ran i < notconv
+                   Real* psi_norm = nullptr); // optional buffer, defaults to nullptr; the CPU call site in this diff passes a null pointer
+};
+
+template <typename T> // NOTE(review): this specialization is declared before the "#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM" guard, unlike the other DEVICE_GPU specializations added in this diff - confirm that is intended
+struct normalize_op<T, base_device::DEVICE_GPU> { // Partial specialization of normalize_op for base_device::DEVICE_GPU; declaration only
+    using Real = typename GetTypeReal<T>::type; // presumably the real-valued scalar type matching T - TODO confirm against GetTypeReal
+    void operator()(const base_device::DEVICE_GPU* d, // same parameter list as the primary template, with Device fixed to DEVICE_GPU
+                   const int& dim, // length of one vector
+                   T* psi_iter, // base pointer of the vector block; the GPU call site passes psi_iter
+                   const int& nbase, // index of the first vector to normalize
+                   const int& notconv, // number of vectors to normalize
+                   Real* psi_norm); // no default here, unlike the primary template; the GPU call site passes a buffer of notconv Reals set to 0.0 via set_memory_op
+};
+
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 // Partially specialize functor for base_device::GpuDevice.
 template <typename T> struct dot_real_op<T, base_device::DEVICE_GPU> {
@@ -334,6 +376,26 @@ template <typename T> struct matrixCopy<T, base_device::DEVICE_GPU> {
 void createGpuBlasHandle();
 void destoryBLAShandle();
 
+// vector operator: result[i] = -lambda[i] * vector[i]. NOTE(review): the cal_grad call site in this diff already negates on the host (e_temp_cpu[m] = -(*eigenvalue_iter)[m]) and passes one value per column m - confirm the kernel does not negate a second time and that the index above refers to the column
+template <typename T> struct apply_eigenvalues_op<T, base_device::DEVICE_GPU> { // Partial specialization for base_device::DEVICE_GPU; declaration only, inside the CUDA/ROCM guard
+    using Real = typename GetTypeReal<T>::type; // presumably the real-valued scalar type matching T - TODO confirm against GetTypeReal
+
+    void operator()(const base_device::DEVICE_GPU *d, const int &nbase, const int &nbase_x, const int &notconv, // same parameter list as the primary template, with Device fixed to DEVICE_GPU
+                    T *result, const T *vectors, const Real *eigenvalues); // the GPU call site passes a buffer of notconv Reals filled through syncmem_var_h2d_op as eigenvalues
+};
+
+template <typename T> // T: element type of psi_iter
+struct precondition_op<T, base_device::DEVICE_GPU> { // Partial specialization of precondition_op for base_device::DEVICE_GPU; declaration only, inside the CUDA/ROCM guard
+    using Real = typename GetTypeReal<T>::type; // presumably the real-valued scalar type matching T - TODO confirm against GetTypeReal
+    void operator()(const base_device::DEVICE_GPU* d, // same parameter list as the primary template, with Device fixed to DEVICE_GPU
+                   const int& dim, // length of one vector
+                   T* psi_iter, // base pointer of the vector block; the GPU call site passes psi_iter
+                   const int& nbase, // index of the first vector to update
+                   const int& notconv, // number of vectors to update
+                   const Real* precondition, // the GPU call site passes d_precondition - NOTE(review): the removed code refilled d_precondition per vector with the derived divisor; confirm it now holds the raw preconditioner values
+                   const Real* eigenvalues); // the GPU call site passes a buffer of notconv Reals copied from *eigenvalue_iter
+};
+
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 } // namespace hsolver
 
 
@@ -5,7 +5,7 @@
 
 #include <base/macros/macros.h>
 #include <cstring>
-
+#include <iostream>
 #ifdef __MPI
 #include "mpi.h"
 #endif
@@ -166,6 +166,11 @@ int device_count = -1;
 cudaGetDeviceCount(&device_count);
 #elif defined(__ROCM)
 hipGetDeviceCount(&device_count);
+/***auto start_time = std::chrono::high_resolution_clock::now();
+std::cout << "Starting hipGetDeviceCount.." << std::endl;
+auto end_time = std::chrono::high_resolution_clock::now();
+auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+std::cout << "hipGetDeviceCount took " << duration.count() << "seconds" << std::endl;***/
 #endif
 if (device_count <= 0)
 {
@@ -711,4 +716,4 @@ void record_device_memory<base_device::DEVICE_GPU>(
 #endif
 
 } // end of namespace information
-} // end of namespace base_device
+} // end of namespace base_device
@@ -114,8 +114,10 @@ std::string determine_type()
     }
 
     GlobalV::ofs_running << "\n RUNNING WITH DEVICE  : " << device_info << " / "
-                         << base_device::information::get_device_info(PARAM.inp.device) << std::endl;
-
+                         << base_device::information::get_device_info(PARAM.inp.device) << std::endl; 
+    /***auto end_time = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+    std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/
     return esolver_type;
 }
 
 
@@ -572,7 +572,8 @@ void ESolver_KS_PW<T, Device>::hamilt2rho_single(UnitCell& ucell,
                                                      hsolver::DiagoIterAssist<T, Device>::SCF_ITER,
                                                      hsolver::DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
                                                      hsolver::DiagoIterAssist<T, Device>::PW_DIAG_THR,
-                                                     hsolver::DiagoIterAssist<T, Device>::need_subspace);
+                                                     hsolver::DiagoIterAssist<T, Device>::need_subspace,
+                                                     PARAM.inp.use_k_continuity);
 
         hsolver_pw_obj.solve(this->p_hamilt,
                              this->kspw_psi[0],
 
@@ -293,26 +293,28 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
                          psi_iter + (nbase) * this->dim,
                          this->dim);
 
-    std::vector<Real> e_temp_cpu(nbase, 0);
+    // Eigenvalues operation section
+    std::vector<Real> e_temp_cpu(this->notconv, 0);
     Real* e_temp_hd = e_temp_cpu.data();
-    if(this->device == base_device::GpuDevice)
+    if (this->device == base_device::GpuDevice)
     {
         e_temp_hd = nullptr;
-        resmem_real_op()(e_temp_hd, nbase);
+        resmem_real_op()(this->ctx, e_temp_hd, this->notconv);
     }
-    for (int m = 0; m < notconv; m++)
+
+    for (int m = 0; m < this->notconv; m++)
     {
-        e_temp_cpu.assign(nbase, (-1.0 * (*eigenvalue_iter)[m]));
-        if (this->device == base_device::GpuDevice)
-        {
-            syncmem_var_h2d_op()(e_temp_hd, e_temp_cpu.data(), nbase);
-        }
-        ModuleBase::vector_mul_vector_op<T, Device>()(nbase,
-                                                      vcc + m * this->nbase_x,
-                                                      vcc + m * this->nbase_x,
-                                                      e_temp_hd);
+        e_temp_cpu[m] = -(*eigenvalue_iter)[m];
     }
-    if(this->device == base_device::GpuDevice)
+
+    if (this->device == base_device::GpuDevice)
+    {
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), this->notconv);
+    }
+    
+    apply_eigenvalues_op<T, Device>()(this->ctx, nbase, this->nbase_x, this->notconv, this->vcc, this->vcc, e_temp_hd);
+
+    if (this->device == base_device::GpuDevice)
     {
         delmem_real_op()(e_temp_hd);
     }
@@ -336,48 +338,62 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
          psi_iter + nbase * this->dim,
          this->dim);
 
-    // "precondition!!!"
-    std::vector<Real> pre(this->dim, 0.0);
-    for (int m = 0; m < notconv; m++)
-    {
-        for (size_t i = 0; i < this->dim; i++)
-        {
-            // pre[i] = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]);
-            double x = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]);
-            pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
-        }
+    // Precondition section
 #if defined(__CUDA) || defined(__ROCM)
-        if (this->device == base_device::GpuDevice)
-        {
-            syncmem_var_h2d_op()(this->d_precondition, pre.data(), this->dim);
-            ModuleBase::vector_div_vector_op<T, Device>()(this->dim,
-                                                          psi_iter + (nbase + m) * this->dim,
-                                                          psi_iter + (nbase + m) * this->dim,
-                                                          this->d_precondition);
-        }
-        else
+    if (this->device == base_device::GpuDevice)
+    {
+        Real* eigenvalues_gpu = nullptr;
+        resmem_real_op()(this->ctx, eigenvalues_gpu, notconv);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, eigenvalues_gpu,(*eigenvalue_iter).data(), notconv);
+        
+        precondition_op<T, Device>()(this->ctx,
+                                    this->dim,
+                                    psi_iter,
+                                    nbase,
+                                    notconv,
+                                    d_precondition,
+                                    eigenvalues_gpu);
+        delmem_real_op()(this->ctx, eigenvalues_gpu);
+    }
+    else
 #endif
-        {
-            ModuleBase::vector_div_vector_op<T, Device>()(this->dim,
-                                                          psi_iter + (nbase + m) * this->dim,
-                                                          psi_iter + (nbase + m) * this->dim,
-                                                          pre.data());
-        }
+    {
+        precondition_op<T, Device>()(this->ctx,
+                                    this->dim,
+                                    psi_iter,
+                                    nbase,
+                                    notconv,
+                                    this->precondition.data(),
+                                    (*eigenvalue_iter).data());
     }
 
-    // "normalize!!!" in order to improve numerical stability of subspace diagonalization
-    for (size_t i = 0; i < notconv; i++)
+    // Normalize section
+#if defined(__CUDA) || defined(__ROCM)
+    if (this->device == base_device::GpuDevice)
+    {
+        Real* psi_norm = nullptr;
+        resmem_real_op()(this->ctx, psi_norm, notconv);
+        using setmem_real_op = base_device::memory::set_memory_op<Real, Device>;
+        setmem_real_op()(this->ctx, psi_norm, 0.0, notconv);
+        
+        normalize_op<T, Device>()(this->ctx,
+                                this->dim,
+                                psi_iter,
+                                nbase,
+                                notconv,
+                                psi_norm);
+        delmem_real_op()(this->ctx, psi_norm);
+    }
+    else
+#endif
     {
-        Real psi_norm = ModuleBase::dot_real_op<T, Device>()(this->dim,
-                                                           psi_iter + (nbase + i) * this->dim,
-                                                           psi_iter + (nbase + i) * this->dim,
-                                                           true);
-        assert(psi_norm > 0.0);
-        psi_norm = sqrt(psi_norm);
-        ModuleBase::vector_mul_real_op<T, Device>()(this->dim,
-                                                       psi_iter + (nbase + i) * this->dim,
-                                                       psi_iter + (nbase + i) * this->dim,
-                                                       Real(1.0 / psi_norm));
+        Real* psi_norm = nullptr;
+        normalize_op<T, Device>()(this->ctx,
+                                this->dim,
+                                psi_iter,
+                                nbase,
+                                notconv,
+                                psi_norm);
     }
 
     // update hpsi[:, nbase:nbase+notconv]
Original file line number	Diff line number	Diff line change
`@@ -114,8 +114,10 @@ std::string determine_type()`
`114`	`114`	`}`
`115`	`115`
`116`	`116`	`GlobalV::ofs_running << "\n RUNNING WITH DEVICE : " << device_info << " / "`
`117`		`- << base_device::information::get_device_info(PARAM.inp.device) << std::endl;`
`118`		`-`
	`117`	`+ << base_device::information::get_device_info(PARAM.inp.device) << std::endl;`
	`118`	`+ /***auto end_time = std::chrono::high_resolution_clock::now();`
	`119`	`+ auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);`
	`120`	`+ std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/`
`119`	`121`	`return esolver_type;`
`120`	`122`	`}`
`121`	`123`