deepmodeling
diff --git a/‎source/module_hamilt_lcao/module_gint/gint.cpp‎
Lines changed: 3 additions & 3 deletions b/‎source/module_hamilt_lcao/module_gint/gint.cpp‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎source/module_hamilt_lcao/module_gint/gint_force_gpu.cu‎
Lines changed: 3 additions & 2 deletions b/‎source/module_hamilt_lcao/module_gint/gint_force_gpu.cu‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎source/module_hamilt_lcao/module_gint/gint_force.h‎ renamed to ‎source/module_hamilt_lcao/module_gint/gint_force_gpu.h‎
Lines changed: 2 additions & 2 deletions b/‎source/module_hamilt_lcao/module_gint/gint_force.h‎ renamed to ‎source/module_hamilt_lcao/module_gint/gint_force_gpu.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu‎
Lines changed: 17 additions & 46 deletions b/‎source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu‎
Lines changed: 17 additions & 46 deletions
diff --git a/‎source/module_hamilt_lcao/module_gint/gint_rho.h‎ renamed to ‎source/module_hamilt_lcao/module_gint/gint_rho_gpu.h‎
Lines changed: 1 addition & 10 deletions b/‎source/module_hamilt_lcao/module_gint/gint_rho.h‎ renamed to ‎source/module_hamilt_lcao/module_gint/gint_rho_gpu.h‎
Lines changed: 1 addition & 10 deletions
diff --git a/‎source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu‎
Lines changed: 3 additions & 3 deletions b/‎source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎source/module_hamilt_lcao/module_gint/gint_vl.h‎ renamed to ‎source/module_hamilt_lcao/module_gint/gint_vl_gpu.h‎
Lines changed: 2 additions & 4 deletions b/‎source/module_hamilt_lcao/module_gint/gint_vl.h‎ renamed to ‎source/module_hamilt_lcao/module_gint/gint_vl_gpu.h‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎source/module_hamilt_lcao/module_gint/grid_technique.cpp‎
Lines changed: 5 additions & 19 deletions b/‎source/module_hamilt_lcao/module_gint/grid_technique.cpp‎
Lines changed: 5 additions & 19 deletions
diff --git a/‎source/module_hamilt_lcao/module_gint/grid_technique.h‎
Lines changed: 0 additions & 6 deletions b/‎source/module_hamilt_lcao/module_gint/grid_technique.h‎
Lines changed: 0 additions & 6 deletions
diff --git a/‎source/module_hamilt_lcao/module_gint/gtask_force.cpp‎
Lines changed: 1 addition & 2 deletions b/‎source/module_hamilt_lcao/module_gint/gtask_force.cpp‎
Lines changed: 1 addition & 2 deletions
@@ -1,9 +1,9 @@
 #include "gint.h"
 
 #if ((defined __CUDA))
-#include "gint_force.h"
-#include "gint_rho.h"
-#include "gint_vl.h"
+#include "gint_force_gpu.h"
+#include "gint_rho_gpu.h"
+#include "gint_vl_gpu.h"
 #endif
 
 #include "module_base/memory.h"
 
@@ -3,11 +3,12 @@
 #include <fstream>
 #include <sstream>
 
-#include "gint_force.h"
+#include "gint_force_gpu.h"
 #include "kernels/cuda/cuda_tools.cuh"
 #include "kernels/cuda/gint_force.cuh"
 #include "module_base/ylm.h"
-#include "module_hamilt_lcao/module_gint/gint_tools.h"
+#include "gint_tools.h"
+
 namespace GintKernel
 {
 
 
@@ -1,5 +1,5 @@
-#ifndef GINT_FORCE_H
-#define GINT_FORCE_H
+#ifndef GINT_FORCE_GPU_H
+#define GINT_FORCE_GPU_H
 
 #include "module_hamilt_lcao/module_gint/gint.h"
 #include "module_hamilt_lcao/module_gint/grid_technique.h"
 
@@ -1,8 +1,8 @@
 #include "kernels/cuda/cuda_tools.cuh"
 #include "module_base/ylm.h"
-#include "module_hamilt_lcao/module_gint/gint_rho.h"
-#include "module_hamilt_lcao/module_gint/gint_tools.h"
-#include "module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cuh"
+#include "gint_rho_gpu.h"
+#include "gint_tools.h"
+#include "kernels/cuda/gint_rho.cuh"
 #include "omp.h"
 
 #include <omp.h>
@@ -68,7 +68,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
     {
         checkCuda(cudaStreamSynchronize(gridt.streams[i]));
     }
-
     // calculate the rho for every nbz bigcells
 
 #pragma omp parallel for num_threads(gridt.nstreams) collapse(2)
@@ -78,6 +77,7 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
         {
             // get stream id
             int stream_num = omp_get_thread_num();
+            checkCuda(cudaStreamSynchronize(gridt.streams[stream_num]));
 
             // psi_input contains data used to generate the psi values.
             // The suffix "_g" indicates that the data is stored in the GPU,
@@ -154,16 +154,9 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
             double* rho_g = gridt.rho_g;
 
             // variables for dot product psir * psir_dm
-            int dot_count = 0;
-            int* vec_len = &gridt.vec_len[gridt.num_mcell * stream_num];
-            double** vec_l = &gridt.vec_l[gridt.num_mcell * stream_num];
-            double** vec_r = &gridt.vec_r[gridt.num_mcell * stream_num];
             double** dot_product
                 = &gridt.dot_product[gridt.num_mcell * stream_num];
 
-            int* vec_len_g = &gridt.vec_len_g[gridt.num_mcell * stream_num];
-            double** vec_l_g = &gridt.vec_l_g[gridt.num_mcell * stream_num];
-            double** vec_r_g = &gridt.vec_r_g[gridt.num_mcell * stream_num];
             double** dot_product_g
                 = &gridt.dot_product_g[gridt.num_mcell * stream_num];
 
@@ -172,7 +165,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
             int atom_pair_num = 0;
             const int grid_index_ij = i * gridt.nby * gridt.nbzp + j * gridt.nbzp;
             std::vector<bool> gpu_matrix_cal_flag(max_size * gridt.nbzp,false);
-            checkCuda(cudaStreamSynchronize(gridt.streams[stream_num]));
 
             // generate GPU tasks, including the calculation of psir, matrix
             // multiplication, and dot product
@@ -211,11 +203,7 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
                             max_n,
                             atom_pair_num,
                             rho_g,
-                            vec_l,
-                            vec_r,
-                            dot_product,
-                            vec_len,
-                            dot_count);
+                            dot_product);
 
             // Copying data from host to device
             checkCuda(cudaMemcpyAsync(input_double_g,
@@ -286,21 +274,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
                                       cudaMemcpyHostToDevice,
                                       gridt.streams[stream_num]));
 
-            checkCuda(cudaMemcpyAsync(vec_len_g,
-                                      vec_len,
-                                      gridt.num_mcell * sizeof(int),
-                                      cudaMemcpyHostToDevice,
-                                      gridt.streams[stream_num]));
-            checkCuda(cudaMemcpyAsync(vec_l_g,
-                                      vec_l,
-                                      gridt.num_mcell * sizeof(double*),
-                                      cudaMemcpyHostToDevice,
-                                      gridt.streams[stream_num]));
-            checkCuda(cudaMemcpyAsync(vec_r_g,
-                                      vec_r,
-                                      gridt.num_mcell * sizeof(double*),
-                                      cudaMemcpyHostToDevice,
-                                      gridt.streams[stream_num]));
             checkCuda(cudaMemcpyAsync(dot_product_g,
                                       dot_product,
                                       gridt.num_mcell * sizeof(double*),
@@ -352,20 +325,20 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
                                      atom_pair_num,
                                      gridt.streams[stream_num],
                                      ap_alpha_g);
+            checkCudaLastError();
 
             // Launching kernel to calculate dot product psir * psir_dm
-            dim3 grid_dot(64);
-            dim3 block_dot(64);
-            int incx = 1;
-            int incy = 1;
-            psir_dot<<<grid_dot, block_dot, 0, gridt.streams[stream_num]>>>(
-                vec_len_g,
-                vec_l_g,
-                incx,
-                vec_r_g,
-                incy,
-                dot_product_g,
-                dot_count);
+            const int block_size = 128;
+            dim3 block_dot(block_size);
+            dim3 grid_dot(gridt.nbzp, gridt.bxyz);
+            psir_dot<<<grid_dot, block_dot, sizeof(double) * block_size, gridt.streams[stream_num]>>>(
+                gridt.nbzp,
+                gridt.bxyz,
+                max_size * ucell.nwmax,
+                psir_ylm_left_g,
+                psir_r_g,
+                dot_product_g);
+            checkCudaLastError();
         }
     }
 
@@ -374,13 +347,11 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
     {
         checkCuda(cudaStreamSynchronize(gridt.streams[i]));
     }
-
     // Copy rho from device to host
     checkCuda(cudaMemcpy(rho,
                          gridt.rho_g,
                          nczp * gridt.ncx * gridt.ncy * sizeof(double),
                          cudaMemcpyDeviceToHost));
-
     // free the memory
     checkCuda(cudaFree(dm_matrix_g));
 }
 
@@ -7,7 +7,6 @@
 #include "module_hamilt_lcao/module_gint/gint.h"
 #include "module_hamilt_lcao/module_gint/grid_technique.h"
 
-cudaError_t checkCuda(cudaError_t result);
 namespace GintKernel
 {
 
@@ -90,11 +89,7 @@ void gtask_rho(const Grid_Technique& gridt,
  * @param max_n Maximum value of n.
  * @param atom_pair_num Total count of atom pairs, which is also the number of matrix multiplication operations.
  * @param rho_g Rho.
- * @param vec_l Pointers to psir_ylm for vector dot product.
- * @param vec_r Pointers to psir_dm for vector dot product.
  * @param dot_product Pointers to the results of dot products.
- * @param vec_len Vector lengths for each dot product.
- * @param dot_count Total count of dot products.
  */
 void alloc_mult_dot_rho(const Grid_Technique& gridt,
                         const UnitCell& ucell,
@@ -120,11 +115,7 @@ void alloc_mult_dot_rho(const Grid_Technique& gridt,
                         int& max_n,
                         int& atom_pair_num,
                         double* rho_g,
-                        double** vec_l,
-                        double** vec_r,
-                        double** dot_product,
-                        int* vec_len,
-                        int& dot_count);
+                        double** dot_product);
 
 } // namespace GintKernel
 #endif
@@ -2,9 +2,9 @@
 
 #include "kernels/cuda/cuda_tools.cuh"
 #include "module_base/ylm.h"
-#include "module_hamilt_lcao/module_gint/gint_tools.h"
-#include "module_hamilt_lcao/module_gint/gint_vl.h"
-#include "module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cuh"
+#include "gint_tools.h"
+#include "gint_vl_gpu.h"
+#include "kernels/cuda/gint_vl.cuh"
 
 namespace GintKernel
 {
 
@@ -1,14 +1,12 @@
-#ifndef GINT_VL_H
-#define GINT_VL_H
+#ifndef GINT_VL_GPU_H
+#define GINT_VL_GPU_H
 #include <cublas_v2.h>
 #include <cuda.h> // for CUDA_VERSION
 #include <cuda_runtime.h>
 
 #include "module_hamilt_lcao/module_gint/gint.h"
 #include "module_hamilt_lcao/module_gint/grid_technique.h"
 
-cudaError_t checkCuda(cudaError_t result);
-
 namespace GintKernel
 {
 
 
@@ -185,7 +185,7 @@ void Grid_Technique::set_pbc_grid(const int& ncx_in,
 #if ((defined __CUDA) /* || (defined __ROCM) */)
     if(GlobalV::device_flag == "gpu")
     {
-        this->init_gpu_gint_variables(ucell,orb,num_stream);
+        this->init_gpu_gint_variables(ucell, orb, num_stream);
     }
 #endif
 
@@ -923,22 +923,11 @@ void Grid_Technique::init_gpu_gint_variables(const UnitCell& ucell,const LCAO_Or
     num_mcell = nbzp * bxyz;
     checkCudaErrors(cudaMalloc((void**)&rho_g, this->ncxyz * sizeof(double)));
     checkCudaErrors(cudaMemset(rho_g, 0, this->ncxyz * sizeof(double)));
-    checkCudaErrors(
-        cudaMallocHost((void**)&vec_l, num_mcell * nstreams * sizeof(double*)));
-    checkCudaErrors(
-        cudaMalloc((void**)&vec_l_g, num_mcell * nstreams * sizeof(double*)));
-    checkCudaErrors(
-        cudaMallocHost((void**)&vec_r, num_mcell * nstreams * sizeof(double*)));
-    checkCudaErrors(
-        cudaMalloc((void**)&vec_r_g, num_mcell * nstreams * sizeof(double*)));
     checkCudaErrors(cudaMallocHost((void**)&dot_product,
                                    num_mcell * nstreams * sizeof(double*)));
     checkCudaErrors(cudaMalloc((void**)&dot_product_g,
                                num_mcell * nstreams * sizeof(double*)));
-    checkCudaErrors(
-        cudaMallocHost((void**)&vec_len, num_mcell * nstreams * sizeof(int)));
-    checkCudaErrors(
-        cudaMalloc((void**)&vec_len_g, num_mcell * nstreams * sizeof(int)));
+
 
     for (int i = 0; i < nstreams; ++i)
     {
@@ -961,7 +950,10 @@ void Grid_Technique::free_gpu_gint_variables(int nat)
         return;
     }
     for (int i = 0; i < nstreams; ++i)
+    {
         checkCudaErrors(cudaStreamDestroy(streams[i]));
+    }
+    delete[] streams;
 
     checkCudaErrors(cudaFree(ylmcoef_g));
     checkCudaErrors(cudaFree(atom_nwl_g));
@@ -1020,14 +1012,8 @@ void Grid_Technique::free_gpu_gint_variables(int nat)
     checkCudaErrors(cudaFree(dm_global_g));
     checkCudaErrors(cudaFree(ap_output_gbl_g));
 
-    checkCudaErrors(cudaFreeHost(vec_len));
-    checkCudaErrors(cudaFreeHost(vec_l));
-    checkCudaErrors(cudaFreeHost(vec_r));
     checkCudaErrors(cudaFreeHost(dot_product));
 
-    checkCudaErrors(cudaFree(vec_len_g));
-    checkCudaErrors(cudaFree(vec_l_g));
-    checkCudaErrors(cudaFree(vec_r_g));
     checkCudaErrors(cudaFree(dot_product_g));
     checkCudaErrors(cudaFree(rho_g));
 
 
@@ -218,12 +218,6 @@ class Grid_Technique : public Grid_MeshBall
     // additional variables for rho calculating
     int num_mcell;
     double* rho_g;
-    int* vec_len;
-    int* vec_len_g;
-    double** vec_l;
-    double** vec_l_g;
-    double** vec_r;
-    double** vec_r_g;
     double** dot_product;
     double** dot_product_g;
 
 
@@ -1,6 +1,6 @@
 #include <omp.h>
 
-#include "gint_force.h"
+#include "gint_force_gpu.h"
 #include "module_base/ylm.h"
 #include "module_hamilt_lcao/module_gint/gint_tools.h"
 namespace GintKernel
@@ -213,7 +213,6 @@ void alloc_mult_force(const Grid_Technique& gridt,
         }
     }
     atom_pair_num = tid;
-    gpu_mat_cal_flag.clear();
 }
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`#include <omp.h>`
`2`	`2`
`3`		`-#include "gint_force.h"`
	`3`	`+#include "gint_force_gpu.h"`
`4`	`4`	`#include "module_base/ylm.h"`
`5`	`5`	`#include "module_hamilt_lcao/module_gint/gint_tools.h"`
`6`	`6`	`namespace GintKernel`
`@@ -213,7 +213,6 @@ void alloc_mult_force(const Grid_Technique& gridt,`
`213`	`213`	`}`
`214`	`214`	`}`
`215`	`215`	`atom_pair_num = tid;`
`216`		`- gpu_mat_cal_flag.clear();`
`217`	`216`	`}`
`218`	`217`
`219`	`218`