Merge pull request #227 from denghuilu/devel-submit

amcadmus · web-flow · commit c7f8473a5dbc · 2020-05-27T14:35:15.000+08:00
Increase max neighbor size from 1024 to 4096
diff --git a/source/lib/include/NNPInter.h b/source/lib/include/NNPInter.h
@@ -93,10 +93,8 @@ class NNPInter
   compute_t *array_double;
   InternalNeighborList nlist;
   NNPAtomMap<VALUETYPE> nnpmap;
-  unsigned long long *array_longlong;
-  int *ilist, *jrange, *jlist, *array_int;
+  int *ilist, *jrange, *jlist;
   int ilist_size, jrange_size, jlist_size;
-  int arr_int_size, arr_ll_size, arr_dou_size;
 
   // function used for neighbor list copy
   vector<int> get_sel_a() const;
@@ -191,13 +189,10 @@ class NNPInterModelDevi
   vector<vector<int> > sec;
   InternalNeighborList nlist;
   NNPAtomMap<VALUETYPE> nnpmap;
-  unsigned long long *array_longlong;
-  int max_sec_size = 0, max_sec_back = 0;
-  int *ilist, *jrange, *jlist, *array_int;
-  int ilist_size, jrange_size, jlist_size, arr_int_size, arr_ll_size, arr_dou_size;
+  int *ilist, *jrange, *jlist;
+  int ilist_size, jrange_size, jlist_size;
 
   // function used for nborlist copy
-  void get_max_sec();
   vector<vector<int> > get_sel() const;
   void cum_sum(const std::vector<std::vector<int32> > n_sel);
 #ifdef USE_CUDA_TOOLKIT
diff --git a/source/lib/include/common.h b/source/lib/include/common.h
@@ -144,9 +144,6 @@ session_input_tensors (vector<std::pair<string, Tensor>>& input_tensors,
 		       const int                        * ilist, 
 		       const int                        * jrange,
 		       const int                        * jlist,
-		       int                              * array_int,
-		       unsigned long long               * array_longlong, 
-		       double                           * array_double,
 		       const vector<VALUETYPE>		& fparam_,
 		       const vector<VALUETYPE>	        & aparam_,
 		       const NNPAtomMap<VALUETYPE>      & nnpmap,
diff --git a/source/lib/src/NNPInter.cc b/source/lib/src/NNPInter.cc
@@ -3,7 +3,6 @@
 #include "SimulationRegion.h"
 #include <stdexcept>	
 
-#define MAGIC_NUMBER 1024
 
 #ifdef  USE_CUDA_TOOLKIT
 #include "cuda_runtime.h"
@@ -14,7 +13,7 @@
 #define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); }
 inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true)
 {
-    if (code != cudaSuccess) 
+    if (code != cudaSuccess)
     {
         fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line);
         if (abort) exit(code);
@@ -273,34 +272,19 @@ NNPInter::~NNPInter() {
         cudaErrcheck(cudaFree(ilist));
         cudaErrcheck(cudaFree(jrange));
         cudaErrcheck(cudaFree(jlist));
-        cudaErrcheck(cudaFree(array_int));
-        cudaErrcheck(cudaFree(array_longlong));
-        cudaErrcheck(cudaFree(array_double));
     }
     #endif
 }
 
 #ifdef USE_CUDA_TOOLKIT
 void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) {
     if (!init_nbor) {
-        sec_a = cum_sum(get_sel_a());
         cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size()));
         cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size()));
         cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size()));
-        cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (sec_a.size() + nloc * sec_a.size() + nloc)));
-        cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2));
-        #ifdef HIGH_PREC
-            cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3));
-        #else
-            cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3));
-        #endif
         ilist_size = nlist.ilist.size();
         jrange_size = nlist.jrange.size();
         jlist_size = nlist.jlist.size();
-        arr_int_size = sec_a.size() + nloc * sec_a.size() + nloc;
-        arr_ll_size = nloc * MAGIC_NUMBER * 2;
-        arr_dou_size = nloc * sec_a.back() * 3;
-        init_nbor = true;
     }
     if (ilist_size < nlist.ilist.size()) {
         cudaErrcheck(cudaFree(ilist));
@@ -317,25 +301,7 @@ void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) {
         cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size()));
         jlist_size = nlist.jlist.size();
     }
-    if (arr_int_size < sec_a.size() + nloc * sec_a.size() + nloc) {
-        cudaErrcheck(cudaFree(array_int));
-        cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (sec_a.size() + nloc * sec_a.size() + nloc)));
-        arr_int_size = sec_a.size() + nloc * sec_a.size() + nloc;
-    }
-    if (arr_ll_size < nloc * MAGIC_NUMBER * 2) {
-        cudaErrcheck(cudaFree(array_longlong));
-        cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2));
-        arr_ll_size = nloc * MAGIC_NUMBER * 2;
-    }
-    if (arr_dou_size < nloc * sec_a.back() * 3) {
-        cudaErrcheck(cudaFree(array_double));
-        #ifdef HIGH_PREC
-            cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3));
-        #else
-            cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3));
-        #endif
-        arr_dou_size = nloc * sec_a.back() * 3;
-    }
+    
     cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice));
     cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice));
     cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice));
@@ -378,14 +344,10 @@ init (const string & model, const int & gpu_rank)
   if (dfparam < 0) dfparam = 0;
   if (daparam < 0) daparam = 0;
   inited = true;
-
+  
   init_nbor = false;
-  array_int = NULL;
-  array_double = NULL;
-  array_longlong = NULL;
   ilist = NULL; jrange = NULL; jlist = NULL;
   ilist_size = 0; jrange_size = 0; jlist_size = 0;
-  arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0;
 }
 #else
 void
@@ -415,12 +377,8 @@ init (const string & model, const int & gpu_rank)
   inited = true;
 
   init_nbor = false;
-  array_int = NULL;
-  array_double = NULL;
-  array_longlong = NULL;
   ilist = NULL; jrange = NULL; jlist = NULL;
   ilist_size = 0; jrange_size = 0; jlist_size = 0;
-  arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0;
 }
 #endif
 
@@ -602,7 +560,7 @@ compute_inner (ENERGYTYPE &			dener,
     }
 
     #ifdef USE_CUDA_TOOLKIT
-        int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost);
+        int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost);
     #else
         int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost);
     #endif
@@ -669,7 +627,7 @@ compute (ENERGYTYPE &			dener,
     }
 
     #ifdef USE_CUDA_TOOLKIT
-        int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost);
+        int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost);
     #else
         int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost);
     #endif
@@ -710,9 +668,6 @@ NNPInterModelDevi::~NNPInterModelDevi() {
         cudaErrcheck(cudaFree(ilist));
         cudaErrcheck(cudaFree(jrange));
         cudaErrcheck(cudaFree(jlist));
-        cudaErrcheck(cudaFree(array_int));
-        cudaErrcheck(cudaFree(array_longlong));
-        cudaErrcheck(cudaFree(array_double));
     }
 #endif
 }
@@ -761,14 +716,10 @@ init (const vector<string> & models, const int & gpu_rank)
   // cell_size = rcut;
   // ntypes = get_ntypes();
   inited = true;
-
+  
   init_nbor = false;
-  array_int = NULL;
-  array_double = NULL;
-  array_longlong = NULL;
   ilist = NULL; jrange = NULL; jlist = NULL;
   ilist_size = 0; jrange_size = 0; jlist_size = 0;
-  arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0;
 }
 #else
 void
@@ -798,14 +749,10 @@ init (const vector<string> & models, const int & gpu_rank)
   // cell_size = rcut;
   // ntypes = get_ntypes();
   inited = true;
-
+  
   init_nbor = false;
-  array_int = NULL;
-  array_double = NULL;
-  array_longlong = NULL;
   ilist = NULL; jrange = NULL; jlist = NULL;
   ilist_size = 0; jrange_size = 0; jlist_size = 0;
-  arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0;
 }
 #endif
 
@@ -873,41 +820,18 @@ cum_sum (const std::vector<std::vector<int32> > n_sel)
     }
 }
 
-void  
-NNPInterModelDevi::
-get_max_sec() 
-{
-    for (int ii = 0; ii < numb_models; ii++) {
-        this->max_sec_size = max_sec_size < sec[ii].size() ? sec[ii].size() : max_sec_size;
-        this->max_sec_back = max_sec_back < sec[ii].back() ? sec[ii].back() : max_sec_back;
-    }
-}
-
 #ifdef USE_CUDA_TOOLKIT
 void
 NNPInterModelDevi::
 update_nbor(const InternalNeighborList & nlist, const int nloc) 
 {
     if (!init_nbor) {
-        cum_sum(get_sel());
-        get_max_sec();
         cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size()));
         cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size()));
         cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size()));
-        cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (max_sec_size + nloc * max_sec_size + nloc)));
-        cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2));
-        #ifdef HIGH_PREC
-            cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3));
-        #else
-            cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3));
-        #endif
         ilist_size = nlist.ilist.size();
         jrange_size = nlist.jrange.size();
         jlist_size = nlist.jlist.size();
-        arr_int_size = max_sec_size + nloc * max_sec_size + nloc;
-        arr_ll_size = nloc * MAGIC_NUMBER * 2;
-        arr_dou_size = nloc * max_sec_back * 3;
-        init_nbor = true;
     }
     if (ilist_size < nlist.ilist.size()) {
         cudaErrcheck(cudaFree(ilist));
@@ -924,25 +848,7 @@ update_nbor(const InternalNeighborList & nlist, const int nloc)
         cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size()));
         jlist_size = nlist.jlist.size();
     }
-    if (arr_int_size < max_sec_size + nloc * max_sec_size + nloc) {
-        cudaErrcheck(cudaFree(array_int));
-        cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (max_sec_size + nloc * max_sec_size + nloc)));
-        arr_int_size = max_sec_size + nloc * max_sec_size + nloc;
-    }
-    if (arr_ll_size < nloc * MAGIC_NUMBER * 2) {
-        cudaErrcheck(cudaFree(array_longlong));
-        cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2));
-        arr_ll_size = nloc * MAGIC_NUMBER * 2;
-    }
-    if (arr_dou_size < nloc * max_sec_back * 3) {
-        cudaErrcheck(cudaFree(array_double));
-        #ifdef HIGH_PREC
-            cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3));
-        #else
-            cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3));
-        #endif
-        arr_dou_size = nloc * max_sec_back * 3;
-    }
+
     cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice));
     cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice));
     cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice));
@@ -1044,7 +950,7 @@ compute (vector<ENERGYTYPE> &		all_energy,
 
     }
     #ifdef USE_CUDA_TOOLKIT
-        int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost);
+        int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost);
     #else
         int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost);
     #endif
@@ -1094,7 +1000,7 @@ compute (vector<ENERGYTYPE> &			all_energy,
         
     }
     #ifdef USE_CUDA_TOOLKIT
-        int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost);
+        int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost);
     #else
         int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost);
     #endif
diff --git a/source/lib/src/common.cc b/source/lib/src/common.cc
@@ -479,9 +479,6 @@ session_input_tensors (
     const int                           *   ilist, 
     const int                           *   jrange,
     const int                           *   jlist,
-    int                                 *   array_int,
-    unsigned long long                  *   array_longlong, 
-    double                              *   array_double,
     const vector<VALUETYPE>	            &   fparam_,
     const vector<VALUETYPE>	            &   aparam_,
     const NNPAtomMap<VALUETYPE>         &   nnpmap,
@@ -511,7 +508,7 @@ session_input_tensors (
     box_shape.AddDim (nframes);
     box_shape.AddDim (9);
     TensorShape mesh_shape;
-    mesh_shape.AddDim (32);
+    mesh_shape.AddDim (16);
     TensorShape natoms_shape;
     natoms_shape.AddDim (2 + ntypes);
     TensorShape fparam_shape;
@@ -565,7 +562,7 @@ session_input_tensors (
         }
     }
     
-    for (int ii = 0; ii < 32; ++ii) mesh(ii) = 0;
+    for (int ii = 0; ii < 16; ++ii) mesh(ii) = 0;
     
     mesh (0) = sizeof(int *) / sizeof(int);
     assert (mesh(0) * sizeof(int) == sizeof(int *));
@@ -577,9 +574,6 @@ session_input_tensors (
     memcpy (&mesh(4), &(ilist), sizeof(int *));
     memcpy (&mesh(8), &(jrange), sizeof(int *));
     memcpy (&mesh(12), &(jlist), sizeof(int *));
-    memcpy (&mesh(16), &(array_int), sizeof(int *));
-    memcpy (&mesh(20), &(array_longlong), sizeof(unsigned long long *));
-    memcpy (&mesh(24), &(array_double), sizeof(double *));
 
     natoms (0) = nloc;
     natoms (1) = nall;
diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu
diff --git a/source/op/descrpt_se_a_gpu.cc b/source/op/descrpt_se_a_gpu.cc