Skip to content

Commit c7f8473

Browse files
authored
Merge pull request #227 from denghuilu/devel-submit
add max nbor size from 1024 to 4096
2 parents 3d77e59 + 848e1f4 commit c7f8473

File tree

6 files changed

+284
-186
lines changed

6 files changed

+284
-186
lines changed

source/lib/include/NNPInter.h

Lines changed: 3 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -93,10 +93,8 @@ class NNPInter
9393
compute_t *array_double;
9494
InternalNeighborList nlist;
9595
NNPAtomMap<VALUETYPE> nnpmap;
96-
unsigned long long *array_longlong;
97-
int *ilist, *jrange, *jlist, *array_int;
96+
int *ilist, *jrange, *jlist;
9897
int ilist_size, jrange_size, jlist_size;
99-
int arr_int_size, arr_ll_size, arr_dou_size;
10098

10199
// function used for neighbor list copy
102100
vector<int> get_sel_a() const;
@@ -191,13 +189,10 @@ class NNPInterModelDevi
191189
vector<vector<int> > sec;
192190
InternalNeighborList nlist;
193191
NNPAtomMap<VALUETYPE> nnpmap;
194-
unsigned long long *array_longlong;
195-
int max_sec_size = 0, max_sec_back = 0;
196-
int *ilist, *jrange, *jlist, *array_int;
197-
int ilist_size, jrange_size, jlist_size, arr_int_size, arr_ll_size, arr_dou_size;
192+
int *ilist, *jrange, *jlist;
193+
int ilist_size, jrange_size, jlist_size;
198194

199195
// function used for nborlist copy
200-
void get_max_sec();
201196
vector<vector<int> > get_sel() const;
202197
void cum_sum(const std::vector<std::vector<int32> > n_sel);
203198
#ifdef USE_CUDA_TOOLKIT

source/lib/include/common.h

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -144,9 +144,6 @@ session_input_tensors (vector<std::pair<string, Tensor>>& input_tensors,
144144
const int * ilist,
145145
const int * jrange,
146146
const int * jlist,
147-
int * array_int,
148-
unsigned long long * array_longlong,
149-
double * array_double,
150147
const vector<VALUETYPE> & fparam_,
151148
const vector<VALUETYPE> & aparam_,
152149
const NNPAtomMap<VALUETYPE> & nnpmap,

source/lib/src/NNPInter.cc

Lines changed: 10 additions & 104 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@
33
#include "SimulationRegion.h"
44
#include <stdexcept>
55

6-
#define MAGIC_NUMBER 1024
76

87
#ifdef USE_CUDA_TOOLKIT
98
#include "cuda_runtime.h"
@@ -14,7 +13,7 @@
1413
#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); }
1514
inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true)
1615
{
17-
if (code != cudaSuccess)
16+
if (code != cudaSuccess)
1817
{
1918
fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line);
2019
if (abort) exit(code);
@@ -273,34 +272,19 @@ NNPInter::~NNPInter() {
273272
cudaErrcheck(cudaFree(ilist));
274273
cudaErrcheck(cudaFree(jrange));
275274
cudaErrcheck(cudaFree(jlist));
276-
cudaErrcheck(cudaFree(array_int));
277-
cudaErrcheck(cudaFree(array_longlong));
278-
cudaErrcheck(cudaFree(array_double));
279275
}
280276
#endif
281277
}
282278

283279
#ifdef USE_CUDA_TOOLKIT
284280
void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) {
285281
if (!init_nbor) {
286-
sec_a = cum_sum(get_sel_a());
287282
cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size()));
288283
cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size()));
289284
cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size()));
290-
cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (sec_a.size() + nloc * sec_a.size() + nloc)));
291-
cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2));
292-
#ifdef HIGH_PREC
293-
cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3));
294-
#else
295-
cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3));
296-
#endif
297285
ilist_size = nlist.ilist.size();
298286
jrange_size = nlist.jrange.size();
299287
jlist_size = nlist.jlist.size();
300-
arr_int_size = sec_a.size() + nloc * sec_a.size() + nloc;
301-
arr_ll_size = nloc * MAGIC_NUMBER * 2;
302-
arr_dou_size = nloc * sec_a.back() * 3;
303-
init_nbor = true;
304288
}
305289
if (ilist_size < nlist.ilist.size()) {
306290
cudaErrcheck(cudaFree(ilist));
@@ -317,25 +301,7 @@ void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) {
317301
cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size()));
318302
jlist_size = nlist.jlist.size();
319303
}
320-
if (arr_int_size < sec_a.size() + nloc * sec_a.size() + nloc) {
321-
cudaErrcheck(cudaFree(array_int));
322-
cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (sec_a.size() + nloc * sec_a.size() + nloc)));
323-
arr_int_size = sec_a.size() + nloc * sec_a.size() + nloc;
324-
}
325-
if (arr_ll_size < nloc * MAGIC_NUMBER * 2) {
326-
cudaErrcheck(cudaFree(array_longlong));
327-
cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2));
328-
arr_ll_size = nloc * MAGIC_NUMBER * 2;
329-
}
330-
if (arr_dou_size < nloc * sec_a.back() * 3) {
331-
cudaErrcheck(cudaFree(array_double));
332-
#ifdef HIGH_PREC
333-
cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3));
334-
#else
335-
cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3));
336-
#endif
337-
arr_dou_size = nloc * sec_a.back() * 3;
338-
}
304+
339305
cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice));
340306
cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice));
341307
cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice));
@@ -378,14 +344,10 @@ init (const string & model, const int & gpu_rank)
378344
if (dfparam < 0) dfparam = 0;
379345
if (daparam < 0) daparam = 0;
380346
inited = true;
381-
347+
382348
init_nbor = false;
383-
array_int = NULL;
384-
array_double = NULL;
385-
array_longlong = NULL;
386349
ilist = NULL; jrange = NULL; jlist = NULL;
387350
ilist_size = 0; jrange_size = 0; jlist_size = 0;
388-
arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0;
389351
}
390352
#else
391353
void
@@ -415,12 +377,8 @@ init (const string & model, const int & gpu_rank)
415377
inited = true;
416378

417379
init_nbor = false;
418-
array_int = NULL;
419-
array_double = NULL;
420-
array_longlong = NULL;
421380
ilist = NULL; jrange = NULL; jlist = NULL;
422381
ilist_size = 0; jrange_size = 0; jlist_size = 0;
423-
arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0;
424382
}
425383
#endif
426384

@@ -602,7 +560,7 @@ compute_inner (ENERGYTYPE & dener,
602560
}
603561

604562
#ifdef USE_CUDA_TOOLKIT
605-
int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost);
563+
int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost);
606564
#else
607565
int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost);
608566
#endif
@@ -669,7 +627,7 @@ compute (ENERGYTYPE & dener,
669627
}
670628

671629
#ifdef USE_CUDA_TOOLKIT
672-
int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost);
630+
int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost);
673631
#else
674632
int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost);
675633
#endif
@@ -710,9 +668,6 @@ NNPInterModelDevi::~NNPInterModelDevi() {
710668
cudaErrcheck(cudaFree(ilist));
711669
cudaErrcheck(cudaFree(jrange));
712670
cudaErrcheck(cudaFree(jlist));
713-
cudaErrcheck(cudaFree(array_int));
714-
cudaErrcheck(cudaFree(array_longlong));
715-
cudaErrcheck(cudaFree(array_double));
716671
}
717672
#endif
718673
}
@@ -761,14 +716,10 @@ init (const vector<string> & models, const int & gpu_rank)
761716
// cell_size = rcut;
762717
// ntypes = get_ntypes();
763718
inited = true;
764-
719+
765720
init_nbor = false;
766-
array_int = NULL;
767-
array_double = NULL;
768-
array_longlong = NULL;
769721
ilist = NULL; jrange = NULL; jlist = NULL;
770722
ilist_size = 0; jrange_size = 0; jlist_size = 0;
771-
arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0;
772723
}
773724
#else
774725
void
@@ -798,14 +749,10 @@ init (const vector<string> & models, const int & gpu_rank)
798749
// cell_size = rcut;
799750
// ntypes = get_ntypes();
800751
inited = true;
801-
752+
802753
init_nbor = false;
803-
array_int = NULL;
804-
array_double = NULL;
805-
array_longlong = NULL;
806754
ilist = NULL; jrange = NULL; jlist = NULL;
807755
ilist_size = 0; jrange_size = 0; jlist_size = 0;
808-
arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0;
809756
}
810757
#endif
811758

@@ -873,41 +820,18 @@ cum_sum (const std::vector<std::vector<int32> > n_sel)
873820
}
874821
}
875822

876-
void
877-
NNPInterModelDevi::
878-
get_max_sec()
879-
{
880-
for (int ii = 0; ii < numb_models; ii++) {
881-
this->max_sec_size = max_sec_size < sec[ii].size() ? sec[ii].size() : max_sec_size;
882-
this->max_sec_back = max_sec_back < sec[ii].back() ? sec[ii].back() : max_sec_back;
883-
}
884-
}
885-
886823
#ifdef USE_CUDA_TOOLKIT
887824
void
888825
NNPInterModelDevi::
889826
update_nbor(const InternalNeighborList & nlist, const int nloc)
890827
{
891828
if (!init_nbor) {
892-
cum_sum(get_sel());
893-
get_max_sec();
894829
cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size()));
895830
cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size()));
896831
cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size()));
897-
cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (max_sec_size + nloc * max_sec_size + nloc)));
898-
cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2));
899-
#ifdef HIGH_PREC
900-
cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3));
901-
#else
902-
cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3));
903-
#endif
904832
ilist_size = nlist.ilist.size();
905833
jrange_size = nlist.jrange.size();
906834
jlist_size = nlist.jlist.size();
907-
arr_int_size = max_sec_size + nloc * max_sec_size + nloc;
908-
arr_ll_size = nloc * MAGIC_NUMBER * 2;
909-
arr_dou_size = nloc * max_sec_back * 3;
910-
init_nbor = true;
911835
}
912836
if (ilist_size < nlist.ilist.size()) {
913837
cudaErrcheck(cudaFree(ilist));
@@ -924,25 +848,7 @@ update_nbor(const InternalNeighborList & nlist, const int nloc)
924848
cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size()));
925849
jlist_size = nlist.jlist.size();
926850
}
927-
if (arr_int_size < max_sec_size + nloc * max_sec_size + nloc) {
928-
cudaErrcheck(cudaFree(array_int));
929-
cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (max_sec_size + nloc * max_sec_size + nloc)));
930-
arr_int_size = max_sec_size + nloc * max_sec_size + nloc;
931-
}
932-
if (arr_ll_size < nloc * MAGIC_NUMBER * 2) {
933-
cudaErrcheck(cudaFree(array_longlong));
934-
cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2));
935-
arr_ll_size = nloc * MAGIC_NUMBER * 2;
936-
}
937-
if (arr_dou_size < nloc * max_sec_back * 3) {
938-
cudaErrcheck(cudaFree(array_double));
939-
#ifdef HIGH_PREC
940-
cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3));
941-
#else
942-
cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3));
943-
#endif
944-
arr_dou_size = nloc * max_sec_back * 3;
945-
}
851+
946852
cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice));
947853
cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice));
948854
cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice));
@@ -1044,7 +950,7 @@ compute (vector<ENERGYTYPE> & all_energy,
1044950

1045951
}
1046952
#ifdef USE_CUDA_TOOLKIT
1047-
int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost);
953+
int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost);
1048954
#else
1049955
int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost);
1050956
#endif
@@ -1094,7 +1000,7 @@ compute (vector<ENERGYTYPE> & all_energy,
10941000

10951001
}
10961002
#ifdef USE_CUDA_TOOLKIT
1097-
int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost);
1003+
int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost);
10981004
#else
10991005
int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost);
11001006
#endif

source/lib/src/common.cc

Lines changed: 2 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -479,9 +479,6 @@ session_input_tensors (
479479
const int * ilist,
480480
const int * jrange,
481481
const int * jlist,
482-
int * array_int,
483-
unsigned long long * array_longlong,
484-
double * array_double,
485482
const vector<VALUETYPE> & fparam_,
486483
const vector<VALUETYPE> & aparam_,
487484
const NNPAtomMap<VALUETYPE> & nnpmap,
@@ -511,7 +508,7 @@ session_input_tensors (
511508
box_shape.AddDim (nframes);
512509
box_shape.AddDim (9);
513510
TensorShape mesh_shape;
514-
mesh_shape.AddDim (32);
511+
mesh_shape.AddDim (16);
515512
TensorShape natoms_shape;
516513
natoms_shape.AddDim (2 + ntypes);
517514
TensorShape fparam_shape;
@@ -565,7 +562,7 @@ session_input_tensors (
565562
}
566563
}
567564

568-
for (int ii = 0; ii < 32; ++ii) mesh(ii) = 0;
565+
for (int ii = 0; ii < 16; ++ii) mesh(ii) = 0;
569566

570567
mesh (0) = sizeof(int *) / sizeof(int);
571568
assert (mesh(0) * sizeof(int) == sizeof(int *));
@@ -577,9 +574,6 @@ session_input_tensors (
577574
memcpy (&mesh(4), &(ilist), sizeof(int *));
578575
memcpy (&mesh(8), &(jrange), sizeof(int *));
579576
memcpy (&mesh(12), &(jlist), sizeof(int *));
580-
memcpy (&mesh(16), &(array_int), sizeof(int *));
581-
memcpy (&mesh(20), &(array_longlong), sizeof(unsigned long long *));
582-
memcpy (&mesh(24), &(array_double), sizeof(double *));
583577

584578
natoms (0) = nloc;
585579
natoms (1) = nall;

0 commit comments

Comments (0)