Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Commit 1440c4b

Browse files
author
Omar Awile
authored
Print memory use from model_size (#458)
* Use model_size to print the memory estimate; cleaned up the function a bit. * Added a detailed memory report: renamed the --count_mechs runtime option to --model-stats, which now also prints aggregate (min, max, avg) statistics on NrnThread member sizes over all MPI ranks.
1 parent ec39501 commit 1440c4b

File tree

6 files changed

+199
-43
lines changed

6 files changed

+199
-43
lines changed

coreneuron/apps/corenrn_parameters.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,9 @@ corenrn_parameters::corenrn_parameters() {
3838
this->verbose,
3939
{verbose_level::NONE, verbose_level::ERROR, verbose_level::INFO, verbose_level::DEBUG},
4040
"Verbose level: 0 = NONE, 1 = ERROR, 2 = INFO, 3 = DEBUG. Default is INFO");
41-
app.add_flag("--count_mechs",
42-
this->count_mechs,
43-
"Print number of instances of each mechanism.");
41+
app.add_flag("--model-stats",
42+
this->model_stats,
43+
"Print number of instances of each mechanism and detailed memory stats.");
4444

4545
auto sub_gpu = app.add_option_group("GPU", "Commands relative to GPU.");
4646
sub_gpu->add_option("-W, --nwarp", this->nwarp, "Number of warps to balance.", true)

coreneuron/apps/corenrn_parameters.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ struct corenrn_parameters {
5959

6060
bool show_version = false; /// Print version and exit.
6161

62-
bool count_mechs = false; /// Print mechanism counts after initialization
62+
bool model_stats = false; /// Print mechanism counts and model size after initialization
6363

6464
verbose_level verbose{verbose_level::DEFAULT}; /// Verbosity-level
6565

coreneuron/io/mech_report.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ void write_mech_report() {
4040
local_mech_count.size(),
4141
MPI_UNSIGNED_LONG_LONG,
4242
MPI_SUM,
43-
MPI_COMM_WORLD);
43+
nrnmpi_comm);
4444

4545
#else
4646
total_mech_count = local_mech_count;

coreneuron/io/nrn_setup.cpp

Lines changed: 189 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#include "coreneuron/utils/nrn_assert.h"
2323
#include "coreneuron/utils/nrnmutdec.h"
2424
#include "coreneuron/utils/memory.h"
25+
#include "coreneuron/mpi/nrnmpi.h"
26+
#include "coreneuron/mpi/nrnmpi_impl.h"
2527
#include "coreneuron/io/nrn_setup.hpp"
2628
#include "coreneuron/network/partrans.hpp"
2729
#include "coreneuron/io/nrn_checkpoint.hpp"
@@ -33,6 +35,7 @@
3335
#include "coreneuron/io/phase2.hpp"
3436
#include "coreneuron/io/mech_report.h"
3537
#include "coreneuron/apps/corenrn_parameters.hpp"
38+
#include "coreneuron/io/nrn_setup.hpp"
3639

3740
// callbacks into nrn/src/nrniv/nrnbbcore_write.cpp
3841
#include "coreneuron/sim/fast_imem.hpp"
@@ -149,8 +152,6 @@ extern corenrn_parameters corenrn_param;
149152

150153
static OMP_Mutex mut;
151154

152-
static size_t model_size(void);
153-
154155
/// Vector of maps for negative presyns
155156
std::vector<std::map<int, PreSyn*>> neg_gid2out;
156157
/// Maps for output and input presyns
@@ -537,15 +538,30 @@ void nrn_setup(const char* filesdat,
537538
/// which is only executed by StochKV.c.
538539
nrn_mk_table_check(); // was done in nrn_thread_memblist_setup in multicore.c
539540

540-
model_size();
541-
delete[] userParams.gidgroups;
541+
size_t model_size_bytes;
542+
543+
if (corenrn_param.model_stats) {
544+
write_mech_report();
545+
model_size_bytes = model_size(true);
546+
} else {
547+
model_size_bytes = model_size(false);
548+
}
542549

543550
if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) {
544551
printf(" Setup Done : %.2lf seconds \n", nrn_wtime() - time);
552+
553+
if (model_size_bytes < 1024) {
554+
printf(" Model size : %ld bytes\n", model_size_bytes);
555+
} else if (model_size_bytes < 1024 * 1024) {
556+
printf(" Model size : %.2lf kB\n", model_size_bytes / 1024.);
557+
} else if (model_size_bytes < 1024 * 1024 * 1024) {
558+
printf(" Model size : %.2lf MB\n", model_size_bytes / (1024. * 1024.));
559+
} else {
560+
printf(" Model size : %.2lf GB\n", model_size_bytes / (1024. * 1024. * 1024.));
561+
}
545562
}
546-
if (corenrn_param.count_mechs) {
547-
write_mech_report();
548-
}
563+
564+
delete[] userParams.gidgroups;
549565
}
550566

551567
void setup_ThreadData(NrnThread& nt) {
@@ -938,11 +954,8 @@ void read_phase3(NrnThread& nt, UserParams& userParams) {
938954
}
939955

940956
static size_t memb_list_size(NrnThreadMembList* tml) {
941-
size_t sz_ntml = sizeof(NrnThreadMembList);
942-
size_t sz_ml = sizeof(Memb_list);
943-
size_t szi = sizeof(int);
944-
size_t nbyte = sz_ntml + sz_ml;
945-
nbyte += tml->ml->nodecount * szi;
957+
size_t nbyte = sizeof(NrnThreadMembList) + sizeof(Memb_list);
958+
nbyte += tml->ml->nodecount * sizeof(int);
946959
nbyte += corenrn.get_prop_dparam_size()[tml->index] * tml->ml->nodecount * sizeof(Datum);
947960
#ifdef DEBUG
948961
int i = tml->index;
@@ -982,18 +995,21 @@ size_t input_presyn_size(void) {
982995
return nbyte;
983996
}
984997

985-
size_t model_size(void) {
998+
size_t model_size(bool detailed_report) {
986999
size_t nbyte = 0;
987-
size_t szd = sizeof(double);
988-
size_t szi = sizeof(int);
989-
size_t szv = sizeof(void*);
990-
size_t sz_th = sizeof(NrnThread);
991-
size_t sz_ps = sizeof(PreSyn);
992-
size_t sz_psi = sizeof(InputPreSyn);
993-
size_t sz_nc = sizeof(NetCon);
994-
size_t sz_pp = sizeof(Point_process);
1000+
size_t sz_nrnThread = sizeof(NrnThread);
1001+
size_t sz_presyn = sizeof(PreSyn);
1002+
size_t sz_input_presyn = sizeof(InputPreSyn);
1003+
size_t sz_netcon = sizeof(NetCon);
1004+
size_t sz_pntproc = sizeof(Point_process);
9951005
size_t nccnt = 0;
9961006

1007+
std::vector<size_t> size_data(13, 0);
1008+
std::vector<size_t> global_size_data_min(13, 0);
1009+
std::vector<size_t> global_size_data_max(13, 0);
1010+
std::vector<size_t> global_size_data_sum(13, 0);
1011+
std::vector<float> global_size_data_avg(13, 0.0);
1012+
9971013
for (int i = 0; i < nrn_nthread; ++i) {
9981014
NrnThread& nt = nrn_threads[i];
9991015
size_t nb_nt = 0; // per thread
@@ -1007,9 +1023,14 @@ size_t model_size(void) {
10071023
}
10081024

10091025
// basic thread size includes mechanism data and G*V=I matrix
1010-
nb_nt += sz_th;
1011-
nb_nt += nt._ndata * szd + nt._nidata * szi + nt._nvdata * szv;
1012-
nb_nt += nt.end * szi; // _v_parent_index
1026+
nb_nt += sz_nrnThread;
1027+
nb_nt += nt._ndata * sizeof(double) + nt._nidata * sizeof(int) + nt._nvdata * sizeof(void*);
1028+
nb_nt += nt.end * sizeof(int); // _v_parent_index
1029+
1030+
// network connectivity
1031+
nb_nt += nt.n_pntproc * sz_pntproc + nt.n_netcon * sz_netcon + nt.n_presyn * sz_presyn +
1032+
nt.n_input_presyn * sz_input_presyn + nt.n_weight * sizeof(double);
1033+
nbyte += nb_nt;
10131034

10141035
#ifdef DEBUG
10151036
printf("ncell=%d end=%d nmech=%d\n", nt.ncell, nt.end, nmech);
@@ -1023,35 +1044,167 @@ size_t model_size(void) {
10231044
printf("n_pntproc=%d sz=%ld nbyte=%ld\n", nt.n_pntproc, sz_pp, nt.n_pntproc * sz_pp);
10241045
printf("n_netcon=%d sz=%ld nbyte=%ld\n", nt.n_netcon, sz_nc, nt.n_netcon * sz_nc);
10251046
printf("n_weight = %d\n", nt.n_weight);
1026-
#endif
10271047

1028-
// spike handling
1029-
nb_nt += nt.n_pntproc * sz_pp + nt.n_netcon * sz_nc + nt.n_presyn * sz_ps +
1030-
nt.n_input_presyn * sz_psi + nt.n_weight * szd;
1031-
nbyte += nb_nt;
1032-
#ifdef DEBUG
10331048
printf("%d thread %d total bytes %ld\n", nrnmpi_myid, i, nb_nt);
10341049
#endif
1050+
1051+
if (detailed_report) {
1052+
size_data[0] += nt.ncell;
1053+
size_data[1] += nt.end;
1054+
size_data[2] += nmech;
1055+
size_data[3] += nt._ndata;
1056+
size_data[4] += nt._nidata;
1057+
size_data[5] += nt._nvdata;
1058+
size_data[6] += nt.n_presyn;
1059+
size_data[7] += nt.n_input_presyn;
1060+
size_data[8] += nt.n_pntproc;
1061+
size_data[9] += nt.n_netcon;
1062+
size_data[10] += nt.n_weight;
1063+
size_data[11] += nb_nt;
1064+
}
10351065
}
10361066

1037-
#ifdef DEBUG
1038-
printf("%d netcon pointers %ld nbyte=%ld\n", nrnmpi_myid, nccnt, nccnt * sizeof(NetCon*));
1039-
#endif
10401067
nbyte += nccnt * sizeof(NetCon*);
10411068
nbyte += output_presyn_size();
10421069
nbyte += input_presyn_size();
10431070

1071+
nbyte += nrnran123_instance_count() * nrnran123_state_size();
1072+
10441073
#ifdef DEBUG
1074+
printf("%d netcon pointers %ld nbyte=%ld\n", nrnmpi_myid, nccnt, nccnt * sizeof(NetCon*));
10451075
printf("nrnran123 size=%ld cnt=%ld nbyte=%ld\n",
10461076
nrnran123_state_size(),
10471077
nrnran123_instance_count(),
10481078
nrnran123_instance_count() * nrnran123_state_size());
1079+
printf("%d total bytes %ld\n", nrnmpi_myid, nbyte);
1080+
#endif
1081+
if (detailed_report) {
1082+
size_data[12] = nbyte;
1083+
#if NRNMPI
1084+
MPI_Allreduce(&size_data[0],
1085+
&global_size_data_min[0],
1086+
13,
1087+
MPI_UNSIGNED_LONG_LONG,
1088+
MPI_MIN,
1089+
nrnmpi_comm);
1090+
MPI_Allreduce(&size_data[0],
1091+
&global_size_data_max[0],
1092+
13,
1093+
MPI_UNSIGNED_LONG_LONG,
1094+
MPI_MAX,
1095+
nrnmpi_comm);
1096+
MPI_Allreduce(&size_data[0],
1097+
&global_size_data_sum[0],
1098+
13,
1099+
MPI_UNSIGNED_LONG_LONG,
1100+
MPI_SUM,
1101+
nrnmpi_comm);
1102+
for (int i = 0; i < 13; i++) {
1103+
global_size_data_avg[i] = global_size_data_sum[i] / float(nrnmpi_numprocs);
1104+
}
1105+
#else
1106+
global_size_data_max = size_data;
1107+
global_size_data_min = size_data;
1108+
global_size_data_avg.assign(size_data.cbegin(), size_data.cend());
10491109
#endif
1110+
// now print the collected data:
1111+
if (nrnmpi_myid == 0) {
1112+
printf("Memory size information for all NrnThreads per rank\n");
1113+
printf("------------------------------------------------------------------\n");
1114+
printf("%22s %12s %12s %12s\n", "field", "min", "max", "avg");
1115+
printf("%22s %12ld %12ld %15.2f\n",
1116+
"n_cell",
1117+
global_size_data_min[0],
1118+
global_size_data_max[0],
1119+
global_size_data_avg[0]);
1120+
printf("%22s %12ld %12ld %15.2f\n",
1121+
"n_compartment",
1122+
global_size_data_min[1],
1123+
global_size_data_max[1],
1124+
global_size_data_avg[1]);
1125+
printf("%22s %12ld %12ld %15.2f\n",
1126+
"n_mechanism",
1127+
global_size_data_min[2],
1128+
global_size_data_max[2],
1129+
global_size_data_avg[2]);
1130+
printf("%22s %12ld %12ld %15.2f\n",
1131+
"_ndata",
1132+
global_size_data_min[3],
1133+
global_size_data_max[3],
1134+
global_size_data_avg[3]);
1135+
printf("%22s %12ld %12ld %15.2f\n",
1136+
"_nidata",
1137+
global_size_data_min[4],
1138+
global_size_data_max[4],
1139+
global_size_data_avg[4]);
1140+
printf("%22s %12ld %12ld %15.2f\n",
1141+
"_nvdata",
1142+
global_size_data_min[5],
1143+
global_size_data_max[5],
1144+
global_size_data_avg[5]);
1145+
printf("%22s %12ld %12ld %15.2f\n",
1146+
"n_presyn",
1147+
global_size_data_min[6],
1148+
global_size_data_max[6],
1149+
global_size_data_avg[6]);
1150+
printf("%22s %12ld %12ld %15.2f\n",
1151+
"n_presyn (bytes)",
1152+
global_size_data_min[6] * sz_presyn,
1153+
global_size_data_max[6] * sz_presyn,
1154+
global_size_data_avg[6] * sz_presyn);
1155+
printf("%22s %12ld %12ld %15.2f\n",
1156+
"n_input_presyn",
1157+
global_size_data_min[7],
1158+
global_size_data_max[7],
1159+
global_size_data_avg[7]);
1160+
printf("%22s %12ld %12ld %15.2f\n",
1161+
"n_input_presyn (bytes)",
1162+
global_size_data_min[7] * sz_input_presyn,
1163+
global_size_data_max[7] * sz_input_presyn,
1164+
global_size_data_avg[7] * sz_input_presyn);
1165+
printf("%22s %12ld %12ld %15.2f\n",
1166+
"n_pntproc",
1167+
global_size_data_min[8],
1168+
global_size_data_max[8],
1169+
global_size_data_avg[8]);
1170+
printf("%22s %12ld %12ld %15.2f\n",
1171+
"n_pntproc (bytes)",
1172+
global_size_data_min[8] * sz_pntproc,
1173+
global_size_data_max[8] * sz_pntproc,
1174+
global_size_data_avg[8] * sz_pntproc);
1175+
printf("%22s %12ld %12ld %15.2f\n",
1176+
"n_netcon",
1177+
global_size_data_min[9],
1178+
global_size_data_max[9],
1179+
global_size_data_avg[9]);
1180+
printf("%22s %12ld %12ld %15.2f\n",
1181+
"n_netcon (bytes)",
1182+
global_size_data_min[9] * sz_netcon,
1183+
global_size_data_max[9] * sz_netcon,
1184+
global_size_data_avg[9] * sz_netcon);
1185+
printf("%22s %12ld %12ld %15.2f\n",
1186+
"n_weight",
1187+
global_size_data_min[10],
1188+
global_size_data_max[10],
1189+
global_size_data_avg[10]);
1190+
printf("%22s %12ld %12ld %15.2f\n",
1191+
"NrnThread (bytes)",
1192+
global_size_data_min[11],
1193+
global_size_data_max[11],
1194+
global_size_data_avg[11]);
1195+
printf("%22s %12ld %12ld %15.2f\n",
1196+
"model size (bytes)",
1197+
global_size_data_min[12],
1198+
global_size_data_max[12],
1199+
global_size_data_avg[12]);
1200+
}
1201+
}
10501202

1051-
nbyte += nrnran123_instance_count() * nrnran123_state_size();
1203+
#if NRNMPI
1204+
size_t global_nbyte = 0;
1205+
MPI_Allreduce(&nbyte, &global_nbyte, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, nrnmpi_comm);
1206+
nbyte = global_nbyte;
10521207

1053-
#ifdef DEBUG
1054-
printf("%d total bytes %ld\n", nrnmpi_myid, nbyte);
10551208
#endif
10561209

10571210
return nbyte;

coreneuron/io/nrn_setup.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,11 @@ extern void nrn_setup_cleanup();
3232

3333
extern int nrn_i_layout(int i, int cnt, int j, int size, int layout);
3434

35+
size_t model_size(bool detailed_report);
36+
3537
namespace coreneuron {
3638

39+
3740
/// Reading phase number.
3841
enum phase { one = 1, two, three, gap };
3942

tests/integration/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
# =============================================================================
66

77
set(COMMON_ARGS "--tstop 100. --celsius 6.3 --mpi")
8-
set(COUNT_MECHS_ARG "--count_mechs")
8+
set(MODEL_STATS_ARG "--model-stats")
99
set(RING_DATASET_DIR "${CMAKE_CURRENT_SOURCE_DIR}/ring")
1010
set(RING_COMMON_ARGS "--datpath ${RING_DATASET_DIR} ${COMMON_ARGS}")
1111
set(RING_GAP_COMMON_ARGS "--datpath ${CMAKE_CURRENT_SOURCE_DIR}/ring_gap ${COMMON_ARGS}")
@@ -17,7 +17,7 @@ endif()
1717

1818
# List of tests with arguments
1919
set(TEST_CASES_WITH_ARGS
20-
"ring!${RING_COMMON_ARGS} ${COUNT_MECHS_ARG} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring"
20+
"ring!${RING_COMMON_ARGS} ${MODEL_STATS_ARG} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring"
2121
"ring_binqueue!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_binqueue --binqueue"
2222
"ring_multisend!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_multisend --multisend"
2323
"ring_spike_buffer!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_spike_buffer --spikebuf 1"

0 commit comments

Comments
 (0)