Skip to content

Commit c0ca245

Browse files
committed
Add assert in some places and optimize redundant index calculations in nested loops
1 parent 079f791 commit c0ca245

File tree

6 files changed

+194
-179
lines changed

6 files changed

+194
-179
lines changed

source/module_hamilt_lcao/module_tddft/band_energy.cpp

Lines changed: 43 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,13 @@ void compute_ekb(const Parallel_Orbitals* pv,
2828
double* ekb,
2929
std::ofstream& ofs_running)
3030
{
31+
assert(pv->nloc_wfc > 0 && pv->nloc > 0);
3132

3233
std::complex<double>* tmp1 = new std::complex<double>[pv->nloc_wfc];
3334
ModuleBase::GlobalFunc::ZEROS(tmp1, pv->nloc_wfc);
3435

35-
std::complex<double>* Eij = new std::complex<double>[pv->nloc];
36-
ModuleBase::GlobalFunc::ZEROS(Eij, pv->nloc);
36+
std::complex<double>* eij = new std::complex<double>[pv->nloc];
37+
ModuleBase::GlobalFunc::ZEROS(eij, pv->nloc);
3738

3839
ScalapackConnector::gemm('N',
3940
'N',
@@ -70,7 +71,7 @@ void compute_ekb(const Parallel_Orbitals* pv,
7071
1,
7172
pv->desc_wfc,
7273
0.0,
73-
Eij,
74+
eij,
7475
1,
7576
1,
7677
pv->desc_Eij);
@@ -83,12 +84,11 @@ void compute_ekb(const Parallel_Orbitals* pv,
8384
ofs_running << " Eij:" << std::endl;
8485
for (int i = 0; i < pv->nrow_bands; i++)
8586
{
87+
const int in = i * pv->ncol;
8688
for (int j = 0; j < pv->ncol_bands; j++)
8789
{
88-
double aa = 0.0;
89-
double bb = 0.0;
90-
aa = Eij[i * pv->ncol + j].real();
91-
bb = Eij[i * pv->ncol + j].imag();
90+
double aa = eij[in + j].real();
91+
double bb = eij[in + j].imag();
9292
if (std::abs(aa) < PARAM.inp.td_print_eij)
9393
{
9494
aa = 0.0;
@@ -112,8 +112,10 @@ void compute_ekb(const Parallel_Orbitals* pv,
112112
int info = 0;
113113
int naroc[2] = {0, 0};
114114

115-
double* Eii = new double[nband];
116-
ModuleBase::GlobalFunc::ZEROS(Eii, nband);
115+
assert(nband > 0);
116+
double* eii = new double[nband];
117+
ModuleBase::GlobalFunc::ZEROS(eii, nband);
118+
117119
for (int iprow = 0; iprow < pv->dim0; ++iprow)
118120
{
119121
for (int ipcol = 0; ipcol < pv->dim1; ++ipcol)
@@ -138,18 +140,18 @@ void compute_ekb(const Parallel_Orbitals* pv,
138140
}
139141
if (igcol == igrow)
140142
{
141-
Eii[igcol] = Eij[j * naroc[0] + i].real();
143+
eii[igcol] = eij[j * naroc[0] + i].real();
142144
}
143145
}
144146
}
145147
}
146148
} // loop ipcol
147149
} // loop iprow
148-
info = MPI_Allreduce(Eii, ekb, nband, MPI_DOUBLE, MPI_SUM, pv->comm());
150+
info = MPI_Allreduce(eii, ekb, nband, MPI_DOUBLE, MPI_SUM, pv->comm());
149151

150152
delete[] tmp1;
151-
delete[] Eij;
152-
delete[] Eii;
153+
delete[] eij;
154+
delete[] eii;
153155
}
154156

155157
void compute_ekb_tensor(const Parallel_Orbitals* pv,
@@ -160,12 +162,14 @@ void compute_ekb_tensor(const Parallel_Orbitals* pv,
160162
ct::Tensor& ekb,
161163
std::ofstream& ofs_running)
162164
{
165+
assert(pv->nloc_wfc > 0 && pv->nloc > 0);
166+
163167
// Create Tensor objects for temporary data
164168
ct::Tensor tmp1(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({pv->nloc_wfc}));
165169
tmp1.zero();
166170

167-
ct::Tensor Eij(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({pv->nloc}));
168-
Eij.zero();
171+
ct::Tensor eij(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({pv->nloc}));
172+
eij.zero();
169173

170174
// Perform matrix multiplication: tmp1 = Htmp * psi_k
171175
ScalapackConnector::gemm('N',
@@ -188,7 +192,7 @@ void compute_ekb_tensor(const Parallel_Orbitals* pv,
188192
1,
189193
pv->desc_wfc);
190194

191-
// Perform matrix multiplication: Eij = psi_k^dagger * tmp1
195+
// Perform matrix multiplication: eij = psi_k^dagger * tmp1
192196
ScalapackConnector::gemm('C',
193197
'N',
194198
nband,
@@ -204,7 +208,7 @@ void compute_ekb_tensor(const Parallel_Orbitals* pv,
204208
1,
205209
pv->desc_wfc,
206210
0.0,
207-
Eij.data<std::complex<double>>(),
211+
eij.data<std::complex<double>>(),
208212
1,
209213
1,
210214
pv->desc_Eij);
@@ -217,12 +221,11 @@ void compute_ekb_tensor(const Parallel_Orbitals* pv,
217221
ofs_running << " Eij:" << std::endl;
218222
for (int i = 0; i < pv->nrow_bands; i++)
219223
{
224+
const int in = i * pv->ncol;
220225
for (int j = 0; j < pv->ncol_bands; j++)
221226
{
222-
double aa = 0.0;
223-
double bb = 0.0;
224-
aa = Eij.data<std::complex<double>>()[i * pv->ncol + j].real();
225-
bb = Eij.data<std::complex<double>>()[i * pv->ncol + j].imag();
227+
double aa = eij.data<std::complex<double>>()[in + j].real();
228+
double bb = eij.data<std::complex<double>>()[in + j].imag();
226229
if (std::abs(aa) < PARAM.inp.td_print_eij)
227230
{
228231
aa = 0.0;
@@ -246,9 +249,10 @@ void compute_ekb_tensor(const Parallel_Orbitals* pv,
246249
int info = 0;
247250
int naroc[2] = {0, 0};
248251

249-
// Create a Tensor for Eii
250-
ct::Tensor Eii(ct::DataType::DT_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({nband}));
251-
Eii.zero();
252+
// Create a Tensor for eii
253+
assert(nband > 0);
254+
ct::Tensor eii(ct::DataType::DT_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({nband}));
255+
eii.zero();
252256

253257
for (int iprow = 0; iprow < pv->dim0; ++iprow)
254258
{
@@ -274,7 +278,7 @@ void compute_ekb_tensor(const Parallel_Orbitals* pv,
274278
}
275279
if (igcol == igrow)
276280
{
277-
Eii.data<double>()[igcol] = Eij.data<std::complex<double>>()[j * naroc[0] + i].real();
281+
eii.data<double>()[igcol] = eij.data<std::complex<double>>()[j * naroc[0] + i].real();
278282
}
279283
}
280284
}
@@ -283,7 +287,7 @@ void compute_ekb_tensor(const Parallel_Orbitals* pv,
283287
} // loop iprow
284288

285289
// Perform MPI reduction to compute ekb
286-
info = MPI_Allreduce(Eii.data<double>(), ekb.data<double>(), nband, MPI_DOUBLE, MPI_SUM, pv->comm());
290+
info = MPI_Allreduce(eii.data<double>(), ekb.data<double>(), nband, MPI_DOUBLE, MPI_SUM, pv->comm());
287291
}
288292

289293
template <typename Device>
@@ -306,11 +310,11 @@ void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv,
306310
ct::TensorShape({nlocal * nband})); // tmp1 shape: nlocal * nband
307311
tmp1.zero();
308312

309-
ct::Tensor Eij(ct::DataType::DT_COMPLEX_DOUBLE,
313+
ct::Tensor eij(ct::DataType::DT_COMPLEX_DOUBLE,
310314
ct_device_type,
311-
ct::TensorShape({nlocal * nlocal})); // Eij shape: nlocal * nlocal
315+
ct::TensorShape({nlocal * nlocal})); // eij shape: nlocal * nlocal
312316
// Why not use nband * nband ?????
313-
Eij.zero();
317+
eij.zero();
314318

315319
std::complex<double> alpha = {1.0, 0.0};
316320
std::complex<double> beta = {0.0, 0.0};
@@ -330,7 +334,7 @@ void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv,
330334
tmp1.data<std::complex<double>>(),
331335
nlocal); // Leading dimension of tmp1
332336

333-
// Perform matrix multiplication: Eij = psi_k^dagger * tmp1
337+
// Perform matrix multiplication: eij = psi_k^dagger * tmp1
334338
ct::kernels::blas_gemm<std::complex<double>, ct_Device>()('C',
335339
'N',
336340
nband,
@@ -342,25 +346,24 @@ void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv,
342346
tmp1.data<std::complex<double>>(),
343347
nlocal, // Leading dimension of tmp1
344348
&beta,
345-
Eij.data<std::complex<double>>(),
346-
nlocal); // Leading dimension of Eij
349+
eij.data<std::complex<double>>(),
350+
nlocal); // Leading dimension of eij
347351

348352
if (PARAM.inp.td_print_eij >= 0.0)
349353
{
350-
ct::Tensor Eij_cpu = Eij.to_device<ct::DEVICE_CPU>();
354+
ct::Tensor eij_cpu = eij.to_device<ct::DEVICE_CPU>();
351355

352356
ofs_running
353357
<< "------------------------------------------------------------------------------------------------"
354358
<< std::endl;
355359
ofs_running << " Eij:" << std::endl;
356360
for (int i = 0; i < nband; i++)
357361
{
362+
const int in = i * nlocal;
358363
for (int j = 0; j < nband; j++)
359364
{
360-
double aa = 0.0;
361-
double bb = 0.0;
362-
aa = Eij_cpu.data<std::complex<double>>()[i * nlocal + j].real();
363-
bb = Eij_cpu.data<std::complex<double>>()[i * nlocal + j].imag();
365+
double aa = eij_cpu.data<std::complex<double>>()[in + j].real();
366+
double bb = eij_cpu.data<std::complex<double>>()[in + j].imag();
364367
if (std::abs(aa) < PARAM.inp.td_print_eij)
365368
{
366369
aa = 0.0;
@@ -381,15 +384,15 @@ void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv,
381384
<< std::endl;
382385
}
383386

384-
// Extract diagonal elements of Eij into ekb
387+
// Extract diagonal elements of eij into ekb
385388
if (ct_device_type == ct::DeviceType::GpuDevice)
386389
{
387390
// GPU implementation
388391
for (int i = 0; i < nband; ++i)
389392
{
390393
base_device::memory::synchronize_memory_op<double, Device, Device>()(
391394
ekb.data<double>() + i,
392-
reinterpret_cast<const double*>(Eij.data<std::complex<double>>() + i * nlocal + i),
395+
reinterpret_cast<const double*>(eij.data<std::complex<double>>() + i * nlocal + i),
393396
1);
394397
}
395398
}
@@ -398,7 +401,7 @@ void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv,
398401
// CPU implementation
399402
for (int i = 0; i < nband; ++i)
400403
{
401-
ekb.data<double>()[i] = Eij.data<std::complex<double>>()[i * nlocal + i].real();
404+
ekb.data<double>()[i] = eij.data<std::complex<double>>()[i * nlocal + i].real();
402405
}
403406
}
404407
}

source/module_hamilt_lcao/module_tddft/middle_hamilt.cpp

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,10 @@ void half_Hmatrix(const Parallel_Orbitals* pv,
2929
ofs_running << " H(t+dt) :" << std::endl;
3030
for (int i = 0; i < pv->nrow; i++)
3131
{
32+
const int in = i * pv->ncol;
3233
for (int j = 0; j < pv->ncol; j++)
3334
{
34-
ofs_running << Htmp[i * pv->ncol + j].real() << "+" << Htmp[i * pv->ncol + j].imag() << "i ";
35+
ofs_running << Htmp[in + j].real() << "+" << Htmp[in + j].imag() << "i ";
3536
}
3637
ofs_running << std::endl;
3738
}
@@ -40,10 +41,10 @@ void half_Hmatrix(const Parallel_Orbitals* pv,
4041
ofs_running << " H(t):" << std::endl;
4142
for (int i = 0; i < pv->nrow; i++)
4243
{
44+
const int in = i * pv->ncol;
4345
for (int j = 0; j < pv->ncol; j++)
4446
{
45-
ofs_running << H_laststep[i * pv->ncol + j].real() << "+" << H_laststep[i * pv->ncol + j].imag()
46-
<< "i ";
47+
ofs_running << H_laststep[in + j].real() << "+" << H_laststep[in + j].imag() << "i ";
4748
}
4849
ofs_running << std::endl;
4950
}
@@ -61,9 +62,10 @@ void half_Hmatrix(const Parallel_Orbitals* pv,
6162
ofs_running << " H (t+dt/2) :" << std::endl;
6263
for (int i = 0; i < pv->nrow; i++)
6364
{
65+
const int in = i * pv->ncol;
6466
for (int j = 0; j < pv->ncol; j++)
6567
{
66-
ofs_running << Htmp[i * pv->ncol + j].real() << "+" << Htmp[i * pv->ncol + j].imag() << "i ";
68+
ofs_running << Htmp[in + j].real() << "+" << Htmp[in + j].imag() << "i ";
6769
}
6870
ofs_running << std::endl;
6971
}
@@ -88,10 +90,11 @@ void half_Hmatrix_tensor(const Parallel_Orbitals* pv,
8890
ofs_running << " H(t+dt) :" << std::endl;
8991
for (int i = 0; i < pv->nrow; i++)
9092
{
93+
const int in = i * pv->ncol;
9194
for (int j = 0; j < pv->ncol; j++)
9295
{
93-
ofs_running << Htmp.data<std::complex<double>>()[i * pv->ncol + j].real() << "+"
94-
<< Htmp.data<std::complex<double>>()[i * pv->ncol + j].imag() << "i ";
96+
ofs_running << Htmp.data<std::complex<double>>()[in + j].real() << "+"
97+
<< Htmp.data<std::complex<double>>()[in + j].imag() << "i ";
9598
}
9699
ofs_running << std::endl;
97100
}
@@ -100,10 +103,11 @@ void half_Hmatrix_tensor(const Parallel_Orbitals* pv,
100103
ofs_running << " H(t):" << std::endl;
101104
for (int i = 0; i < pv->nrow; i++)
102105
{
106+
const int in = i * pv->ncol;
103107
for (int j = 0; j < pv->ncol; j++)
104108
{
105-
ofs_running << H_laststep.data<std::complex<double>>()[i * pv->ncol + j].real() << "+"
106-
<< H_laststep.data<std::complex<double>>()[i * pv->ncol + j].imag() << "i ";
109+
ofs_running << H_laststep.data<std::complex<double>>()[in + j].real() << "+"
110+
<< H_laststep.data<std::complex<double>>()[in + j].imag() << "i ";
107111
}
108112
ofs_running << std::endl;
109113
}
@@ -149,10 +153,11 @@ void half_Hmatrix_tensor(const Parallel_Orbitals* pv,
149153
ofs_running << " H (t+dt/2) :" << std::endl;
150154
for (int i = 0; i < pv->nrow; i++)
151155
{
156+
const int in = i * pv->ncol;
152157
for (int j = 0; j < pv->ncol; j++)
153158
{
154-
ofs_running << Htmp.data<std::complex<double>>()[i * pv->ncol + j].real() << "+"
155-
<< Htmp.data<std::complex<double>>()[i * pv->ncol + j].imag() << "i ";
159+
ofs_running << Htmp.data<std::complex<double>>()[in + j].real() << "+"
160+
<< Htmp.data<std::complex<double>>()[in + j].imag() << "i ";
156161
}
157162
ofs_running << std::endl;
158163
}
@@ -186,10 +191,11 @@ void half_Hmatrix_tensor_lapack(const Parallel_Orbitals* pv,
186191
ofs_running << " H(t+dt) :" << std::endl;
187192
for (int i = 0; i < nlocal; i++)
188193
{
194+
const int in = i * nlocal;
189195
for (int j = 0; j < nlocal; j++)
190196
{
191-
ofs_running << Htmp_cpu.data<std::complex<double>>()[i * nlocal + j].real() << "+"
192-
<< Htmp_cpu.data<std::complex<double>>()[i * nlocal + j].imag() << "i ";
197+
ofs_running << Htmp_cpu.data<std::complex<double>>()[in + j].real() << "+"
198+
<< Htmp_cpu.data<std::complex<double>>()[in + j].imag() << "i ";
193199
}
194200
ofs_running << std::endl;
195201
}
@@ -198,10 +204,11 @@ void half_Hmatrix_tensor_lapack(const Parallel_Orbitals* pv,
198204
ofs_running << " H(t):" << std::endl;
199205
for (int i = 0; i < nlocal; i++)
200206
{
207+
const int in = i * nlocal;
201208
for (int j = 0; j < nlocal; j++)
202209
{
203-
ofs_running << H_laststep_cpu.data<std::complex<double>>()[i * nlocal + j].real() << "+"
204-
<< H_laststep_cpu.data<std::complex<double>>()[i * nlocal + j].imag() << "i ";
210+
ofs_running << H_laststep_cpu.data<std::complex<double>>()[in + j].real() << "+"
211+
<< H_laststep_cpu.data<std::complex<double>>()[in + j].imag() << "i ";
205212
}
206213
ofs_running << std::endl;
207214
}
@@ -246,10 +253,11 @@ void half_Hmatrix_tensor_lapack(const Parallel_Orbitals* pv,
246253
ofs_running << " H (t+dt/2) :" << std::endl;
247254
for (int i = 0; i < nlocal; i++)
248255
{
256+
const int in = i * nlocal;
249257
for (int j = 0; j < nlocal; j++)
250258
{
251-
ofs_running << Htmp_cpu.data<std::complex<double>>()[i * nlocal + j].real() << "+"
252-
<< Htmp_cpu.data<std::complex<double>>()[i * nlocal + j].imag() << "i ";
259+
ofs_running << Htmp_cpu.data<std::complex<double>>()[in + j].real() << "+"
260+
<< Htmp_cpu.data<std::complex<double>>()[in + j].imag() << "i ";
253261
}
254262
ofs_running << std::endl;
255263
}

0 commit comments

Comments
 (0)