Skip to content

Commit e918b93

Browse files
committed
expol: bug fix in cuda code
1 parent 2896281 commit e918b93

File tree

4 files changed

+88
-94
lines changed

4 files changed

+88
-94
lines changed

ext/ext/yaml/alterpol_cu1.yaml

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -68,22 +68,22 @@ FULL_PAIRWISE_INTERACTION: |
6868
real ks2i[3][3], ks2k[3][3];
6969
pair_alterpol(scrtyp, r, scaleb, cut, off, xr, yr, zr, @springi@, @sizi@, @alphai@,
7070
springk, sizk, alphak, ks2i, ks2k);
71-
@psci00@ = ks2i[0][0];
72-
@psci01@ = ks2i[0][1];
73-
@psci02@ = ks2i[0][2];
74-
@psci10@ = ks2i[1][0];
75-
@psci11@ = ks2i[1][1];
76-
@psci12@ = ks2i[1][2];
77-
@psci20@ = ks2i[2][0];
78-
@psci21@ = ks2i[2][1];
79-
@psci22@ = ks2i[2][2];
80-
psck00 = ks2k[0][0];
81-
psck01 = ks2k[0][1];
82-
psck02 = ks2k[0][2];
83-
psck10 = ks2k[1][0];
84-
psck11 = ks2k[1][1];
85-
psck12 = ks2k[1][2];
86-
psck20 = ks2k[2][0];
87-
psck21 = ks2k[2][1];
88-
psck22 = ks2k[2][2];
71+
@psci00@ += ks2i[0][0];
72+
@psci01@ += ks2i[0][1];
73+
@psci02@ += ks2i[0][2];
74+
@psci10@ += ks2i[1][0];
75+
@psci11@ += ks2i[1][1];
76+
@psci12@ += ks2i[1][2];
77+
@psci20@ += ks2i[2][0];
78+
@psci21@ += ks2i[2][1];
79+
@psci22@ += ks2i[2][2];
80+
psck00 += ks2k[0][0];
81+
psck01 += ks2k[0][1];
82+
psck02 += ks2k[0][2];
83+
psck10 += ks2k[1][0];
84+
psck11 += ks2k[1][1];
85+
psck12 += ks2k[1][2];
86+
psck20 += ks2k[2][0];
87+
psck21 += ks2k[2][1];
88+
psck22 += ks2k[2][2];
8989
}

include/seq/pair_alterpol.h

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,11 @@ inline void damp_expl(ExpolScr scrtyp, real& restrict s2, real& restrict ds2, re
2323
expik = REAL_EXP(-dampik);
2424
s = (one + dampik + dampik2 * inv3) * expik;
2525
s2 = s * s;
26-
if (DO_G)
27-
ds2 = s * (-alphaik * inv3) * (dampik + dampik2) * expik;
26+
if (DO_G) ds2 = s * (-alphaik * inv3) * (dampik + dampik2) * expik;
2827
}
2928

3029
s2 = sizik * s2;
31-
if (DO_G)
32-
ds2 = sizik * ds2;
30+
if (DO_G) ds2 = sizik * ds2;
3331
}
3432

3533
SEQ_ROUTINE
@@ -136,8 +134,8 @@ inline void pair_dexpol(ExpolScr scrtyp, real r, real pscale, real cut, real off
136134
real ukxl = -(ukx * ak[0][0] + uky * ak[1][0] + ukz * ak[2][0]);
137135
real ukyl = -(ukx * ak[0][1] + uky * ak[1][1] + ukz * ak[2][1]);
138136
real ukzl = -(ukx * ak[0][2] + uky * ak[1][2] + ukz * ak[2][2]);
139-
frcil[2] = REAL_POW(uizl, 2) * ds2i;
140-
frckl[2] = REAL_POW(ukzl, 2) * ds2k;
137+
frcil[2] = uizl * uizl * ds2i;
138+
frckl[2] = ukzl * ukzl * ds2k;
141139
// local frame torque
142140
constexpr real two = 2.;
143141
real tqxil = two * uiyl * uizl * s2i;

src/cu/hippo/expol.cu

Lines changed: 65 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -111,24 +111,24 @@ static void alterpol_cu1(int n, TINKER_IMAGE_PARAMS, real cut, real off,
111111
real ks2i[3][3], ks2k[3][3];
112112
pair_alterpol(scrtyp, r, scaleb, cut, off, xr, yr, zr, springi[klane], sizi[klane],
113113
alphai[klane], springk, sizk, alphak, ks2i, ks2k);
114-
psci00[klane] = ks2i[0][0];
115-
psci01[klane] = ks2i[0][1];
116-
psci02[klane] = ks2i[0][2];
117-
psci10[klane] = ks2i[1][0];
118-
psci11[klane] = ks2i[1][1];
119-
psci12[klane] = ks2i[1][2];
120-
psci20[klane] = ks2i[2][0];
121-
psci21[klane] = ks2i[2][1];
122-
psci22[klane] = ks2i[2][2];
123-
psck00 = ks2k[0][0];
124-
psck01 = ks2k[0][1];
125-
psck02 = ks2k[0][2];
126-
psck10 = ks2k[1][0];
127-
psck11 = ks2k[1][1];
128-
psck12 = ks2k[1][2];
129-
psck20 = ks2k[2][0];
130-
psck21 = ks2k[2][1];
131-
psck22 = ks2k[2][2];
114+
psci00[klane] += ks2i[0][0];
115+
psci01[klane] += ks2i[0][1];
116+
psci02[klane] += ks2i[0][2];
117+
psci10[klane] += ks2i[1][0];
118+
psci11[klane] += ks2i[1][1];
119+
psci12[klane] += ks2i[1][2];
120+
psci20[klane] += ks2i[2][0];
121+
psci21[klane] += ks2i[2][1];
122+
psci22[klane] += ks2i[2][2];
123+
psck00 += ks2k[0][0];
124+
psck01 += ks2k[0][1];
125+
psck02 += ks2k[0][2];
126+
psck10 += ks2k[1][0];
127+
psck11 += ks2k[1][1];
128+
psck12 += ks2k[1][2];
129+
psck20 += ks2k[2][0];
130+
psck21 += ks2k[2][1];
131+
psck22 += ks2k[2][2];
132132
}
133133

134134
atomic_add(psci00[threadIdx.x], &polscale[i][0]);
@@ -215,24 +215,24 @@ static void alterpol_cu1(int n, TINKER_IMAGE_PARAMS, real cut, real off,
215215
real ks2i[3][3], ks2k[3][3];
216216
pair_alterpol(scrtyp, r, scaleb, cut, off, xr, yr, zr, springi[klane], sizi[klane],
217217
alphai[klane], springk, sizk, alphak, ks2i, ks2k);
218-
psci00[klane] = ks2i[0][0];
219-
psci01[klane] = ks2i[0][1];
220-
psci02[klane] = ks2i[0][2];
221-
psci10[klane] = ks2i[1][0];
222-
psci11[klane] = ks2i[1][1];
223-
psci12[klane] = ks2i[1][2];
224-
psci20[klane] = ks2i[2][0];
225-
psci21[klane] = ks2i[2][1];
226-
psci22[klane] = ks2i[2][2];
227-
psck00 = ks2k[0][0];
228-
psck01 = ks2k[0][1];
229-
psck02 = ks2k[0][2];
230-
psck10 = ks2k[1][0];
231-
psck11 = ks2k[1][1];
232-
psck12 = ks2k[1][2];
233-
psck20 = ks2k[2][0];
234-
psck21 = ks2k[2][1];
235-
psck22 = ks2k[2][2];
218+
psci00[klane] += ks2i[0][0];
219+
psci01[klane] += ks2i[0][1];
220+
psci02[klane] += ks2i[0][2];
221+
psci10[klane] += ks2i[1][0];
222+
psci11[klane] += ks2i[1][1];
223+
psci12[klane] += ks2i[1][2];
224+
psci20[klane] += ks2i[2][0];
225+
psci21[klane] += ks2i[2][1];
226+
psci22[klane] += ks2i[2][2];
227+
psck00 += ks2k[0][0];
228+
psck01 += ks2k[0][1];
229+
psck02 += ks2k[0][2];
230+
psck10 += ks2k[1][0];
231+
psck11 += ks2k[1][1];
232+
psck12 += ks2k[1][2];
233+
psck20 += ks2k[2][0];
234+
psck21 += ks2k[2][1];
235+
psck22 += ks2k[2][2];
236236
}
237237

238238
iid = __shfl_sync(ALL_LANES, iid, ilane + 1);
@@ -313,24 +313,24 @@ static void alterpol_cu1(int n, TINKER_IMAGE_PARAMS, real cut, real off,
313313
real ks2i[3][3], ks2k[3][3];
314314
pair_alterpol(scrtyp, r, scaleb, cut, off, xr, yr, zr, springi[klane], sizi[klane],
315315
alphai[klane], springk, sizk, alphak, ks2i, ks2k);
316-
psci00[klane] = ks2i[0][0];
317-
psci01[klane] = ks2i[0][1];
318-
psci02[klane] = ks2i[0][2];
319-
psci10[klane] = ks2i[1][0];
320-
psci11[klane] = ks2i[1][1];
321-
psci12[klane] = ks2i[1][2];
322-
psci20[klane] = ks2i[2][0];
323-
psci21[klane] = ks2i[2][1];
324-
psci22[klane] = ks2i[2][2];
325-
psck00 = ks2k[0][0];
326-
psck01 = ks2k[0][1];
327-
psck02 = ks2k[0][2];
328-
psck10 = ks2k[1][0];
329-
psck11 = ks2k[1][1];
330-
psck12 = ks2k[1][2];
331-
psck20 = ks2k[2][0];
332-
psck21 = ks2k[2][1];
333-
psck22 = ks2k[2][2];
316+
psci00[klane] += ks2i[0][0];
317+
psci01[klane] += ks2i[0][1];
318+
psci02[klane] += ks2i[0][2];
319+
psci10[klane] += ks2i[1][0];
320+
psci11[klane] += ks2i[1][1];
321+
psci12[klane] += ks2i[1][2];
322+
psci20[klane] += ks2i[2][0];
323+
psci21[klane] += ks2i[2][1];
324+
psci22[klane] += ks2i[2][2];
325+
psck00 += ks2k[0][0];
326+
psck01 += ks2k[0][1];
327+
psck02 += ks2k[0][2];
328+
psck10 += ks2k[1][0];
329+
psck11 += ks2k[1][1];
330+
psck12 += ks2k[1][2];
331+
psck20 += ks2k[2][0];
332+
psck21 += ks2k[2][1];
333+
psck22 += ks2k[2][2];
334334
}
335335
}
336336

@@ -861,8 +861,7 @@ void eppcgP5(int n, const real* restrict polarity, //
861861
{
862862
real kaval = *ka;
863863
real a = *ksum / kaval;
864-
if (kaval == 0)
865-
a = 0;
864+
if (kaval == 0) a = 0;
866865
for (int i = ITHREAD; i < n; i += STRIDE) {
867866
#pragma unroll
868867
for (int j = 0; j < 3; ++j) {
@@ -883,8 +882,7 @@ void eppcgP6(int n, const real* restrict ksum, const real* restrict ksum1, real
883882
{
884883
real ksumval = *ksum;
885884
real b = *ksum1 / ksumval;
886-
if (ksumval == 0)
887-
b = 0;
885+
if (ksumval == 0) b = 0;
888886
for (int i = ITHREAD; i < n; i += STRIDE) {
889887
#pragma unroll
890888
for (int j = 0; j < 3; ++j)
@@ -968,11 +966,12 @@ void induceMutualPcg4_cu(real (*uind)[3])
968966
const real debye = units::debye;
969967
const real pcgpeek = polpcg::pcgpeek;
970968
const int maxiter = 100; // see also subroutine induce0a in induce.f
969+
const int miniter = std::min(3, n);
971970

972971
bool done = false;
973972
int iter = 0;
974973
real eps = 100;
975-
real epsold;
974+
// real epsold;
976975

977976
while (not done) {
978977
++iter;
@@ -1013,7 +1012,7 @@ void induceMutualPcg4_cu(real (*uind)[3])
10131012
check_rt(
10141013
cudaMemcpyAsync((real*)pinned_buf, epsd, sizeof(real), cudaMemcpyDeviceToHost, g::s0));
10151014
check_rt(cudaStreamSynchronize(g::s0));
1016-
epsold = eps;
1015+
// epsold = eps;
10171016
eps = ((real*)pinned_buf)[0];
10181017
eps = debye * REAL_SQRT(eps / n);
10191018

@@ -1026,16 +1025,13 @@ void induceMutualPcg4_cu(real (*uind)[3])
10261025
print(stdout, " %8d %-16.10f\n", iter, eps);
10271026
}
10281027

1029-
if (eps < poleps)
1030-
done = true;
1031-
if (eps > epsold)
1032-
done = true;
1033-
if (iter >= politer)
1034-
done = true;
1028+
if (eps < poleps) done = true;
1029+
// if (eps > epsold) done = true;
1030+
if (iter < miniter) done = false;
1031+
if (iter >= politer) done = true;
10351032

10361033
// apply a "peek" iteration to the mutual induced dipoles
1037-
if (done)
1038-
launch_k1s(g::s0, n, eppcgPeek1, n, pcgpeek, polarity, uind, rsd);
1034+
if (done) launch_k1s(g::s0, n, eppcgPeek1, n, pcgpeek, polarity, uind, rsd);
10391035
}
10401036

10411037
// print the results from the conjugate gradient iteration
@@ -1047,7 +1043,7 @@ void induceMutualPcg4_cu(real (*uind)[3])
10471043
}
10481044

10491045
// terminate the calculation if dipoles failed to converge
1050-
if (iter >= maxiter || eps > epsold) {
1046+
if (iter >= maxiter) {
10511047
printError();
10521048
TINKER_THROW("INDUCE -- Warning, Induced Dipoles are not Converged");
10531049
}

test/expol.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ TEST_CASE("ExchangePolarization-3", "[ff][hippo][expol]")
129129
int argc = 4;
130130

131131
const double eps_e = testGetEps(0.0001, 0.0001);
132-
const double eps_g = testGetEps(0.0001, 0.0001);
132+
const double eps_g = testGetEps(0.0002, 0.0001);
133133
const double eps_v = testGetEps(0.001, 0.001);
134134

135135
TestReference r(TINKER9_DIRSTR "/test/ref/expol.3.txt");

0 commit comments

Comments
 (0)