Skip to content

Commit 79a48f0

Browse files
committed
add pw_basis_k C2C
1 parent 0f3bf0a commit 79a48f0

File tree

1 file changed

+222
-73
lines changed

1 file changed

+222
-73
lines changed

source/module_basis/module_pw/test_gpu/pw_basis_k_C2C.cpp

Lines changed: 222 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -3,40 +3,38 @@
33
//---------------------------------------------
44
#include "../pw_basis_k.h"
55
#ifdef __MPI
6-
#include "test_tool.h"
76
#include "module_base/parallel_global.h"
87
#include "mpi.h"
8+
#include "test_tool.h"
99
#endif
10+
#include "cuda_runtime.h"
1011
#include "module_base/constants.h"
1112
#include "module_base/global_function.h"
1213
#include "pw_test.h"
13-
#include "cuda_runtime.h"
1414
using namespace std;
15-
TEST_F(PWTEST,pw_basis_k_C2C_double)
15+
TEST_F(PWTEST, pw_basis_k_C2C_double)
1616
{
17-
cout<<"dividemthd 1, gamma_only: on, xprime: false, gamma kpoint, check fft"<<endl;
17+
cout << "dividemthd 1, gamma_only: on, xprime: false, gamma kpoint, check fft" << endl;
1818
ModulePW::PW_Basis_K pwtest("gpu", "double");
1919
ModuleBase::Matrix3 latvec(1, 0.3, 0, 0, 2, 0, 0, 0, 2);
20-
double wfcecut;
21-
double lat0= 2.7;
20+
double wfcecut =10;
21+
double lat0 = 2.7;
2222
bool gamma_only;
23-
ModuleBase::Vector3<double> *kvec_d;
24-
int nks;
25-
//--------------------------------------------------
26-
nks = 1;
23+
ModuleBase::Vector3<double>* kvec_d;
24+
int nks = 1;
2725
kvec_d = new ModuleBase::Vector3<double>[nks];
28-
kvec_d[0].set(0,0,0);
26+
kvec_d[0].set(0, 0, 0);
2927
wfcecut = 10;
30-
gamma_only = true;
28+
gamma_only = false;
3129
int distribution_type = 1;
3230
bool xprime = false;
3331
//--------------------------------------------------
3432
#ifdef __MPI
3533
pwtest.initmpi(nproc_in_pool, rank_in_pool, POOL_WORLD);
3634
#endif
37-
//init //real parameter
38-
pwtest.initgrids(lat0,latvec,4*wfcecut);
39-
pwtest.initparameters(gamma_only,wfcecut,nks,kvec_d,distribution_type, xprime);
35+
// init //real parameter
36+
pwtest.initgrids(lat0, latvec, 4 * wfcecut);
37+
pwtest.initparameters(gamma_only, wfcecut, nks, kvec_d, distribution_type, xprime);
4038
pwtest.setuptransform();
4139
pwtest.collect_local_pw();
4240

@@ -48,101 +46,252 @@ TEST_F(PWTEST,pw_basis_k_C2C_double)
4846
const int nplane = pwtest.nplane;
4947
const double tpiba2 = ModuleBase::TWO_PI * ModuleBase::TWO_PI / lat0 / lat0;
5048
const double ggecut = wfcecut / tpiba2;
51-
ModuleBase::Matrix3 GT,G,GGT;
49+
ModuleBase::Matrix3 GT, G, GGT;
5250
GT = latvec.Inverse();
53-
G = GT.Transpose();
54-
GGT = G * GT;
55-
complex<double> *tmp = new complex<double> [nx*ny*nz];
56-
double * rhor = new double [nrxx];
57-
for(int ik = 0; ik < nks; ++ik)
51+
G = GT.Transpose();
52+
GGT = G * GT;
53+
complex<double>* tmp = new complex<double>[nx * ny * nz];
54+
double* rhor = new double[nrxx];
55+
for (int ik = 0; ik < nks; ++ik)
5856
{
5957
int npwk = pwtest.npwk[ik];
60-
if(rank_in_pool == 0)
58+
if (rank_in_pool == 0)
6159
{
6260
ModuleBase::Vector3<double> kk = kvec_d[ik];
63-
for(int ix = 0 ; ix < nx ; ++ix)
61+
for (int ix = 0; ix < nx; ++ix)
6462
{
65-
for(int iy = 0 ; iy < ny ; ++iy)
63+
for (int iy = 0; iy < ny; ++iy)
6664
{
67-
for(int iz = 0 ; iz < nz ; ++iz)
65+
for (int iz = 0; iz < nz; ++iz)
6866
{
69-
tmp[ix*ny*nz + iy*nz + iz]=0.0;
70-
double vx = ix - int(nx/2);
71-
double vy = iy - int(ny/2);
72-
double vz = iz - int(nz/2);
73-
ModuleBase::Vector3<double> v(vx,vy,vz);
74-
// double modulus = v * (GGT * v);
75-
double modulusgk = (v+kk) * (GGT * (v+kk));
67+
tmp[ix * ny * nz + iy * nz + iz] = 0.0;
68+
double vx = ix - int(nx / 2);
69+
double vy = iy - int(ny / 2);
70+
double vz = iz - int(nz / 2);
71+
ModuleBase::Vector3<double> v(vx, vy, vz);
72+
double modulusgk = (v + kk) * (GGT * (v + kk));
7673
if (modulusgk <= ggecut)
7774
{
78-
tmp[ix*ny*nz + iy*nz + iz]=1.0/(modulusgk+1);
79-
if(vy > 0) tmp[ix*ny*nz + iy*nz + iz]+=ModuleBase::IMAG_UNIT / (std::abs(v.x+1) + 1);
80-
else if(vy < 0) tmp[ix*ny*nz + iy*nz + iz]-=ModuleBase::IMAG_UNIT / (std::abs(-v.x+1) + 1);
75+
tmp[ix * ny * nz + iy * nz + iz] = 1.0 / (modulusgk + 1);
76+
if (vy > 0)
77+
tmp[ix * ny * nz + iy * nz + iz] += ModuleBase::IMAG_UNIT / (std::abs(v.x + 1) + 1);
78+
else if (vy < 0)
79+
tmp[ix * ny * nz + iy * nz + iz] -= ModuleBase::IMAG_UNIT / (std::abs(-v.x + 1) + 1);
8180
}
8281
}
83-
}
82+
}
8483
}
85-
fftw_plan pp = fftw_plan_dft_3d(nx,ny,nz,(fftw_complex *) tmp, (fftw_complex *) tmp, FFTW_BACKWARD, FFTW_ESTIMATE);
86-
fftw_execute(pp);
87-
fftw_destroy_plan(pp);
84+
fftw_plan pp
85+
= fftw_plan_dft_3d(nx, ny, nz, (fftw_complex*)tmp, (fftw_complex*)tmp, FFTW_BACKWARD, FFTW_ESTIMATE);
86+
fftw_execute(pp);
87+
fftw_destroy_plan(pp);
8888

89-
ModuleBase::Vector3<double> delta_g(double(int(nx/2))/nx, double(int(ny/2))/ny, double(int(nz/2))/nz);
90-
for(int ixy = 0 ; ixy < nx * ny ; ++ixy)
89+
ModuleBase::Vector3<double> delta_g(double(int(nx / 2)) / nx,
90+
double(int(ny / 2)) / ny,
91+
double(int(nz / 2)) / nz);
92+
for (int ixy = 0; ixy < nx * ny; ++ixy)
9193
{
92-
for(int iz = 0 ; iz < nz ; ++iz)
94+
for (int iz = 0; iz < nz; ++iz)
9395
{
9496
int ix = ixy / ny;
9597
int iy = ixy % ny;
9698
ModuleBase::Vector3<double> real_r(ix, iy, iz);
9799
double phase_im = -delta_g * real_r;
98-
complex<double> phase(0,ModuleBase::TWO_PI * phase_im);
100+
complex<double> phase(0, ModuleBase::TWO_PI * phase_im);
99101
tmp[ixy * nz + iz] *= exp(phase);
100102
}
101103
}
102104
}
103105
#ifdef __MPI
104-
MPI_Bcast(tmp,2*nx*ny*nz,MPI_DOUBLE,0,POOL_WORLD);
106+
MPI_Bcast(tmp, 2 * nx * ny * nz, MPI_DOUBLE, 0, POOL_WORLD);
105107
#endif
106-
complex<double> * h_rhog = new complex<double> [npwk];
107-
complex<double> * h_rhor = new complex<double> [nrxx];
108-
for(int ig = 0 ; ig < npwk ; ++ig)
109-
{
110-
h_rhog[ig] = 1.0/(pwtest.getgk2(ik,ig)+1);
111-
ModuleBase::Vector3<double> f = pwtest.getgdirect(ik,ig);
112-
if(f.y > 0)
113-
{
114-
h_rhog[ig]+=ModuleBase::IMAG_UNIT / (std::abs(f.x+1) + 1);
115-
116-
}
117-
}
118-
108+
complex<double>* h_rhog = new complex<double>[npwk];
119109
complex<double>* h_rhogout = new complex<double>[npwk];
110+
complex<double>* h_rhor = new complex<double>[nrxx];
111+
for (int ig = 0; ig < npwk; ++ig)
112+
{
113+
h_rhog[ig] = 1.0 / (pwtest.getgk2(ik, ig) + 1);
114+
ModuleBase::Vector3<double> f = pwtest.getgdirect(ik, ig);
115+
if (f.y > 0)
116+
h_rhog[ig] += ModuleBase::IMAG_UNIT / (std::abs(f.x + 1) + 1);
117+
else if (f.y < 0)
118+
h_rhog[ig] -= ModuleBase::IMAG_UNIT / (std::abs(-f.x + 1) + 1);
119+
}
120120
complex<double>* d_rhog;
121121
complex<double>* d_rhor;
122-
complex<double>* d_rhogout;
123122
cudaMalloc((void**)&d_rhog, npwk * sizeof(complex<double>));
124-
cudaMalloc((void**)&d_rhor, npwk * sizeof(complex<double>));
125-
cudaMalloc((void**)&d_rhogout, npwk * sizeof(complex<double>));
126-
cudaMemcpy(d_rhog,h_rhog,npwk*sizeof(complex<double>),cudaMemcpyHostToDevice);
127-
pwtest.recip_to_real<std::complex<double>,std::complex<double>,base_device::DEVICE_GPU>(h_rhog,d_rhor,ik); //check out-of-place transform
128-
cudaMemcpy(h_rhor,d_rhor,nrxx*sizeof(complex<double>),cudaMemcpyHostToDevice);
123+
cudaMalloc((void**)&d_rhor, nrxx * sizeof(complex<double>));
124+
cudaMemcpy(d_rhog, h_rhog, npwk * sizeof(complex<double>), cudaMemcpyHostToDevice);
125+
pwtest.recip_to_real<std::complex<double>, base_device::DEVICE_GPU>(d_rhog,
126+
d_rhor,
127+
ik);
128+
cudaMemcpy(h_rhor, d_rhor, nrxx * sizeof(complex<double>), cudaMemcpyDeviceToHost);
129129
int startiz = pwtest.startz_current;
130-
for(int ixy = 0 ; ixy < nx * ny ; ++ixy)
130+
for (int ixy = 0; ixy < nx * ny; ++ixy)
131131
{
132-
for(int iz = 0 ; iz < nplane ; ++iz)
132+
for (int iz = 0; iz < nplane; ++iz)
133133
{
134-
EXPECT_NEAR(tmp[ixy * nz + startiz + iz].real(),h_rhor[ixy*nplane+iz].real(),1e-6);
134+
EXPECT_NEAR(tmp[ixy * nz + startiz + iz].real(), h_rhor[ixy * nplane + iz].real(), 1e-6);
135+
EXPECT_NEAR(tmp[ixy * nz + startiz + iz].imag(), h_rhor[ixy * nplane + iz].imag(), 1e-6);
135136
}
136137
}
137138

138-
delete [] h_rhog;
139-
delete [] h_rhor;
140-
139+
pwtest.real_to_recip<std::complex<double>,base_device::DEVICE_GPU>(d_rhor,d_rhog,ik);
140+
cudaMemcpy(h_rhogout,d_rhog,npwk * sizeof(complex<double>),cudaMemcpyDeviceToHost);
141+
for (int ig = 0; ig < npwk; ++ig)
142+
{
143+
EXPECT_NEAR(h_rhog[ig].real(), h_rhogout[ig].real(), 1e-6);
144+
EXPECT_NEAR(h_rhog[ig].imag(), h_rhogout[ig].imag(), 1e-6);
145+
}
146+
delete[] h_rhog;
147+
delete[] h_rhor;
148+
delete[] h_rhogout;
149+
cudaFree(d_rhor);
150+
cudaFree(d_rhog);
151+
}
152+
delete[] tmp;
153+
delete[] rhor;
154+
delete[] kvec_d;
155+
fftw_cleanup();
156+
}
141157

158+
TEST_F(PWTEST, pw_basis_k_C2C_float)
159+
{
160+
cout << "dividemthd 1, gamma_only: on, xprime: false, gamma kpoint, check fft" << endl;
161+
ModulePW::PW_Basis_K pwtest("gpu", "single");
162+
ModuleBase::Matrix3 latvec(1, 0.3, 0, 0, 2, 0, 0, 0, 2);
163+
double wfcecut =10;
164+
double lat0 = 2.7;
165+
bool gamma_only;
166+
ModuleBase::Vector3<double>* kvec_d;
167+
int nks = 1;
168+
kvec_d = new ModuleBase::Vector3<double>[nks];
169+
kvec_d[0].set(0, 0, 0);
170+
wfcecut = 10;
171+
gamma_only = false;
172+
int distribution_type = 1;
173+
bool xprime = false;
174+
//--------------------------------------------------
175+
#ifdef __MPI
176+
pwtest.initmpi(nproc_in_pool, rank_in_pool, POOL_WORLD);
177+
#endif
178+
// init //real parameter
179+
pwtest.initgrids(lat0, latvec, 4 * wfcecut);
180+
pwtest.initparameters(gamma_only, wfcecut, nks, kvec_d, distribution_type, xprime);
181+
pwtest.setuptransform();
182+
pwtest.collect_local_pw();
183+
184+
const int nrxx = pwtest.nrxx;
185+
const int nmaxgr = pwtest.nmaxgr;
186+
const int nx = pwtest.nx;
187+
const int ny = pwtest.ny;
188+
const int nz = pwtest.nz;
189+
const int nplane = pwtest.nplane;
190+
const float tpiba2 = ModuleBase::TWO_PI * ModuleBase::TWO_PI / lat0 / lat0;
191+
const float ggecut = wfcecut / tpiba2;
192+
ModuleBase::Matrix3 GT, G, GGT;
193+
GT = latvec.Inverse();
194+
G = GT.Transpose();
195+
GGT = G * GT;
196+
complex<float>* tmp = new complex<float>[nx * ny * nz];
197+
for (int ik = 0; ik < nks; ++ik)
198+
{
199+
int npwk = pwtest.npwk[ik];
200+
if (rank_in_pool == 0)
201+
{
202+
ModuleBase::Vector3<double> kk = kvec_d[ik];
203+
for (int ix = 0; ix < nx; ++ix)
204+
{
205+
for (int iy = 0; iy < ny; ++iy)
206+
{
207+
for (int iz = 0; iz < nz; ++iz)
208+
{
209+
tmp[ix * ny * nz + iy * nz + iz] = 0.0;
210+
double vx = ix - int(nx / 2);
211+
double vy = iy - int(ny / 2);
212+
double vz = iz - int(nz / 2);
213+
ModuleBase::Vector3<double> v(vx, vy, vz);
214+
float modulusgk = float((v + kk) * (GGT * (v + kk)));
215+
if (modulusgk <= ggecut)
216+
{
217+
tmp[ix * ny * nz + iy * nz + iz] = float(1.0 / (modulusgk + 1));
218+
if (vy > 0)
219+
tmp[ix * ny * nz + iy * nz + iz] += std::complex<float>(0,1.0) / (std::abs(float(v.x) + 1) + 1);
220+
else if (vy < 0)
221+
tmp[ix * ny * nz + iy * nz + iz] -= std::complex<float>(0,1.0) / (std::abs(float(-v.x) + 1) + 1);
222+
}
223+
}
224+
}
225+
}
226+
fftwf_plan pp
227+
= fftwf_plan_dft_3d(nx, ny, nz, (fftwf_complex*)tmp, (fftwf_complex*)tmp, FFTW_BACKWARD, FFTW_ESTIMATE);
228+
fftwf_execute(pp);
229+
fftwf_destroy_plan(pp);
230+
231+
ModuleBase::Vector3<float> delta_g(float(int(nx / 2)) / nx,
232+
float(int(ny / 2)) / ny,
233+
float(int(nz / 2)) / nz);
234+
for (int ixy = 0; ixy < nx * ny; ++ixy)
235+
{
236+
for (int iz = 0; iz < nz; ++iz)
237+
{
238+
int ix = ixy / ny;
239+
int iy = ixy % ny;
240+
ModuleBase::Vector3<float> real_r(ix, iy, iz);
241+
float phase_im = -delta_g * real_r;
242+
complex<float> phase(0, ModuleBase::TWO_PI * phase_im);
243+
tmp[ixy * nz + iz] *= exp(phase);
244+
}
245+
}
246+
}
247+
#ifdef __MPI
248+
MPI_Bcast(tmp, 2 * nx * ny * nz, MPI_DOUBLE, 0, POOL_WORLD);
249+
#endif
250+
complex<float>* h_rhog = new complex<float>[npwk];
251+
complex<float>* h_rhogout = new complex<float>[npwk];
252+
complex<float>* h_rhor = new complex<float>[nrxx];
253+
for (int ig = 0; ig < npwk; ++ig)
254+
{
255+
h_rhog[ig] = float(1.0 / (pwtest.getgk2(ik, ig) + 1));
256+
ModuleBase::Vector3<double> f = pwtest.getgdirect(ik, ig);
257+
if (f.y > 0)
258+
h_rhog[ig] += std::complex<float>(0,1.0) / (std::abs(float(f.x) + 1) + 1);
259+
else if (f.y < 0)
260+
h_rhog[ig] -= std::complex<float>(0,1.0) / (std::abs(float(-f.x) + 1) + 1);
261+
}
262+
complex<float>* d_rhog;
263+
complex<float>* d_rhor;
264+
cudaMalloc((void**)&d_rhog, npwk * sizeof(complex<float>));
265+
cudaMalloc((void**)&d_rhor, nrxx * sizeof(complex<float>));
266+
cudaMemcpy(d_rhog, h_rhog, npwk * sizeof(complex<float>), cudaMemcpyHostToDevice);
267+
pwtest.recip_to_real<std::complex<float>, base_device::DEVICE_GPU>(d_rhog,
268+
d_rhor,
269+
ik);
270+
cudaMemcpy(h_rhor, d_rhor, nrxx * sizeof(complex<float>), cudaMemcpyDeviceToHost);
271+
int startiz = pwtest.startz_current;
272+
for (int ixy = 0; ixy < nx * ny; ++ixy)
273+
{
274+
for (int iz = 0; iz < nplane; ++iz)
275+
{
276+
EXPECT_NEAR(tmp[ixy * nz + startiz + iz].real(), h_rhor[ixy * nplane + iz].real(), 1e-4);
277+
EXPECT_NEAR(tmp[ixy * nz + startiz + iz].imag(), h_rhor[ixy * nplane + iz].imag(), 1e-4);
278+
}
279+
}
280+
281+
pwtest.real_to_recip<std::complex<float>,base_device::DEVICE_GPU>(d_rhor,d_rhog,ik);
282+
cudaMemcpy(h_rhogout,d_rhog,npwk * sizeof(complex<float>),cudaMemcpyDeviceToHost);
283+
for (int ig = 0; ig < npwk; ++ig)
284+
{
285+
EXPECT_NEAR(h_rhog[ig].real(), h_rhogout[ig].real(), 1e-4);
286+
EXPECT_NEAR(h_rhog[ig].imag(), h_rhogout[ig].imag(), 1e-4);
287+
}
288+
delete[] h_rhog;
289+
delete[] h_rhor;
290+
delete[] h_rhogout;
291+
cudaFree(d_rhor);
292+
cudaFree(d_rhog);
142293
}
143-
delete []tmp;
144-
delete [] rhor;
294+
delete[] tmp;
145295
delete[] kvec_d;
146-
delete[] rhogr;
147296
fftw_cleanup();
148-
}
297+
}

0 commit comments

Comments
 (0)