Skip to content

Commit 0f3bf0a

Browse files
committed
set C2C and C2R
1 parent 2fdecfa commit 0f3bf0a

File tree

3 files changed

+306
-33
lines changed

3 files changed

+306
-33
lines changed

source/module_basis/module_pw/pw_transform_gpu.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ void PW_Basis::real2recip_gpu(const std::complex<FPTYPE>* in,
4040
base_device::DEVICE_GPU>()(this->fft_bundle.get_auxr_3d_data<FPTYPE>(),
4141
in,
4242
this->nrxx);
43-
4443
this->fft_bundle.fft3D_forward(this->fft_bundle.get_auxr_3d_data<FPTYPE>(),
4544
this->fft_bundle.get_auxr_3d_data<FPTYPE>());
4645

Lines changed: 299 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,299 @@
1+
//---------------------------------------------
2+
// TEST for CUFFT
3+
//---------------------------------------------
4+
#include "../pw_basis.h"
5+
#ifdef __MPI
6+
#include "module_base/parallel_global.h"
7+
#include "mpi.h"
8+
#include "test_tool.h"
9+
#endif
10+
#include "cuda_runtime.h"
11+
#include "fftw3.h"
12+
#include "module_base/constants.h"
13+
#include "module_base/global_function.h"
14+
#include "pw_test.h"
15+
16+
using namespace std;
17+
TEST_F(PWTEST, recip_to_real_C2C_double)
18+
{
19+
cout << "dividemthd 1, gamma_only: off, check fft between double and complex" << endl;
20+
ModulePW::PW_Basis pwtest("gpu", precision_flag);
21+
pwtest.fft_bundle.setfft("gpu", "double");
22+
ModuleBase::Matrix3 latvec(1, 1, 0, 0, 1, 1, 0, 0, 2);
23+
double wfcecut;
24+
double lat0 = 2.2;
25+
bool gamma_only = false;
26+
wfcecut = 18;
27+
gamma_only = false;
28+
int distribution_type = 1;
29+
bool xprime = false;
30+
31+
// init
32+
#ifdef __MPI
33+
pwtest.initmpi(nproc_in_pool, rank_in_pool, POOL_WORLD);
34+
#endif
35+
pwtest.initgrids(lat0, latvec, wfcecut);
36+
pwtest.initparameters(gamma_only, wfcecut, distribution_type, xprime);
37+
pwtest.setuptransform();
38+
pwtest.collect_local_pw();
39+
40+
const int npw = pwtest.npw;
41+
const int nrxx = pwtest.nrxx;
42+
const int nmaxgr = pwtest.nmaxgr;
43+
const int nx = pwtest.nx;
44+
const int ny = pwtest.ny;
45+
const int nz = pwtest.nz;
46+
const int nplane = pwtest.nplane;
47+
48+
const double tpiba2 = ModuleBase::TWO_PI * ModuleBase::TWO_PI / lat0 / lat0;
49+
const double ggecut = wfcecut / tpiba2;
50+
ModuleBase::Matrix3 GT, G, GGT;
51+
GT = latvec.Inverse();
52+
G = GT.Transpose();
53+
GGT = G * GT;
54+
complex<double>* tmp = new complex<double>[nx * ny * nz];
55+
if (rank_in_pool == 0)
56+
{
57+
for (int ix = 0; ix < nx; ++ix)
58+
{
59+
for (int iy = 0; iy < ny; ++iy)
60+
{
61+
for (int iz = 0; iz < nz; ++iz)
62+
{
63+
tmp[ix * ny * nz + iy * nz + iz] = 0.0;
64+
double vx = ix - int(nx / 2);
65+
double vy = iy - int(ny / 2);
66+
double vz = iz - int(nz / 2);
67+
ModuleBase::Vector3<double> v(vx, vy, vz);
68+
double modulus = v * (GGT * v);
69+
if (modulus <= ggecut)
70+
{
71+
tmp[ix * ny * nz + iy * nz + iz] = 1.0 / (modulus + 1);
72+
if (vy > 0)
73+
tmp[ix * ny * nz + iy * nz + iz] += ModuleBase::IMAG_UNIT / (std::abs(v.x + 1) + 1);
74+
else if (vy < 0)
75+
tmp[ix * ny * nz + iy * nz + iz] -= ModuleBase::IMAG_UNIT / (std::abs(-v.x + 1) + 1);
76+
}
77+
}
78+
}
79+
}
80+
fftw_plan pp
81+
= fftw_plan_dft_3d(nx, ny, nz, (fftw_complex*)tmp, (fftw_complex*)tmp, FFTW_BACKWARD, FFTW_ESTIMATE);
82+
fftw_execute(pp);
83+
fftw_destroy_plan(pp);
84+
85+
ModuleBase::Vector3<double> delta_g(double(int(nx / 2)) / nx,
86+
double(int(ny / 2)) / ny,
87+
double(int(nz / 2)) / nz);
88+
for (int ixy = 0; ixy < nx * ny; ++ixy)
89+
{
90+
for (int iz = 0; iz < nz; ++iz)
91+
{
92+
int ix = ixy / ny;
93+
int iy = ixy % ny;
94+
ModuleBase::Vector3<double> real_r(ix, iy, iz);
95+
double phase_im = -delta_g * real_r;
96+
complex<double> phase(0, ModuleBase::TWO_PI * phase_im);
97+
tmp[ixy * nz + iz] *= exp(phase);
98+
}
99+
}
100+
}
101+
#ifdef __MPI
102+
MPI_Bcast(tmp, 2 * nx * ny * nz, MPI_DOUBLE, 0, POOL_WORLD);
103+
#endif
104+
// const int size = nx * ny * nz;
105+
complex<double>* h_rhog = new complex<double>[npw];
106+
complex<double>* h_rhogout = new complex<double>[npw];
107+
complex<double>* d_rhog;
108+
complex<double>* d_rhogr;
109+
complex<double>* d_rhogout;
110+
cudaMalloc((void**)&d_rhog, npw * sizeof(complex<double>));
111+
cudaMalloc((void**)&d_rhogr, npw * sizeof(complex<double>));
112+
cudaMalloc((void**)&d_rhogout, npw * sizeof(complex<double>));
113+
114+
for (int ig = 0; ig < npw; ++ig)
115+
{
116+
h_rhog[ig] = 1.0 / (pwtest.gg[ig] + 1);
117+
if (pwtest.gdirect[ig].y > 0)
118+
{
119+
h_rhog[ig] += ModuleBase::IMAG_UNIT / (std::abs(pwtest.gdirect[ig].x + 1) + 1);
120+
}
121+
else if (pwtest.gdirect[ig].y < 0)
122+
{
123+
h_rhog[ig] -= ModuleBase::IMAG_UNIT / (std::abs(-pwtest.gdirect[ig].x + 1) + 1);
124+
}
125+
}
126+
cudaMemcpy(d_rhog, h_rhog, npw * sizeof(complex<double>), cudaMemcpyHostToDevice);
127+
128+
std::complex<double>* h_rhor = new std::complex<double>[nrxx];
129+
std::complex<double>* d_rhor;
130+
cudaMalloc((void**)&d_rhor, nrxx * sizeof(std::complex<double>));
131+
pwtest.recip_to_real<std::complex<double>, std::complex<double>, base_device::DEVICE_GPU>(d_rhog, d_rhor);
132+
cudaMemcpy(h_rhor, d_rhor, nrxx * sizeof(std::complex<double>), cudaMemcpyDeviceToHost);
133+
134+
int startiz = pwtest.startz_current;
135+
for (int ixy = 0; ixy < nx * ny; ++ixy)
136+
{
137+
for (int iz = 0; iz < nplane; ++iz)
138+
{
139+
EXPECT_NEAR(tmp[ixy * nz + startiz + iz].real(), h_rhor[ixy * nplane + iz].real(), 1e-6);
140+
EXPECT_NEAR(tmp[ixy * nz + startiz + iz].imag(), h_rhor[ixy * nplane + iz].imag(), 1e-6);
141+
}
142+
}
143+
144+
pwtest.real_to_recip<std::complex<double>,std::complex<double>,base_device::DEVICE_GPU>(d_rhor,d_rhog);
145+
cudaMemcpy(h_rhogout,d_rhog,npw * sizeof(complex<double>),cudaMemcpyDeviceToHost);
146+
for (int ig = 0; ig < npw; ++ig)
147+
{
148+
EXPECT_NEAR(h_rhog[ig].real(), h_rhogout[ig].real(), 1e-6);
149+
EXPECT_NEAR(h_rhog[ig].imag(), h_rhogout[ig].imag(), 1e-6);
150+
}
151+
delete[] h_rhog;
152+
delete[] h_rhogout;
153+
delete[] h_rhor;
154+
delete[] tmp;
155+
cudaFree(d_rhog);
156+
cudaFree(d_rhogr);
157+
cudaFree(d_rhogout);
158+
cudaFree(d_rhor);
159+
}
160+
161+
TEST_F(PWTEST, recip_to_real_C2C_float)
162+
{
163+
cout << "dividemthd 1, gamma_only: off, check fft between double and complex" << endl;
164+
ModulePW::PW_Basis pwtest("gpu", precision_flag);
165+
pwtest.fft_bundle.setfft("gpu", "single");
166+
ModuleBase::Matrix3 latvec(1, 1, 0, 0, 1, 1, 0, 0, 2);
167+
double wfcecut = 18;
168+
double lat0 = 2.2;
169+
bool gamma_only = false;
170+
gamma_only = false;
171+
int distribution_type = 1;
172+
bool xprime = false;
173+
174+
#ifdef __MPI
175+
pwtest.initmpi(nproc_in_pool, rank_in_pool, POOL_WORLD);
176+
#endif
177+
pwtest.initgrids(lat0, latvec, wfcecut);
178+
pwtest.initparameters(gamma_only, wfcecut, distribution_type, xprime);
179+
pwtest.setuptransform();
180+
pwtest.collect_local_pw();
181+
182+
const int npw = pwtest.npw;
183+
const int nrxx = pwtest.nrxx;
184+
const int nmaxgr = pwtest.nmaxgr;
185+
const int nx = pwtest.nx;
186+
const int ny = pwtest.ny;
187+
const int nz = pwtest.nz;
188+
const int nplane = pwtest.nplane;
189+
const double tpiba2 = ModuleBase::TWO_PI * ModuleBase::TWO_PI / lat0 / lat0;
190+
const double ggecut = wfcecut / tpiba2;
191+
ModuleBase::Matrix3 GT = latvec.Inverse();
192+
ModuleBase::Matrix3 G = GT.Transpose();
193+
ModuleBase::Matrix3 GGT = G * GT;
194+
complex<float>* tmp = new complex<float>[nx * ny * nz];
195+
if (rank_in_pool == 0)
196+
{
197+
for (int ix = 0; ix < nx; ++ix)
198+
{
199+
for (int iy = 0; iy < ny; ++iy)
200+
{
201+
for (int iz = 0; iz < nz; ++iz)
202+
{
203+
tmp[ix * ny * nz + iy * nz + iz] = 0.0;
204+
float vx = ix - int(nx / 2);
205+
float vy = iy - int(ny / 2);
206+
float vz = iz - int(nz / 2);
207+
ModuleBase::Vector3<double> v(vx, vy, vz);
208+
float modulus = v * (GGT * v);
209+
if (modulus <= ggecut)
210+
{
211+
tmp[ix * ny * nz + iy * nz + iz] = 1.0 / (modulus + 1);
212+
if (vy > 0)
213+
tmp[ix * ny * nz + iy * nz + iz] += std::complex<float>(0, 1.0) / (std::abs(vx + 1) + 1);
214+
else if (vy < 0)
215+
tmp[ix * ny * nz + iy * nz + iz] -= std::complex<float>(0, 1.0) / (std::abs(-vx + 1) + 1);
216+
}
217+
}
218+
}
219+
}
220+
fftwf_plan pp
221+
= fftwf_plan_dft_3d(nx, ny, nz, (fftwf_complex*)tmp, (fftwf_complex*)tmp, FFTW_BACKWARD, FFTW_ESTIMATE);
222+
fftwf_execute(pp);
223+
fftwf_destroy_plan(pp);
224+
225+
ModuleBase::Vector3<float> delta_g(float(int(nx / 2)) / nx, float(int(ny / 2)) / ny, float(int(nz / 2)) / nz);
226+
for (int ixy = 0; ixy < nx * ny; ++ixy)
227+
{
228+
for (int iz = 0; iz < nz; ++iz)
229+
{
230+
int ix = ixy / ny;
231+
int iy = ixy % ny;
232+
ModuleBase::Vector3<float> real_r(ix, iy, iz);
233+
float phase_im = -delta_g * real_r;
234+
complex<float> phase(0, ModuleBase::TWO_PI * phase_im);
235+
tmp[ixy * nz + iz] *= exp(phase);
236+
}
237+
}
238+
}
239+
#ifdef __MPI
240+
MPI_Bcast(tmp, 2 * nx * ny * nz, MPI_DOUBLE, 0, POOL_WORLD);
241+
#endif
242+
// const int size = nx * ny * nz;
243+
complex<float>* h_rhog = new complex<float>[npw];
244+
complex<float>* h_rhogout = new complex<float>[npw];
245+
complex<float>* d_rhog;
246+
complex<float>* d_rhogr;
247+
complex<float>* d_rhogout;
248+
cudaMalloc((void**)&d_rhog, npw * sizeof(complex<float>));
249+
cudaMalloc((void**)&d_rhogr, npw * sizeof(complex<float>));
250+
cudaMalloc((void**)&d_rhogout, npw * sizeof(complex<float>));
251+
252+
for (int ig = 0; ig < npw; ++ig)
253+
{
254+
h_rhog[ig] = 1.0 / (pwtest.gg[ig] + 1);
255+
if (pwtest.gdirect[ig].y > 0)
256+
{
257+
h_rhog[ig] += ModuleBase::IMAG_UNIT / (std::abs(pwtest.gdirect[ig].x + 1) + 1);
258+
}
259+
else if (pwtest.gdirect[ig].y < 0)
260+
{
261+
h_rhog[ig] -= ModuleBase::IMAG_UNIT / (std::abs(-pwtest.gdirect[ig].x + 1) + 1);
262+
}
263+
}
264+
cudaMemcpy(d_rhog, h_rhog, npw * sizeof(complex<float>), cudaMemcpyHostToDevice);
265+
cudaMemcpy(d_rhogout, h_rhogout, npw * sizeof(complex<float>), cudaMemcpyHostToDevice);
266+
267+
std::complex<float>* h_rhor = new std::complex<float>[nrxx];
268+
std::complex<float>* d_rhor;
269+
cudaMalloc((void**)&d_rhor, nrxx * sizeof(std::complex<float>));
270+
pwtest.recip_to_real<std::complex<float>, std::complex<float>, base_device::DEVICE_GPU>(d_rhog, d_rhor);
271+
cudaMemcpy(h_rhor, d_rhor, nrxx * sizeof(std::complex<float>), cudaMemcpyDeviceToHost);
272+
273+
int startiz = pwtest.startz_current;
274+
for (int ixy = 0; ixy < nx * ny; ++ixy)
275+
{
276+
for (int iz = 0; iz < nplane; ++iz)
277+
{
278+
EXPECT_NEAR(tmp[ixy * nz + startiz + iz].real(), h_rhor[ixy * nplane + iz].real(), 1e-4);
279+
EXPECT_NEAR(tmp[ixy * nz + startiz + iz].imag(), h_rhor[ixy * nplane + iz].imag(), 1e-4);
280+
}
281+
}
282+
283+
pwtest.real_to_recip<std::complex<float>,std::complex<float>,base_device::DEVICE_GPU>(d_rhor,d_rhog);
284+
cudaMemcpy(h_rhogout,d_rhog,npw * sizeof(complex<float>),cudaMemcpyDeviceToHost);
285+
for (int ig = 0; ig < npw; ++ig)
286+
{
287+
EXPECT_NEAR(h_rhog[ig].real(), h_rhogout[ig].real(), 1e-6);
288+
EXPECT_NEAR(h_rhog[ig].imag(), h_rhogout[ig].imag(), 1e-6);
289+
}
290+
291+
delete[] h_rhog;
292+
delete[] h_rhogout;
293+
delete[] h_rhor;
294+
delete[] tmp;
295+
cudaFree(d_rhog);
296+
cudaFree(d_rhogr);
297+
cudaFree(d_rhogout);
298+
cudaFree(d_rhor);
299+
}

source/module_basis/module_pw/test_gpu/pw_basis_k_C2C.cpp

Lines changed: 7 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ TEST_F(PWTEST,pw_basis_k_C2C_double)
5353
G = GT.Transpose();
5454
GGT = G * GT;
5555
complex<double> *tmp = new complex<double> [nx*ny*nz];
56-
complex<double> * rhogr = new complex<double> [nmaxgr];
5756
double * rhor = new double [nrxx];
5857
for(int ik = 0; ik < nks; ++ik)
5958
{
@@ -105,16 +104,15 @@ TEST_F(PWTEST,pw_basis_k_C2C_double)
105104
MPI_Bcast(tmp,2*nx*ny*nz,MPI_DOUBLE,0,POOL_WORLD);
106105
#endif
107106
complex<double> * h_rhog = new complex<double> [npwk];
108-
complex<double> * rhogout = new complex<double> [npwk];
107+
complex<double> * h_rhor = new complex<double> [nrxx];
109108
for(int ig = 0 ; ig < npwk ; ++ig)
110109
{
111110
h_rhog[ig] = 1.0/(pwtest.getgk2(ik,ig)+1);
112-
rhogr[ig] = 1.0/(pwtest.getgk2(ik,ig)+1);
113111
ModuleBase::Vector3<double> f = pwtest.getgdirect(ik,ig);
114112
if(f.y > 0)
115113
{
116114
h_rhog[ig]+=ModuleBase::IMAG_UNIT / (std::abs(f.x+1) + 1);
117-
rhogr[ig]+=ModuleBase::IMAG_UNIT / (std::abs(f.x+1) + 1);
115+
118116
}
119117
}
120118

@@ -125,44 +123,21 @@ TEST_F(PWTEST,pw_basis_k_C2C_double)
125123
cudaMalloc((void**)&d_rhog, npwk * sizeof(complex<double>));
126124
cudaMalloc((void**)&d_rhor, npwk * sizeof(complex<double>));
127125
cudaMalloc((void**)&d_rhogout, npwk * sizeof(complex<double>));
126+
cudaMemcpy(d_rhog,h_rhog,npwk*sizeof(complex<double>),cudaMemcpyHostToDevice);
128127
pwtest.recip_to_real<std::complex<double>,std::complex<double>,base_device::DEVICE_GPU>(h_rhog,d_rhor,ik); //check out-of-place transform
129-
128+
cudaMemcpy(h_rhor,d_rhor,nrxx*sizeof(complex<double>),cudaMemcpyHostToDevice);
130129
int startiz = pwtest.startz_current;
131130
for(int ixy = 0 ; ixy < nx * ny ; ++ixy)
132131
{
133132
for(int iz = 0 ; iz < nplane ; ++iz)
134133
{
135-
EXPECT_NEAR(tmp[ixy * nz + startiz + iz].real(),rhor[ixy*nplane+iz],1e-6);
136-
EXPECT_NEAR(tmp[ixy * nz + startiz + iz].real(),((double*)rhogr)[ixy*nplane+iz],1e-6);
134+
EXPECT_NEAR(tmp[ixy * nz + startiz + iz].real(),h_rhor[ixy*nplane+iz].real(),1e-6);
137135
}
138136
}
139137

140-
pwtest.real2recip(rhor,rhogout,ik);
141-
142-
pwtest.real2recip((double*)rhogr,rhogr,ik);
143-
144-
145-
for(int ig = 0 ; ig < npwk ; ++ig)
146-
{
147-
EXPECT_NEAR(h_rhog[ig].real(),rhogout[ig].real(),1e-6);
148-
EXPECT_NEAR(h_rhog[ig].imag(),rhogout[ig].imag(),1e-6);
149-
EXPECT_NEAR(h_rhog[ig].real(),rhogr[ig].real(),1e-6);
150-
EXPECT_NEAR(h_rhog[ig].imag(),rhogr[ig].imag(),1e-6);
151-
}
152-
153-
154138
delete [] h_rhog;
155-
delete [] rhogout;
156-
//check igl2ig
157-
for(int igl = 0; igl < npwk ; ++igl)
158-
{
159-
const int isz = pwtest.getigl2isz(ik,igl);
160-
for(int ig = 0 ; ig < pwtest.npwk; ++ig)
161-
{
162-
if(isz == pwtest.ig2isz[ig]){
163-
EXPECT_EQ(ig,pwtest.getigl2ig(ik,igl));}
164-
}
165-
}
139+
delete [] h_rhor;
140+
166141

167142
}
168143
delete []tmp;

0 commit comments

Comments
 (0)