Skip to content

Commit 95c1108

Browse files
committed
set C2C
1 parent ecc7f23 commit 95c1108

File tree

8 files changed

+849
-536
lines changed

8 files changed

+849
-536
lines changed

source/module_basis/module_pw/module_fft/fft_bundle.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,6 @@ void FFT_Bundle::initfft(int nx_in,
8787
->initfft(nx_in, ny_in, nz_in, lixy_in, rixy_in, ns_in, nplane_in, nproc_in, gamma_only_in, xprime_in);
8888
}
8989
}
90-
printf("the device is %s\n",device.c_str());
9190
if (device == "gpu")
9291
{
9392
#if defined(__ROCM)

source/module_basis/module_pw/pw_basis.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,6 @@ void PW_Basis::setuptransform()
6969
this->fft_bundle.initfft(this->nx,this->ny,this->nz,this->liy,this->riy,this->nst,this->nplane,this->poolnproc,this->gamma_only, this->xprime);
7070
}
7171
this->fft_bundle.setupFFT();
72-
printf("here is the flag\n");
7372
ModuleBase::timer::tick(this->classname, "setuptransform");
7473
}
7574

Lines changed: 281 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,281 @@
1+
//---------------------------------------------
2+
// TEST for CUFFT
3+
//---------------------------------------------
4+
#include "../pw_basis.h"
5+
#ifdef __MPI
6+
#include "module_base/parallel_global.h"
7+
#include "mpi.h"
8+
#include "test_tool.h"
9+
#endif
10+
#include "cuda_runtime.h"
11+
#include "fftw3.h"
12+
#include "module_base/constants.h"
13+
#include "module_base/global_function.h"
14+
#include "pw_test.h"
15+
16+
using namespace std;
17+
TEST_F(PWTEST, real_to_recip_C2C_double)
18+
{
19+
cout << "dividemthd 1, gamma_only: off, check fft between double and complex" << endl;
20+
ModulePW::PW_Basis pwtest("gpu", precision_flag);
21+
pwtest.fft_bundle.setfft("gpu", "double");
22+
ModuleBase::Matrix3 latvec(1, 1, 0, 0, 1, 1, 0, 0, 2);
23+
double wfcecut;
24+
double lat0 = 2.2;
25+
bool gamma_only = false;
26+
wfcecut = 18;
27+
gamma_only = false;
28+
int distribution_type = 1;
29+
bool xprime = false;
30+
31+
// init
32+
#ifdef __MPI
33+
pwtest.initmpi(nproc_in_pool, rank_in_pool, POOL_WORLD);
34+
#endif
35+
pwtest.initgrids(lat0, latvec, wfcecut);
36+
pwtest.initparameters(gamma_only, wfcecut, distribution_type, xprime);
37+
pwtest.setuptransform();
38+
pwtest.collect_local_pw();
39+
40+
const int npw = pwtest.npw;
41+
const int nrxx = pwtest.nrxx;
42+
const int nmaxgr = pwtest.nmaxgr;
43+
const int nx = pwtest.nx;
44+
const int ny = pwtest.ny;
45+
const int nz = pwtest.nz;
46+
const int nplane = pwtest.nplane;
47+
48+
const double tpiba2 = ModuleBase::TWO_PI * ModuleBase::TWO_PI / lat0 / lat0;
49+
const double ggecut = wfcecut / tpiba2;
50+
ModuleBase::Matrix3 GT, G, GGT;
51+
GT = latvec.Inverse();
52+
G = GT.Transpose();
53+
GGT = G * GT;
54+
complex<double>* tmp = new complex<double>[nx * ny * nz];
55+
if (rank_in_pool == 0)
56+
{
57+
for (int ix = 0; ix < nx; ++ix)
58+
{
59+
for (int iy = 0; iy < ny; ++iy)
60+
{
61+
for (int iz = 0; iz < nz; ++iz)
62+
{
63+
tmp[ix * ny * nz + iy * nz + iz] = 0.0;
64+
double vx = ix - int(nx / 2);
65+
double vy = iy - int(ny / 2);
66+
double vz = iz - int(nz / 2);
67+
ModuleBase::Vector3<double> v(vx, vy, vz);
68+
double modulus = v * (GGT * v);
69+
if (modulus <= ggecut)
70+
{
71+
tmp[ix * ny * nz + iy * nz + iz] = 1.0 / (modulus + 1);
72+
if (vy > 0)
73+
tmp[ix * ny * nz + iy * nz + iz] += ModuleBase::IMAG_UNIT / (std::abs(v.x + 1) + 1);
74+
else if (vy < 0)
75+
tmp[ix * ny * nz + iy * nz + iz] -= ModuleBase::IMAG_UNIT / (std::abs(-v.x + 1) + 1);
76+
}
77+
}
78+
}
79+
}
80+
fftw_plan pp
81+
= fftw_plan_dft_3d(nx, ny, nz, (fftw_complex*)tmp, (fftw_complex*)tmp, FFTW_BACKWARD, FFTW_ESTIMATE);
82+
fftw_execute(pp);
83+
fftw_destroy_plan(pp);
84+
85+
ModuleBase::Vector3<double> delta_g(double(int(nx / 2)) / nx,
86+
double(int(ny / 2)) / ny,
87+
double(int(nz / 2)) / nz);
88+
for (int ixy = 0; ixy < nx * ny; ++ixy)
89+
{
90+
for (int iz = 0; iz < nz; ++iz)
91+
{
92+
int ix = ixy / ny;
93+
int iy = ixy % ny;
94+
ModuleBase::Vector3<double> real_r(ix, iy, iz);
95+
double phase_im = -delta_g * real_r;
96+
complex<double> phase(0, ModuleBase::TWO_PI * phase_im);
97+
tmp[ixy * nz + iz] *= exp(phase);
98+
}
99+
}
100+
}
101+
#ifdef __MPI
102+
MPI_Bcast(tmp, 2 * nx * ny * nz, MPI_DOUBLE, 0, POOL_WORLD);
103+
#endif
104+
// const int size = nx * ny * nz;
105+
complex<double>* h_rhog = new complex<double>[npw];
106+
complex<double>* h_rhogout = new complex<double>[npw];
107+
complex<double>* d_rhog;
108+
complex<double>* d_rhogr;
109+
complex<double>* d_rhogout;
110+
cudaMalloc((void**)&d_rhog, npw * sizeof(complex<double>));
111+
cudaMalloc((void**)&d_rhogr, npw * sizeof(complex<double>));
112+
cudaMalloc((void**)&d_rhogout, npw * sizeof(complex<double>));
113+
114+
for (int ig = 0; ig < npw; ++ig)
115+
{
116+
h_rhog[ig] = 1.0 / (pwtest.gg[ig] + 1);
117+
if (pwtest.gdirect[ig].y > 0)
118+
{
119+
h_rhog[ig] += ModuleBase::IMAG_UNIT / (std::abs(pwtest.gdirect[ig].x + 1) + 1);
120+
}
121+
else if (pwtest.gdirect[ig].y < 0)
122+
{
123+
h_rhog[ig] -= ModuleBase::IMAG_UNIT / (std::abs(-pwtest.gdirect[ig].x + 1) + 1);
124+
}
125+
}
126+
cudaMemcpy(d_rhor, h_rhog, nrxx * sizeof(complex<double>), cudaMemcpyHostToDevice);
127+
128+
std::complex<double>* h_rhor = new std::complex<double>[nrxx];
129+
std::complex<double>* d_rhor;
130+
cudaMalloc((void**)&d_rhor, nrxx * sizeof(std::complex<double>));
131+
pwtest.real_to_recip<std::complex<double>, std::complex<double>, base_device::DEVICE_GPU>(d_rhor, d_rhog);
132+
cudaMemcpy(h_rhor, d_rhor, nrxx * sizeof(std::complex<double>), cudaMemcpyDeviceToHost);
133+
int startiz = pwtest.startz_current;
134+
for (int ixy = 0; ixy < nx * ny; ++ixy)
135+
{
136+
for (int iz = 0; iz < nplane; ++iz)
137+
{
138+
EXPECT_NEAR(tmp[ixy * nz + startiz + iz].real(), h_rhog[ixy * nplane + iz].real(), 1e-6);
139+
EXPECT_NEAR(tmp[ixy * nz + startiz + iz].imag(), h_rhog[ixy * nplane + iz].imag(), 1e-6);
140+
}
141+
}
142+
delete[] h_rhog;
143+
delete[] h_rhogout;
144+
delete[] h_rhor;
145+
delete[] tmp;
146+
cudaFree(d_rhog);
147+
cudaFree(d_rhogr);
148+
cudaFree(d_rhogout);
149+
cudaFree(d_rhor);
150+
}
151+
152+
TEST_F(PWTEST, real_to_recip_C2C_float)
153+
{
154+
cout << "dividemthd 1, gamma_only: off, check fft between double and complex" << endl;
155+
ModulePW::PW_Basis pwtest("gpu", precision_flag);
156+
pwtest.fft_bundle.setfft("gpu", "single");
157+
ModuleBase::Matrix3 latvec(1, 1, 0, 0, 1, 1, 0, 0, 2);
158+
double wfcecut = 18;
159+
double lat0 = 2.2;
160+
bool gamma_only = false;
161+
gamma_only = false;
162+
int distribution_type = 1;
163+
bool xprime = false;
164+
165+
#ifdef __MPI
166+
pwtest.initmpi(nproc_in_pool, rank_in_pool, POOL_WORLD);
167+
#endif
168+
pwtest.initgrids(lat0, latvec, wfcecut);
169+
pwtest.initparameters(gamma_only, wfcecut, distribution_type, xprime);
170+
pwtest.setuptransform();
171+
pwtest.collect_local_pw();
172+
173+
const int npw = pwtest.npw;
174+
const int nrxx = pwtest.nrxx;
175+
const int nmaxgr = pwtest.nmaxgr;
176+
const int nx = pwtest.nx;
177+
const int ny = pwtest.ny;
178+
const int nz = pwtest.nz;
179+
const int nplane = pwtest.nplane;
180+
const double tpiba2 = ModuleBase::TWO_PI * ModuleBase::TWO_PI / lat0 / lat0;
181+
const double ggecut = wfcecut / tpiba2;
182+
ModuleBase::Matrix3 GT = latvec.Inverse();
183+
ModuleBase::Matrix3 G = GT.Transpose();
184+
ModuleBase::Matrix3 GGT = G * GT;
185+
complex<float>* tmp = new complex<float>[nx * ny * nz];
186+
if (rank_in_pool == 0)
187+
{
188+
for (int ix = 0; ix < nx; ++ix)
189+
{
190+
for (int iy = 0; iy < ny; ++iy)
191+
{
192+
for (int iz = 0; iz < nz; ++iz)
193+
{
194+
tmp[ix * ny * nz + iy * nz + iz] = 0.0;
195+
float vx = ix - int(nx / 2);
196+
float vy = iy - int(ny / 2);
197+
float vz = iz - int(nz / 2);
198+
ModuleBase::Vector3<double> v(vx, vy, vz);
199+
float modulus = v * (GGT * v);
200+
if (modulus <= ggecut)
201+
{
202+
tmp[ix * ny * nz + iy * nz + iz] = 1.0 / (modulus + 1);
203+
if (vy > 0)
204+
tmp[ix * ny * nz + iy * nz + iz] += std::complex<float>(0, 1.0) / (std::abs(vx + 1) + 1);
205+
else if (vy < 0)
206+
tmp[ix * ny * nz + iy * nz + iz] -= std::complex<float>(0, 1.0) / (std::abs(-vx + 1) + 1);
207+
}
208+
}
209+
}
210+
}
211+
fftwf_plan pp
212+
= fftwf_plan_dft_3d(nx, ny, nz, (fftwf_complex*)tmp, (fftwf_complex*)tmp, FFTW_BACKWARD, FFTW_ESTIMATE);
213+
fftwf_execute(pp);
214+
fftwf_destroy_plan(pp);
215+
216+
ModuleBase::Vector3<float> delta_g(float(int(nx / 2)) / nx, float(int(ny / 2)) / ny, float(int(nz / 2)) / nz);
217+
for (int ixy = 0; ixy < nx * ny; ++ixy)
218+
{
219+
for (int iz = 0; iz < nz; ++iz)
220+
{
221+
int ix = ixy / ny;
222+
int iy = ixy % ny;
223+
ModuleBase::Vector3<float> real_r(ix, iy, iz);
224+
float phase_im = -delta_g * real_r;
225+
complex<float> phase(0, ModuleBase::TWO_PI * phase_im);
226+
tmp[ixy * nz + iz] *= exp(phase);
227+
}
228+
}
229+
}
230+
#ifdef __MPI
231+
MPI_Bcast(tmp, 2 * nx * ny * nz, MPI_DOUBLE, 0, POOL_WORLD);
232+
#endif
233+
// const int size = nx * ny * nz;
234+
complex<float>* h_rhog = new complex<float>[npw];
235+
complex<float>* h_rhogout = new complex<float>[npw];
236+
complex<float>* d_rhog;
237+
complex<float>* d_rhogr;
238+
complex<float>* d_rhogout;
239+
cudaMalloc((void**)&d_rhog, npw * sizeof(complex<float>));
240+
cudaMalloc((void**)&d_rhogr, npw * sizeof(complex<float>));
241+
cudaMalloc((void**)&d_rhogout, npw * sizeof(complex<float>));
242+
243+
for (int ig = 0; ig < npw; ++ig)
244+
{
245+
h_rhog[ig] = 1.0 / (pwtest.gg[ig] + 1);
246+
if (pwtest.gdirect[ig].y > 0)
247+
{
248+
h_rhog[ig] += ModuleBase::IMAG_UNIT / (std::abs(pwtest.gdirect[ig].x + 1) + 1);
249+
}
250+
else if (pwtest.gdirect[ig].y < 0)
251+
{
252+
h_rhog[ig] -= ModuleBase::IMAG_UNIT / (std::abs(-pwtest.gdirect[ig].x + 1) + 1);
253+
}
254+
}
255+
cudaMemcpy(d_rhog, h_rhog, npw * sizeof(complex<float>), cudaMemcpyHostToDevice);
256+
cudaMemcpy(d_rhogout, h_rhogout, npw * sizeof(complex<float>), cudaMemcpyHostToDevice);
257+
258+
std::complex<float>* h_rhor = new std::complex<float>[nrxx];
259+
std::complex<float>* d_rhor;
260+
cudaMalloc((void**)&d_rhor, nrxx * sizeof(std::complex<float>));
261+
pwtest.real_to_recip<std::complex<float>, std::complex<float>, base_device::DEVICE_GPU>(d_rhog, d_rhor);
262+
cudaMemcpy(h_rhor, d_rhor, nrxx * sizeof(std::complex<float>), cudaMemcpyDeviceToHost);
263+
264+
int startiz = pwtest.startz_current;
265+
for (int ixy = 0; ixy < nx * ny; ++ixy)
266+
{
267+
for (int iz = 0; iz < nplane; ++iz)
268+
{
269+
EXPECT_NEAR(tmp[ixy * nz + startiz + iz].real(), h_rhor[ixy * nplane + iz].real(), 1e-4);
270+
EXPECT_NEAR(tmp[ixy * nz + startiz + iz].imag(), h_rhor[ixy * nplane + iz].imag(), 1e-4);
271+
}
272+
}
273+
delete[] h_rhog;
274+
delete[] h_rhogout;
275+
delete[] h_rhor;
276+
delete[] tmp;
277+
cudaFree(d_rhog);
278+
cudaFree(d_rhogr);
279+
cudaFree(d_rhogout);
280+
cudaFree(d_rhor);
281+
}

0 commit comments

Comments
 (0)