|
| 1 | +//--------------------------------------------- |
| 2 | +// TEST for CUFFT |
| 3 | +//--------------------------------------------- |
| 4 | +#include "../pw_basis.h" |
| 5 | +#ifdef __MPI |
| 6 | +#include "module_base/parallel_global.h" |
| 7 | +#include "mpi.h" |
| 8 | +#include "test_tool.h" |
| 9 | +#endif |
| 10 | +#include "cuda_runtime.h" |
| 11 | +#include "fftw3.h" |
| 12 | +#include "module_base/constants.h" |
| 13 | +#include "module_base/global_function.h" |
| 14 | +#include "pw_test.h" |
| 15 | + |
| 16 | +using namespace std; |
| 17 | +TEST_F(PWTEST, recip_to_real_C2C_double) |
| 18 | +{ |
| 19 | + cout << "dividemthd 1, gamma_only: off, check fft between double and complex" << endl; |
| 20 | + ModulePW::PW_Basis pwtest("gpu", precision_flag); |
| 21 | + pwtest.fft_bundle.setfft("gpu", "double"); |
| 22 | + ModuleBase::Matrix3 latvec(1, 1, 0, 0, 1, 1, 0, 0, 2); |
| 23 | + double wfcecut; |
| 24 | + double lat0 = 2.2; |
| 25 | + bool gamma_only = false; |
| 26 | + wfcecut = 18; |
| 27 | + gamma_only = false; |
| 28 | + int distribution_type = 1; |
| 29 | + bool xprime = false; |
| 30 | + |
| 31 | + // init |
| 32 | +#ifdef __MPI |
| 33 | + pwtest.initmpi(nproc_in_pool, rank_in_pool, POOL_WORLD); |
| 34 | +#endif |
| 35 | + pwtest.initgrids(lat0, latvec, wfcecut); |
| 36 | + pwtest.initparameters(gamma_only, wfcecut, distribution_type, xprime); |
| 37 | + pwtest.setuptransform(); |
| 38 | + pwtest.collect_local_pw(); |
| 39 | + |
| 40 | + const int npw = pwtest.npw; |
| 41 | + const int nrxx = pwtest.nrxx; |
| 42 | + const int nmaxgr = pwtest.nmaxgr; |
| 43 | + const int nx = pwtest.nx; |
| 44 | + const int ny = pwtest.ny; |
| 45 | + const int nz = pwtest.nz; |
| 46 | + const int nplane = pwtest.nplane; |
| 47 | + |
| 48 | + const double tpiba2 = ModuleBase::TWO_PI * ModuleBase::TWO_PI / lat0 / lat0; |
| 49 | + const double ggecut = wfcecut / tpiba2; |
| 50 | + ModuleBase::Matrix3 GT, G, GGT; |
| 51 | + GT = latvec.Inverse(); |
| 52 | + G = GT.Transpose(); |
| 53 | + GGT = G * GT; |
| 54 | + complex<double>* tmp = new complex<double>[nx * ny * nz]; |
| 55 | + if (rank_in_pool == 0) |
| 56 | + { |
| 57 | + for (int ix = 0; ix < nx; ++ix) |
| 58 | + { |
| 59 | + for (int iy = 0; iy < ny; ++iy) |
| 60 | + { |
| 61 | + for (int iz = 0; iz < nz; ++iz) |
| 62 | + { |
| 63 | + tmp[ix * ny * nz + iy * nz + iz] = 0.0; |
| 64 | + double vx = ix - int(nx / 2); |
| 65 | + double vy = iy - int(ny / 2); |
| 66 | + double vz = iz - int(nz / 2); |
| 67 | + ModuleBase::Vector3<double> v(vx, vy, vz); |
| 68 | + double modulus = v * (GGT * v); |
| 69 | + if (modulus <= ggecut) |
| 70 | + { |
| 71 | + tmp[ix * ny * nz + iy * nz + iz] = 1.0 / (modulus + 1); |
| 72 | + if (vy > 0) |
| 73 | + tmp[ix * ny * nz + iy * nz + iz] += ModuleBase::IMAG_UNIT / (std::abs(v.x + 1) + 1); |
| 74 | + else if (vy < 0) |
| 75 | + tmp[ix * ny * nz + iy * nz + iz] -= ModuleBase::IMAG_UNIT / (std::abs(-v.x + 1) + 1); |
| 76 | + } |
| 77 | + } |
| 78 | + } |
| 79 | + } |
| 80 | + fftw_plan pp |
| 81 | + = fftw_plan_dft_3d(nx, ny, nz, (fftw_complex*)tmp, (fftw_complex*)tmp, FFTW_BACKWARD, FFTW_ESTIMATE); |
| 82 | + fftw_execute(pp); |
| 83 | + fftw_destroy_plan(pp); |
| 84 | + |
| 85 | + ModuleBase::Vector3<double> delta_g(double(int(nx / 2)) / nx, |
| 86 | + double(int(ny / 2)) / ny, |
| 87 | + double(int(nz / 2)) / nz); |
| 88 | + for (int ixy = 0; ixy < nx * ny; ++ixy) |
| 89 | + { |
| 90 | + for (int iz = 0; iz < nz; ++iz) |
| 91 | + { |
| 92 | + int ix = ixy / ny; |
| 93 | + int iy = ixy % ny; |
| 94 | + ModuleBase::Vector3<double> real_r(ix, iy, iz); |
| 95 | + double phase_im = -delta_g * real_r; |
| 96 | + complex<double> phase(0, ModuleBase::TWO_PI * phase_im); |
| 97 | + tmp[ixy * nz + iz] *= exp(phase); |
| 98 | + } |
| 99 | + } |
| 100 | + } |
| 101 | +#ifdef __MPI |
| 102 | + MPI_Bcast(tmp, 2 * nx * ny * nz, MPI_DOUBLE, 0, POOL_WORLD); |
| 103 | +#endif |
| 104 | + // const int size = nx * ny * nz; |
| 105 | + complex<double>* h_rhog = new complex<double>[npw]; |
| 106 | + complex<double>* h_rhogout = new complex<double>[npw]; |
| 107 | + complex<double>* d_rhog; |
| 108 | + complex<double>* d_rhogr; |
| 109 | + complex<double>* d_rhogout; |
| 110 | + cudaMalloc((void**)&d_rhog, npw * sizeof(complex<double>)); |
| 111 | + cudaMalloc((void**)&d_rhogr, npw * sizeof(complex<double>)); |
| 112 | + cudaMalloc((void**)&d_rhogout, npw * sizeof(complex<double>)); |
| 113 | + |
| 114 | + for (int ig = 0; ig < npw; ++ig) |
| 115 | + { |
| 116 | + h_rhog[ig] = 1.0 / (pwtest.gg[ig] + 1); |
| 117 | + if (pwtest.gdirect[ig].y > 0) |
| 118 | + { |
| 119 | + h_rhog[ig] += ModuleBase::IMAG_UNIT / (std::abs(pwtest.gdirect[ig].x + 1) + 1); |
| 120 | + } |
| 121 | + else if (pwtest.gdirect[ig].y < 0) |
| 122 | + { |
| 123 | + h_rhog[ig] -= ModuleBase::IMAG_UNIT / (std::abs(-pwtest.gdirect[ig].x + 1) + 1); |
| 124 | + } |
| 125 | + } |
| 126 | + cudaMemcpy(d_rhog, h_rhog, npw * sizeof(complex<double>), cudaMemcpyHostToDevice); |
| 127 | + |
| 128 | + std::complex<double>* h_rhor = new std::complex<double>[nrxx]; |
| 129 | + std::complex<double>* d_rhor; |
| 130 | + cudaMalloc((void**)&d_rhor, nrxx * sizeof(std::complex<double>)); |
| 131 | + pwtest.recip_to_real<std::complex<double>, std::complex<double>, base_device::DEVICE_GPU>(d_rhog, d_rhor); |
| 132 | + cudaMemcpy(h_rhor, d_rhor, nrxx * sizeof(std::complex<double>), cudaMemcpyDeviceToHost); |
| 133 | + |
| 134 | + int startiz = pwtest.startz_current; |
| 135 | + for (int ixy = 0; ixy < nx * ny; ++ixy) |
| 136 | + { |
| 137 | + for (int iz = 0; iz < nplane; ++iz) |
| 138 | + { |
| 139 | + EXPECT_NEAR(tmp[ixy * nz + startiz + iz].real(), h_rhor[ixy * nplane + iz].real(), 1e-6); |
| 140 | + EXPECT_NEAR(tmp[ixy * nz + startiz + iz].imag(), h_rhor[ixy * nplane + iz].imag(), 1e-6); |
| 141 | + } |
| 142 | + } |
| 143 | + |
| 144 | + pwtest.real_to_recip<std::complex<double>,std::complex<double>,base_device::DEVICE_GPU>(d_rhor,d_rhog); |
| 145 | + cudaMemcpy(h_rhogout,d_rhog,npw * sizeof(complex<double>),cudaMemcpyDeviceToHost); |
| 146 | + for (int ig = 0; ig < npw; ++ig) |
| 147 | + { |
| 148 | + EXPECT_NEAR(h_rhog[ig].real(), h_rhogout[ig].real(), 1e-6); |
| 149 | + EXPECT_NEAR(h_rhog[ig].imag(), h_rhogout[ig].imag(), 1e-6); |
| 150 | + } |
| 151 | + delete[] h_rhog; |
| 152 | + delete[] h_rhogout; |
| 153 | + delete[] h_rhor; |
| 154 | + delete[] tmp; |
| 155 | + cudaFree(d_rhog); |
| 156 | + cudaFree(d_rhogr); |
| 157 | + cudaFree(d_rhogout); |
| 158 | + cudaFree(d_rhor); |
| 159 | +} |
| 160 | + |
| 161 | +TEST_F(PWTEST, recip_to_real_C2C_float) |
| 162 | +{ |
| 163 | + cout << "dividemthd 1, gamma_only: off, check fft between double and complex" << endl; |
| 164 | + ModulePW::PW_Basis pwtest("gpu", precision_flag); |
| 165 | + pwtest.fft_bundle.setfft("gpu", "single"); |
| 166 | + ModuleBase::Matrix3 latvec(1, 1, 0, 0, 1, 1, 0, 0, 2); |
| 167 | + double wfcecut = 18; |
| 168 | + double lat0 = 2.2; |
| 169 | + bool gamma_only = false; |
| 170 | + gamma_only = false; |
| 171 | + int distribution_type = 1; |
| 172 | + bool xprime = false; |
| 173 | + |
| 174 | +#ifdef __MPI |
| 175 | + pwtest.initmpi(nproc_in_pool, rank_in_pool, POOL_WORLD); |
| 176 | +#endif |
| 177 | + pwtest.initgrids(lat0, latvec, wfcecut); |
| 178 | + pwtest.initparameters(gamma_only, wfcecut, distribution_type, xprime); |
| 179 | + pwtest.setuptransform(); |
| 180 | + pwtest.collect_local_pw(); |
| 181 | + |
| 182 | + const int npw = pwtest.npw; |
| 183 | + const int nrxx = pwtest.nrxx; |
| 184 | + const int nmaxgr = pwtest.nmaxgr; |
| 185 | + const int nx = pwtest.nx; |
| 186 | + const int ny = pwtest.ny; |
| 187 | + const int nz = pwtest.nz; |
| 188 | + const int nplane = pwtest.nplane; |
| 189 | + const double tpiba2 = ModuleBase::TWO_PI * ModuleBase::TWO_PI / lat0 / lat0; |
| 190 | + const double ggecut = wfcecut / tpiba2; |
| 191 | + ModuleBase::Matrix3 GT = latvec.Inverse(); |
| 192 | + ModuleBase::Matrix3 G = GT.Transpose(); |
| 193 | + ModuleBase::Matrix3 GGT = G * GT; |
| 194 | + complex<float>* tmp = new complex<float>[nx * ny * nz]; |
| 195 | + if (rank_in_pool == 0) |
| 196 | + { |
| 197 | + for (int ix = 0; ix < nx; ++ix) |
| 198 | + { |
| 199 | + for (int iy = 0; iy < ny; ++iy) |
| 200 | + { |
| 201 | + for (int iz = 0; iz < nz; ++iz) |
| 202 | + { |
| 203 | + tmp[ix * ny * nz + iy * nz + iz] = 0.0; |
| 204 | + float vx = ix - int(nx / 2); |
| 205 | + float vy = iy - int(ny / 2); |
| 206 | + float vz = iz - int(nz / 2); |
| 207 | + ModuleBase::Vector3<double> v(vx, vy, vz); |
| 208 | + float modulus = v * (GGT * v); |
| 209 | + if (modulus <= ggecut) |
| 210 | + { |
| 211 | + tmp[ix * ny * nz + iy * nz + iz] = 1.0 / (modulus + 1); |
| 212 | + if (vy > 0) |
| 213 | + tmp[ix * ny * nz + iy * nz + iz] += std::complex<float>(0, 1.0) / (std::abs(vx + 1) + 1); |
| 214 | + else if (vy < 0) |
| 215 | + tmp[ix * ny * nz + iy * nz + iz] -= std::complex<float>(0, 1.0) / (std::abs(-vx + 1) + 1); |
| 216 | + } |
| 217 | + } |
| 218 | + } |
| 219 | + } |
| 220 | + fftwf_plan pp |
| 221 | + = fftwf_plan_dft_3d(nx, ny, nz, (fftwf_complex*)tmp, (fftwf_complex*)tmp, FFTW_BACKWARD, FFTW_ESTIMATE); |
| 222 | + fftwf_execute(pp); |
| 223 | + fftwf_destroy_plan(pp); |
| 224 | + |
| 225 | + ModuleBase::Vector3<float> delta_g(float(int(nx / 2)) / nx, float(int(ny / 2)) / ny, float(int(nz / 2)) / nz); |
| 226 | + for (int ixy = 0; ixy < nx * ny; ++ixy) |
| 227 | + { |
| 228 | + for (int iz = 0; iz < nz; ++iz) |
| 229 | + { |
| 230 | + int ix = ixy / ny; |
| 231 | + int iy = ixy % ny; |
| 232 | + ModuleBase::Vector3<float> real_r(ix, iy, iz); |
| 233 | + float phase_im = -delta_g * real_r; |
| 234 | + complex<float> phase(0, ModuleBase::TWO_PI * phase_im); |
| 235 | + tmp[ixy * nz + iz] *= exp(phase); |
| 236 | + } |
| 237 | + } |
| 238 | + } |
| 239 | +#ifdef __MPI |
| 240 | + MPI_Bcast(tmp, 2 * nx * ny * nz, MPI_DOUBLE, 0, POOL_WORLD); |
| 241 | +#endif |
| 242 | + // const int size = nx * ny * nz; |
| 243 | + complex<float>* h_rhog = new complex<float>[npw]; |
| 244 | + complex<float>* h_rhogout = new complex<float>[npw]; |
| 245 | + complex<float>* d_rhog; |
| 246 | + complex<float>* d_rhogr; |
| 247 | + complex<float>* d_rhogout; |
| 248 | + cudaMalloc((void**)&d_rhog, npw * sizeof(complex<float>)); |
| 249 | + cudaMalloc((void**)&d_rhogr, npw * sizeof(complex<float>)); |
| 250 | + cudaMalloc((void**)&d_rhogout, npw * sizeof(complex<float>)); |
| 251 | + |
| 252 | + for (int ig = 0; ig < npw; ++ig) |
| 253 | + { |
| 254 | + h_rhog[ig] = 1.0 / (pwtest.gg[ig] + 1); |
| 255 | + if (pwtest.gdirect[ig].y > 0) |
| 256 | + { |
| 257 | + h_rhog[ig] += ModuleBase::IMAG_UNIT / (std::abs(pwtest.gdirect[ig].x + 1) + 1); |
| 258 | + } |
| 259 | + else if (pwtest.gdirect[ig].y < 0) |
| 260 | + { |
| 261 | + h_rhog[ig] -= ModuleBase::IMAG_UNIT / (std::abs(-pwtest.gdirect[ig].x + 1) + 1); |
| 262 | + } |
| 263 | + } |
| 264 | + cudaMemcpy(d_rhog, h_rhog, npw * sizeof(complex<float>), cudaMemcpyHostToDevice); |
| 265 | + cudaMemcpy(d_rhogout, h_rhogout, npw * sizeof(complex<float>), cudaMemcpyHostToDevice); |
| 266 | + |
| 267 | + std::complex<float>* h_rhor = new std::complex<float>[nrxx]; |
| 268 | + std::complex<float>* d_rhor; |
| 269 | + cudaMalloc((void**)&d_rhor, nrxx * sizeof(std::complex<float>)); |
| 270 | + pwtest.recip_to_real<std::complex<float>, std::complex<float>, base_device::DEVICE_GPU>(d_rhog, d_rhor); |
| 271 | + cudaMemcpy(h_rhor, d_rhor, nrxx * sizeof(std::complex<float>), cudaMemcpyDeviceToHost); |
| 272 | + |
| 273 | + int startiz = pwtest.startz_current; |
| 274 | + for (int ixy = 0; ixy < nx * ny; ++ixy) |
| 275 | + { |
| 276 | + for (int iz = 0; iz < nplane; ++iz) |
| 277 | + { |
| 278 | + EXPECT_NEAR(tmp[ixy * nz + startiz + iz].real(), h_rhor[ixy * nplane + iz].real(), 1e-4); |
| 279 | + EXPECT_NEAR(tmp[ixy * nz + startiz + iz].imag(), h_rhor[ixy * nplane + iz].imag(), 1e-4); |
| 280 | + } |
| 281 | + } |
| 282 | + |
| 283 | + pwtest.real_to_recip<std::complex<float>,std::complex<float>,base_device::DEVICE_GPU>(d_rhor,d_rhog); |
| 284 | + cudaMemcpy(h_rhogout,d_rhog,npw * sizeof(complex<float>),cudaMemcpyDeviceToHost); |
| 285 | + for (int ig = 0; ig < npw; ++ig) |
| 286 | + { |
| 287 | + EXPECT_NEAR(h_rhog[ig].real(), h_rhogout[ig].real(), 1e-6); |
| 288 | + EXPECT_NEAR(h_rhog[ig].imag(), h_rhogout[ig].imag(), 1e-6); |
| 289 | + } |
| 290 | + |
| 291 | + delete[] h_rhog; |
| 292 | + delete[] h_rhogout; |
| 293 | + delete[] h_rhor; |
| 294 | + delete[] tmp; |
| 295 | + cudaFree(d_rhog); |
| 296 | + cudaFree(d_rhogr); |
| 297 | + cudaFree(d_rhogout); |
| 298 | + cudaFree(d_rhor); |
| 299 | +} |
0 commit comments