@@ -8,6 +8,8 @@ namespace hsolver {
88// NOTE: mimicked from ../cuda/dngvd_op.cu for three dngvd_op
99
1010static hipsolverHandle_t hipsolver_H = nullptr ;
11+ // Test on DCU platform. When nstart is greater than 234, code on DCU performs better.
12+ const int N_DCU = 234 ;
1113
1214void createGpuSolverHandle () {
1315 if (hipsolver_H == nullptr )
@@ -37,62 +39,65 @@ void dngvd_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DE
3739 // copied from ../cuda/dngvd_op.cu, "dngvd_op"
3840 assert (nstart == ldh);
3941
40- hipErrcheck (hipMemcpy (_vcc, _hcc, sizeof (double ) * ldh * nstart, hipMemcpyDeviceToDevice));
41- // now vcc contains hcc
42-
43- // prepare some values for hipsolverDnZhegvd_bufferSize
44- int * devInfo = nullptr ;
45- int lwork = 0 , info_gpu = 0 ;
46- double * work = nullptr ;
47- hipErrcheck (hipMalloc ((void **)&devInfo, sizeof (int )));
48- hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
49-
50- // calculate the sizes needed for pre-allocated buffer.
51- hipsolverErrcheck (hipsolverDnDsygvd_bufferSize (
52- hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
53- nstart,
54- _vcc, ldh,
55- _scc, ldh,
56- _eigenvalue,
57- &lwork));
58-
59- // allocate memery
60- hipErrcheck (hipMalloc ((void **)&work, sizeof (double ) * lwork));
61-
62- // compute eigenvalues and eigenvectors.
63- hipsolverErrcheck (hipsolverDnDsygvd (
64- hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
65- nstart,
66- _vcc, ldh,
67- const_cast <double *>(_scc), ldh,
68- _eigenvalue,
69- work, lwork, devInfo));
70-
71- hipErrcheck (hipMemcpy (&info_gpu, devInfo, sizeof (int ), hipMemcpyDeviceToHost));
72-
73- // free the buffer
74- hipErrcheck (hipFree (work));
75- hipErrcheck (hipFree (devInfo));
42+ if (nstart > N_DCU){
43+ hipErrcheck (hipMemcpy (_vcc, _hcc, sizeof (double ) * ldh * nstart, hipMemcpyDeviceToDevice));
44+ // now vcc contains hcc
45+
46+ // prepare some values for hipsolverDnZhegvd_bufferSize
47+ int * devInfo = nullptr ;
48+ int lwork = 0 , info_gpu = 0 ;
49+ double * work = nullptr ;
50+ hipErrcheck (hipMalloc ((void **)&devInfo, sizeof (int )));
51+ hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
52+
53+ // calculate the sizes needed for pre-allocated buffer.
54+ hipsolverErrcheck (hipsolverDnDsygvd_bufferSize (
55+ hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
56+ nstart,
57+ _vcc, ldh,
58+ _scc, ldh,
59+ _eigenvalue,
60+ &lwork));
61+
62+ // allocate memery
63+ hipErrcheck (hipMalloc ((void **)&work, sizeof (double ) * lwork));
64+
65+ // compute eigenvalues and eigenvectors.
66+ hipsolverErrcheck (hipsolverDnDsygvd (
67+ hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
68+ nstart,
69+ _vcc, ldh,
70+ const_cast <double *>(_scc), ldh,
71+ _eigenvalue,
72+ work, lwork, devInfo));
73+
74+ hipErrcheck (hipMemcpy (&info_gpu, devInfo, sizeof (int ), hipMemcpyDeviceToHost));
75+
76+ // free the buffer
77+ hipErrcheck (hipFree (work));
78+ hipErrcheck (hipFree (devInfo));
79+ }
7680 // if(fail_info != nullptr) *fail_info = info_gpu;
81+ else {
82+ std::vector<double > hcc (nstart * nstart, 0.0 );
83+ std::vector<double > scc (nstart * nstart, 0.0 );
84+ std::vector<double > vcc (nstart * nstart, 0.0 );
85+ std::vector<double > eigenvalue (nstart, 0 );
86+ hipErrcheck (hipMemcpy (hcc.data (), _hcc, sizeof (double ) * hcc.size (), hipMemcpyDeviceToHost));
87+ hipErrcheck (hipMemcpy (scc.data (), _scc, sizeof (double ) * scc.size (), hipMemcpyDeviceToHost));
88+ base_device::DEVICE_CPU* cpu_ctx = {};
89+ dngvd_op<double , base_device::DEVICE_CPU>()(cpu_ctx,
90+ nstart,
91+ ldh,
92+ hcc.data (),
93+ scc.data (),
94+ eigenvalue.data (),
95+ vcc.data ());
96+ hipErrcheck (hipMemcpy (_vcc, vcc.data (), sizeof (double ) * vcc.size (), hipMemcpyHostToDevice));
97+ hipErrcheck (hipMemcpy (_eigenvalue, eigenvalue.data (), sizeof (double ) * eigenvalue.size (), hipMemcpyHostToDevice));
98+ }
7799
78-
79- // std::vector<double> hcc(nstart * nstart, 0.0);
80- // std::vector<double> scc(nstart * nstart, 0.0);
81- // std::vector<double> vcc(nstart * nstart, 0.0);
82- // std::vector<double> eigenvalue(nstart, 0);
83- // hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost));
84- // hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost));
85- // base_device::DEVICE_CPU* cpu_ctx = {};
86- // dngvd_op<double, base_device::DEVICE_CPU>()(cpu_ctx,
87- // nstart,
88- // ldh,
89- // hcc.data(),
90- // scc.data(),
91- // eigenvalue.data(),
92- // vcc.data(),
93- // fail_info);
94- // hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice));
95- // hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
100+
96101}
97102#endif // __LCAO
98103
@@ -107,62 +112,65 @@ void dngvd_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const ba
107112{
108113 // copied from ../cuda/dngvd_op.cu, "dngvd_op"
109114 assert (nstart == ldh);
110-
111- hipErrcheck (hipMemcpy (_vcc, _hcc, sizeof (std::complex <float >) * ldh * nstart, hipMemcpyDeviceToDevice));
112- // now vcc contains hcc
113-
114- // prepare some values for hipsolverDnZhegvd_bufferSize
115- int * devInfo = nullptr ;
116- int lwork = 0 , info_gpu = 0 ;
117- float2 * work = nullptr ;
118- hipErrcheck (hipMalloc ((void **)&devInfo, sizeof (int )));
119- hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
120-
121- // calculate the sizes needed for pre-allocated buffer.
122- hipsolverErrcheck (hipsolverDnChegvd_bufferSize (
123- hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
124- nstart,
125- reinterpret_cast <const float2 *>(_vcc), ldh,
126- reinterpret_cast <const float2 *>(_scc), ldh,
127- _eigenvalue,
128- &lwork));
129-
130- // allocate memery
131- hipErrcheck (hipMalloc ((void **)&work, sizeof (float2 ) * lwork));
132-
133- // compute eigenvalues and eigenvectors.
134- hipsolverErrcheck (hipsolverDnChegvd (
135- hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
136- nstart,
137- reinterpret_cast <float2 *>(_vcc), ldh,
138- const_cast <float2 *>(reinterpret_cast <const float2 *>(_scc)), ldh,
139- _eigenvalue,
140- work, lwork, devInfo));
141-
142- hipErrcheck (hipMemcpy (&info_gpu, devInfo, sizeof (int ), hipMemcpyDeviceToHost));
143- // free the buffer
144- hipErrcheck (hipFree (work));
145- hipErrcheck (hipFree (devInfo));
115+
116+ if (nstart > N_DCU){
117+ hipErrcheck (hipMemcpy (_vcc, _hcc, sizeof (std::complex <float >) * ldh * nstart, hipMemcpyDeviceToDevice));
118+ // now vcc contains hcc
119+
120+ // prepare some values for hipsolverDnZhegvd_bufferSize
121+ int * devInfo = nullptr ;
122+ int lwork = 0 , info_gpu = 0 ;
123+ float2 * work = nullptr ;
124+ hipErrcheck (hipMalloc ((void **)&devInfo, sizeof (int )));
125+ hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
126+
127+ // calculate the sizes needed for pre-allocated buffer.
128+ hipsolverErrcheck (hipsolverDnChegvd_bufferSize (
129+ hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
130+ nstart,
131+ reinterpret_cast <const float2 *>(_vcc), ldh,
132+ reinterpret_cast <const float2 *>(_scc), ldh,
133+ _eigenvalue,
134+ &lwork));
135+
136+ // allocate memery
137+ hipErrcheck (hipMalloc ((void **)&work, sizeof (float2 ) * lwork));
138+
139+ // compute eigenvalues and eigenvectors.
140+ hipsolverErrcheck (hipsolverDnChegvd (
141+ hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
142+ nstart,
143+ reinterpret_cast <float2 *>(_vcc), ldh,
144+ const_cast <float2 *>(reinterpret_cast <const float2 *>(_scc)), ldh,
145+ _eigenvalue,
146+ work, lwork, devInfo));
147+
148+ hipErrcheck (hipMemcpy (&info_gpu, devInfo, sizeof (int ), hipMemcpyDeviceToHost));
149+ // free the buffer
150+ hipErrcheck (hipFree (work));
151+ hipErrcheck (hipFree (devInfo));
152+ }
146153 // if(fail_info != nullptr) *fail_info = info_gpu;
154+ else {
155+ std::vector<std::complex <float >> hcc (nstart * nstart, {0 , 0 });
156+ std::vector<std::complex <float >> scc (nstart * nstart, {0 , 0 });
157+ std::vector<std::complex <float >> vcc (nstart * nstart, {0 , 0 });
158+ std::vector<float > eigenvalue (nstart, 0 );
159+ hipErrcheck (hipMemcpy (hcc.data (), _hcc, sizeof (std::complex <float >) * hcc.size (), hipMemcpyDeviceToHost));
160+ hipErrcheck (hipMemcpy (scc.data (), _scc, sizeof (std::complex <float >) * scc.size (), hipMemcpyDeviceToHost));
161+ base_device::DEVICE_CPU* cpu_ctx = {};
162+ dngvd_op<std::complex <float >, base_device::DEVICE_CPU>()(cpu_ctx,
163+ nstart,
164+ ldh,
165+ hcc.data (),
166+ scc.data (),
167+ eigenvalue.data (),
168+ vcc.data ());
169+ hipErrcheck (hipMemcpy (_vcc, vcc.data (), sizeof (std::complex <float >) * vcc.size (), hipMemcpyHostToDevice));
170+ hipErrcheck (hipMemcpy (_eigenvalue, eigenvalue.data (), sizeof (float ) * eigenvalue.size (), hipMemcpyHostToDevice));
171+ }
147172
148-
149- // std::vector<std::complex<float>> hcc(nstart * nstart, {0, 0});
150- // std::vector<std::complex<float>> scc(nstart * nstart, {0, 0});
151- // std::vector<std::complex<float>> vcc(nstart * nstart, {0, 0});
152- // std::vector<float> eigenvalue(nstart, 0);
153- // hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<float>) * hcc.size(), hipMemcpyDeviceToHost));
154- // hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<float>) * scc.size(), hipMemcpyDeviceToHost));
155- // base_device::DEVICE_CPU* cpu_ctx = {};
156- // dngvd_op<std::complex<float>, base_device::DEVICE_CPU>()(cpu_ctx,
157- // nstart,
158- // ldh,
159- // hcc.data(),
160- // scc.data(),
161- // eigenvalue.data(),
162- // vcc.data(),
163- // fail_info);
164- // hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<float>) * vcc.size(), hipMemcpyHostToDevice));
165- // hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice));
173+
166174}
167175
168176template <>
@@ -179,69 +187,73 @@ void dngvd_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const b
179187 assert (nstart == ldh);
180188
181189 // save a copy of scc in case the diagonalization fails
182- std::vector<std::complex <double >> scc (nstart * nstart, {0 , 0 });
183- hipErrcheck (hipMemcpy (scc.data (), _scc, sizeof (std::complex <double >) * scc.size (), hipMemcpyDeviceToHost));
184-
185- hipErrcheck (hipMemcpy (_vcc, _hcc, sizeof (std::complex <double >) * ldh * nstart, hipMemcpyDeviceToDevice));
186-
187- // now vcc contains hcc
188-
189- // prepare some values for hipsolverDnZhegvd_bufferSize
190- int * devInfo = nullptr ;
191- int lwork = 0 , info_gpu = 0 ;
192- double2 * work = nullptr ;
193- hipErrcheck (hipMalloc ((void **)&devInfo, sizeof (int )));
194- hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
195-
196- // calculate the sizes needed for pre-allocated buffer.
197- hipsolverErrcheck (hipsolverDnZhegvd_bufferSize (
198- hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
199- nstart,
200- reinterpret_cast <const double2 *>(_vcc), ldh,
201- reinterpret_cast <const double2 *>(_scc), ldh,
202- _eigenvalue,
203- &lwork));
204-
205- // allocate memery
206- hipErrcheck (hipMalloc ((void **)&work, sizeof (double2 ) * lwork));
207-
208- // compute eigenvalues and eigenvectors.
209- hipsolverErrcheck (hipsolverDnZhegvd (
210- hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
211- nstart,
212- reinterpret_cast <double2 *>(_vcc), ldh,
213- const_cast <double2 *>(reinterpret_cast <const double2 *>(_scc)), ldh,
214- _eigenvalue,
215- work, lwork, devInfo));
216-
217- hipErrcheck (hipMemcpy (&info_gpu, devInfo, sizeof (int ), hipMemcpyDeviceToHost));
218- // free the buffer
219- hipErrcheck (hipFree (work));
220- hipErrcheck (hipFree (devInfo));
190+ if (nstart > N_DCU){
191+ std::vector<std::complex <double >> scc (nstart * nstart, {0 , 0 });
192+ hipErrcheck (hipMemcpy (scc.data (), _scc, sizeof (std::complex <double >) * scc.size (), hipMemcpyDeviceToHost));
193+
194+ hipErrcheck (hipMemcpy (_vcc, _hcc, sizeof (std::complex <double >) * ldh * nstart, hipMemcpyDeviceToDevice));
195+
196+ // now vcc contains hcc
197+
198+ // prepare some values for hipsolverDnZhegvd_bufferSize
199+ int * devInfo = nullptr ;
200+ int lwork = 0 , info_gpu = 0 ;
201+ double2 * work = nullptr ;
202+ hipErrcheck (hipMalloc ((void **)&devInfo, sizeof (int )));
203+ hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
204+
205+ // calculate the sizes needed for pre-allocated buffer.
206+ hipsolverErrcheck (hipsolverDnZhegvd_bufferSize (
207+ hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
208+ nstart,
209+ reinterpret_cast <const double2 *>(_vcc), ldh,
210+ reinterpret_cast <const double2 *>(_scc), ldh,
211+ _eigenvalue,
212+ &lwork));
213+
214+ // allocate memery
215+ hipErrcheck (hipMalloc ((void **)&work, sizeof (double2 ) * lwork));
216+
217+ // compute eigenvalues and eigenvectors.
218+ hipsolverErrcheck (hipsolverDnZhegvd (
219+ hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
220+ nstart,
221+ reinterpret_cast <double2 *>(_vcc), ldh,
222+ const_cast <double2 *>(reinterpret_cast <const double2 *>(_scc)), ldh,
223+ _eigenvalue,
224+ work, lwork, devInfo));
225+
226+ hipErrcheck (hipMemcpy (&info_gpu, devInfo, sizeof (int ), hipMemcpyDeviceToHost));
227+ // free the buffer
228+ hipErrcheck (hipFree (work));
229+ hipErrcheck (hipFree (devInfo));
230+ }
221231 // if(fail_info != nullptr) *fail_info = info_gpu;
232+ else {
233+ std::vector<std::complex <double >> hcc (nstart * nstart, {0 , 0 });
234+ std::vector<std::complex <double >> scc (nstart * nstart, {0 , 0 });
235+ std::vector<std::complex <double >> vcc (nstart * nstart, {0 , 0 });
236+ std::vector<double > eigenvalue (nstart, 0 );
237+ hipErrcheck (hipMemcpy (hcc.data (), _hcc, sizeof (std::complex <double >) * hcc.size (), hipMemcpyDeviceToHost));
238+ hipErrcheck (hipMemcpy (scc.data (), _scc, sizeof (std::complex <double >) * scc.size (), hipMemcpyDeviceToHost));
239+ base_device::DEVICE_CPU* cpu_ctx = {};
240+ dngvd_op<std::complex <double >, base_device::DEVICE_CPU>()(cpu_ctx,
241+ nstart,
242+ ldh,
243+ hcc.data (),
244+ scc.data (),
245+ eigenvalue.data (),
246+ vcc.data ());
247+ hipErrcheck (hipMemcpy (_vcc, vcc.data (), sizeof (std::complex <double >) * vcc.size (), hipMemcpyHostToDevice));
248+ hipErrcheck (hipMemcpy (_eigenvalue, eigenvalue.data (), sizeof (double ) * eigenvalue.size (), hipMemcpyHostToDevice));
249+ }
222250
223251
224252
225253
226254
227255
228-
229- /* std::vector<std::complex<double>> hcc(nstart * nstart, {0, 0});
230- std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
231- std::vector<std::complex<double>> vcc(nstart * nstart, {0, 0});
232- std::vector<double> eigenvalue(nstart, 0);
233- hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<double>) * hcc.size(), hipMemcpyDeviceToHost));
234- hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));
235- base_device::DEVICE_CPU* cpu_ctx = {};
236- dngvd_op<std::complex<double>, base_device::DEVICE_CPU>()(cpu_ctx,
237- nstart,
238- ldh,
239- hcc.data(),
240- scc.data(),
241- eigenvalue.data(),
242- vcc.data());
243- hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<double>) * vcc.size(), hipMemcpyHostToDevice));
244- hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));*/
256+
245257}
246258
247259#ifdef __LCAO
0 commit comments