Skip to content

Commit a871050

Browse files
committed
dcu optimize
1 parent 87568f0 commit a871050

File tree

1 file changed

+176
-164
lines changed

1 file changed

+176
-164
lines changed

source/module_hsolver/kernels/rocm/dngvd_op.hip.cu

Lines changed: 176 additions & 164 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ namespace hsolver {
88
// NOTE: mimicked from ../cuda/dngvd_op.cu for three dngvd_op
99

1010
static hipsolverHandle_t hipsolver_H = nullptr;
11+
// Test on DCU platform. When nstart is greater than 234, code on DCU performs better.
12+
const int N_DCU = 234;
1113

1214
void createGpuSolverHandle() {
1315
if (hipsolver_H == nullptr)
@@ -37,62 +39,65 @@ void dngvd_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DE
3739
// copied from ../cuda/dngvd_op.cu, "dngvd_op"
3840
assert(nstart == ldh);
3941

40-
hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(double) * ldh * nstart, hipMemcpyDeviceToDevice));
41-
// now vcc contains hcc
42-
43-
// prepare some values for hipsolverDnZhegvd_bufferSize
44-
int * devInfo = nullptr;
45-
int lwork = 0, info_gpu = 0;
46-
double * work = nullptr;
47-
hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
48-
hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
49-
50-
// calculate the sizes needed for pre-allocated buffer.
51-
hipsolverErrcheck(hipsolverDnDsygvd_bufferSize(
52-
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
53-
nstart,
54-
_vcc, ldh,
55-
_scc, ldh,
56-
_eigenvalue,
57-
&lwork));
58-
59-
// allocate memery
60-
hipErrcheck(hipMalloc((void**)&work, sizeof(double) * lwork));
61-
62-
// compute eigenvalues and eigenvectors.
63-
hipsolverErrcheck(hipsolverDnDsygvd(
64-
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
65-
nstart,
66-
_vcc, ldh,
67-
const_cast<double *>(_scc), ldh,
68-
_eigenvalue,
69-
work, lwork, devInfo));
70-
71-
hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
72-
73-
// free the buffer
74-
hipErrcheck(hipFree(work));
75-
hipErrcheck(hipFree(devInfo));
42+
if (nstart > N_DCU){
43+
hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(double) * ldh * nstart, hipMemcpyDeviceToDevice));
44+
// now vcc contains hcc
45+
46+
// prepare some values for hipsolverDnZhegvd_bufferSize
47+
int * devInfo = nullptr;
48+
int lwork = 0, info_gpu = 0;
49+
double * work = nullptr;
50+
hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
51+
hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
52+
53+
// calculate the sizes needed for pre-allocated buffer.
54+
hipsolverErrcheck(hipsolverDnDsygvd_bufferSize(
55+
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
56+
nstart,
57+
_vcc, ldh,
58+
_scc, ldh,
59+
_eigenvalue,
60+
&lwork));
61+
62+
// allocate memery
63+
hipErrcheck(hipMalloc((void**)&work, sizeof(double) * lwork));
64+
65+
// compute eigenvalues and eigenvectors.
66+
hipsolverErrcheck(hipsolverDnDsygvd(
67+
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
68+
nstart,
69+
_vcc, ldh,
70+
const_cast<double *>(_scc), ldh,
71+
_eigenvalue,
72+
work, lwork, devInfo));
73+
74+
hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
75+
76+
// free the buffer
77+
hipErrcheck(hipFree(work));
78+
hipErrcheck(hipFree(devInfo));
79+
}
7680
// if(fail_info != nullptr) *fail_info = info_gpu;
81+
else{
82+
std::vector<double> hcc(nstart * nstart, 0.0);
83+
std::vector<double> scc(nstart * nstart, 0.0);
84+
std::vector<double> vcc(nstart * nstart, 0.0);
85+
std::vector<double> eigenvalue(nstart, 0);
86+
hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost));
87+
hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost));
88+
base_device::DEVICE_CPU* cpu_ctx = {};
89+
dngvd_op<double, base_device::DEVICE_CPU>()(cpu_ctx,
90+
nstart,
91+
ldh,
92+
hcc.data(),
93+
scc.data(),
94+
eigenvalue.data(),
95+
vcc.data());
96+
hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice));
97+
hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
98+
}
7799

78-
79-
//std::vector<double> hcc(nstart * nstart, 0.0);
80-
//std::vector<double> scc(nstart * nstart, 0.0);
81-
//std::vector<double> vcc(nstart * nstart, 0.0);
82-
//std::vector<double> eigenvalue(nstart, 0);
83-
//hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost));
84-
//hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost));
85-
//base_device::DEVICE_CPU* cpu_ctx = {};
86-
//dngvd_op<double, base_device::DEVICE_CPU>()(cpu_ctx,
87-
// nstart,
88-
// ldh,
89-
// hcc.data(),
90-
// scc.data(),
91-
// eigenvalue.data(),
92-
// vcc.data(),
93-
// fail_info);
94-
//hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice));
95-
//hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
100+
96101
}
97102
#endif // __LCAO
98103

@@ -107,62 +112,65 @@ void dngvd_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const ba
107112
{
108113
// copied from ../cuda/dngvd_op.cu, "dngvd_op"
109114
assert(nstart == ldh);
110-
111-
hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex<float>) * ldh * nstart, hipMemcpyDeviceToDevice));
112-
// now vcc contains hcc
113-
114-
// prepare some values for hipsolverDnZhegvd_bufferSize
115-
int * devInfo = nullptr;
116-
int lwork = 0, info_gpu = 0;
117-
float2 * work = nullptr;
118-
hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
119-
hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
120-
121-
// calculate the sizes needed for pre-allocated buffer.
122-
hipsolverErrcheck(hipsolverDnChegvd_bufferSize(
123-
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
124-
nstart,
125-
reinterpret_cast<const float2 *>(_vcc), ldh,
126-
reinterpret_cast<const float2 *>(_scc), ldh,
127-
_eigenvalue,
128-
&lwork));
129-
130-
// allocate memery
131-
hipErrcheck(hipMalloc((void**)&work, sizeof(float2) * lwork));
132-
133-
// compute eigenvalues and eigenvectors.
134-
hipsolverErrcheck(hipsolverDnChegvd(
135-
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
136-
nstart,
137-
reinterpret_cast<float2 *>(_vcc), ldh,
138-
const_cast<float2 *>(reinterpret_cast<const float2 *>(_scc)), ldh,
139-
_eigenvalue,
140-
work, lwork, devInfo));
141-
142-
hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
143-
// free the buffer
144-
hipErrcheck(hipFree(work));
145-
hipErrcheck(hipFree(devInfo));
115+
116+
if (nstart > N_DCU){
117+
hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex<float>) * ldh * nstart, hipMemcpyDeviceToDevice));
118+
// now vcc contains hcc
119+
120+
// prepare some values for hipsolverDnZhegvd_bufferSize
121+
int * devInfo = nullptr;
122+
int lwork = 0, info_gpu = 0;
123+
float2 * work = nullptr;
124+
hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
125+
hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
126+
127+
// calculate the sizes needed for pre-allocated buffer.
128+
hipsolverErrcheck(hipsolverDnChegvd_bufferSize(
129+
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
130+
nstart,
131+
reinterpret_cast<const float2 *>(_vcc), ldh,
132+
reinterpret_cast<const float2 *>(_scc), ldh,
133+
_eigenvalue,
134+
&lwork));
135+
136+
// allocate memery
137+
hipErrcheck(hipMalloc((void**)&work, sizeof(float2) * lwork));
138+
139+
// compute eigenvalues and eigenvectors.
140+
hipsolverErrcheck(hipsolverDnChegvd(
141+
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
142+
nstart,
143+
reinterpret_cast<float2 *>(_vcc), ldh,
144+
const_cast<float2 *>(reinterpret_cast<const float2 *>(_scc)), ldh,
145+
_eigenvalue,
146+
work, lwork, devInfo));
147+
148+
hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
149+
// free the buffer
150+
hipErrcheck(hipFree(work));
151+
hipErrcheck(hipFree(devInfo));
152+
}
146153
// if(fail_info != nullptr) *fail_info = info_gpu;
154+
else{
155+
std::vector<std::complex<float>> hcc(nstart * nstart, {0, 0});
156+
std::vector<std::complex<float>> scc(nstart * nstart, {0, 0});
157+
std::vector<std::complex<float>> vcc(nstart * nstart, {0, 0});
158+
std::vector<float> eigenvalue(nstart, 0);
159+
hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<float>) * hcc.size(), hipMemcpyDeviceToHost));
160+
hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<float>) * scc.size(), hipMemcpyDeviceToHost));
161+
base_device::DEVICE_CPU* cpu_ctx = {};
162+
dngvd_op<std::complex<float>, base_device::DEVICE_CPU>()(cpu_ctx,
163+
nstart,
164+
ldh,
165+
hcc.data(),
166+
scc.data(),
167+
eigenvalue.data(),
168+
vcc.data());
169+
hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<float>) * vcc.size(), hipMemcpyHostToDevice));
170+
hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice));
171+
}
147172

148-
149-
//std::vector<std::complex<float>> hcc(nstart * nstart, {0, 0});
150-
//std::vector<std::complex<float>> scc(nstart * nstart, {0, 0});
151-
//std::vector<std::complex<float>> vcc(nstart * nstart, {0, 0});
152-
//std::vector<float> eigenvalue(nstart, 0);
153-
//hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<float>) * hcc.size(), hipMemcpyDeviceToHost));
154-
//hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<float>) * scc.size(), hipMemcpyDeviceToHost));
155-
//base_device::DEVICE_CPU* cpu_ctx = {};
156-
//dngvd_op<std::complex<float>, base_device::DEVICE_CPU>()(cpu_ctx,
157-
// nstart,
158-
// ldh,
159-
// hcc.data(),
160-
// scc.data(),
161-
// eigenvalue.data(),
162-
// vcc.data(),
163-
// fail_info);
164-
//hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<float>) * vcc.size(), hipMemcpyHostToDevice));
165-
//hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice));
173+
166174
}
167175

168176
template <>
@@ -179,69 +187,73 @@ void dngvd_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const b
179187
assert(nstart == ldh);
180188

181189
// save a copy of scc in case the diagonalization fails
182-
std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
183-
hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));
184-
185-
hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex<double>) * ldh * nstart, hipMemcpyDeviceToDevice));
186-
187-
// now vcc contains hcc
188-
189-
// prepare some values for hipsolverDnZhegvd_bufferSize
190-
int * devInfo = nullptr;
191-
int lwork = 0, info_gpu = 0;
192-
double2 * work = nullptr;
193-
hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
194-
hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
195-
196-
// calculate the sizes needed for pre-allocated buffer.
197-
hipsolverErrcheck(hipsolverDnZhegvd_bufferSize(
198-
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
199-
nstart,
200-
reinterpret_cast<const double2 *>(_vcc), ldh,
201-
reinterpret_cast<const double2 *>(_scc), ldh,
202-
_eigenvalue,
203-
&lwork));
204-
205-
// allocate memery
206-
hipErrcheck(hipMalloc((void**)&work, sizeof(double2) * lwork));
207-
208-
// compute eigenvalues and eigenvectors.
209-
hipsolverErrcheck(hipsolverDnZhegvd(
210-
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
211-
nstart,
212-
reinterpret_cast<double2 *>(_vcc), ldh,
213-
const_cast<double2 *>(reinterpret_cast<const double2 *>(_scc)), ldh,
214-
_eigenvalue,
215-
work, lwork, devInfo));
216-
217-
hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
218-
// free the buffer
219-
hipErrcheck(hipFree(work));
220-
hipErrcheck(hipFree(devInfo));
190+
if (nstart > N_DCU){
191+
std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
192+
hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));
193+
194+
hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex<double>) * ldh * nstart, hipMemcpyDeviceToDevice));
195+
196+
// now vcc contains hcc
197+
198+
// prepare some values for hipsolverDnZhegvd_bufferSize
199+
int * devInfo = nullptr;
200+
int lwork = 0, info_gpu = 0;
201+
double2 * work = nullptr;
202+
hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
203+
hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
204+
205+
// calculate the sizes needed for pre-allocated buffer.
206+
hipsolverErrcheck(hipsolverDnZhegvd_bufferSize(
207+
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
208+
nstart,
209+
reinterpret_cast<const double2 *>(_vcc), ldh,
210+
reinterpret_cast<const double2 *>(_scc), ldh,
211+
_eigenvalue,
212+
&lwork));
213+
214+
// allocate memery
215+
hipErrcheck(hipMalloc((void**)&work, sizeof(double2) * lwork));
216+
217+
// compute eigenvalues and eigenvectors.
218+
hipsolverErrcheck(hipsolverDnZhegvd(
219+
hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
220+
nstart,
221+
reinterpret_cast<double2 *>(_vcc), ldh,
222+
const_cast<double2 *>(reinterpret_cast<const double2 *>(_scc)), ldh,
223+
_eigenvalue,
224+
work, lwork, devInfo));
225+
226+
hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
227+
// free the buffer
228+
hipErrcheck(hipFree(work));
229+
hipErrcheck(hipFree(devInfo));
230+
}
221231
// if(fail_info != nullptr) *fail_info = info_gpu;
232+
else{
233+
std::vector<std::complex<double>> hcc(nstart * nstart, {0, 0});
234+
std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
235+
std::vector<std::complex<double>> vcc(nstart * nstart, {0, 0});
236+
std::vector<double> eigenvalue(nstart, 0);
237+
hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<double>) * hcc.size(), hipMemcpyDeviceToHost));
238+
hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));
239+
base_device::DEVICE_CPU* cpu_ctx = {};
240+
dngvd_op<std::complex<double>, base_device::DEVICE_CPU>()(cpu_ctx,
241+
nstart,
242+
ldh,
243+
hcc.data(),
244+
scc.data(),
245+
eigenvalue.data(),
246+
vcc.data());
247+
hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<double>) * vcc.size(), hipMemcpyHostToDevice));
248+
hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
249+
}
222250

223251

224252

225253

226254

227255

228-
229-
/*std::vector<std::complex<double>> hcc(nstart * nstart, {0, 0});
230-
std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
231-
std::vector<std::complex<double>> vcc(nstart * nstart, {0, 0});
232-
std::vector<double> eigenvalue(nstart, 0);
233-
hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<double>) * hcc.size(), hipMemcpyDeviceToHost));
234-
hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));
235-
base_device::DEVICE_CPU* cpu_ctx = {};
236-
dngvd_op<std::complex<double>, base_device::DEVICE_CPU>()(cpu_ctx,
237-
nstart,
238-
ldh,
239-
hcc.data(),
240-
scc.data(),
241-
eigenvalue.data(),
242-
vcc.data());
243-
hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<double>) * vcc.size(), hipMemcpyHostToDevice));
244-
hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));*/
256+
245257
}
246258

247259
#ifdef __LCAO

0 commit comments

Comments
 (0)