Skip to content

Commit b1a3c15

Browse files
committed
add debug for dcu codes
1 parent 6427bb4 commit b1a3c15

File tree

6 files changed

+82
-23
lines changed

6 files changed

+82
-23
lines changed

source/module_base/module_device/device.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55

66
#include <base/macros/macros.h>
77
#include <cstring>
8-
8+
#include <chrono>
9+
#include <iostream>
910
#ifdef __MPI
1011
#include "mpi.h"
1112
#endif
@@ -166,6 +167,11 @@ int device_count = -1;
166167
cudaGetDeviceCount(&device_count);
167168
#elif defined(__ROCM)
168169
hipGetDeviceCount(&device_count);
170+
/***auto start_time = std::chrono::high_resolution_clock::now();
171+
std::cout << "Starting hipGetDeviceCount.." << std::endl;
172+
auto end_time = std::chrono::high_resolution_clock::now();
173+
auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
174+
std::cout << "hipGetDeviceCount took " << duration.count() << "seconds" << std::endl;***/
169175
#endif
170176
if (device_count <= 0)
171177
{
@@ -711,4 +717,4 @@ void record_device_memory<base_device::DEVICE_GPU>(
711717
#endif
712718

713719
} // end of namespace information
714-
} // end of namespace base_device
720+
} // end of namespace base_device

source/module_esolver/esolver.cpp

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -107,13 +107,21 @@ std::string determine_type()
107107
}
108108
if (GlobalV::MY_RANK == 0)
109109
{
110-
std::cout << " RUNNING WITH DEVICE : " << device_info << " / "
110+
/***auto start_time = std::chrono::high_resolution_clock::now();
111+
std::cout << "Starting hipGetDeviceInfo..." << std::endl;***/
112+
std::cout << " RUNNING WITH DEVICE : " << device_info << " / "
111113
<< base_device::information::get_device_info(PARAM.inp.device) << std::endl;
114+
/***auto end_time = std::chrono::high_resolution_clock::now();
115+
auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
116+
std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/
112117
}
113-
118+
/*** auto start_time = std::chrono::high_resolution_clock::now();
119+
std::cout << "Starting hipGetDeviceInfo..." << std::endl;***/
114120
GlobalV::ofs_running << "\n RUNNING WITH DEVICE : " << device_info << " / "
115-
<< base_device::information::get_device_info(PARAM.inp.device) << std::endl;
116-
121+
<< base_device::information::get_device_info(PARAM.inp.device) << std::endl;
122+
/***auto end_time = std::chrono::high_resolution_clock::now();
123+
auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
124+
std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/
117125
return esolver_type;
118126
}
119127

source/module_hsolver/diago_dav_subspace.cpp

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#include "module_hsolver/kernels/dngvd_op.h"
88
#include "module_hsolver/kernels/math_kernel_op.h"
99
#include "module_base/kernels/dsp/dsp_connector.h"
10-
10+
#include <hip/hip_runtime.h>
1111
#include <vector>
1212

1313
using namespace hsolver;
@@ -261,6 +261,16 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
261261
bool test_precond = true;
262262
bool test_norm = true;
263263
ModuleBase::timer::tick("Diago_DavSubspace", "cal_grad");
264+
/***#if defined(__CUDA) || defined(__ROCM)
265+
if (this->device == base_device::GpuDevice) {
266+
size_t free_mem, total_mem;
267+
hipMemGetInfo(&free_mem, &total_mem);
268+
std::cout << "\n[Memory Debug] Before operations - Free GPU memory: " << free_mem / 1024.0 / 1024.0
269+
<< " MB, Total: " << total_mem / 1024.0 / 1024.0 << " MB" << std::endl;
270+
std::cout << "[Memory Debug] Dimensions - dim: " << dim << ", nbase: " << nbase
271+
<< ", notconv: " << notconv << std::endl;
272+
}
273+
#endif***/
264274
for (size_t i = 0; i < notconv; i++)
265275
{
266276
if (unconv[i] != i)
@@ -292,13 +302,27 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
292302

293303
// Eigenvalues operation section
294304
if (test_eigs) {
305+
/***#if defined(__CUDA) || defined(__ROCM)
306+
if (this->device == base_device::GpuDevice) {
307+
size_t free_mem, total_mem;
308+
hipMemGetInfo(&free_mem, &total_mem);
309+
std::cout << "\n[Memory Debug] Before eigenvalues op - Free GPU memory: " << free_mem / 1024.0 / 1024.0
310+
<< " MB" << std::endl;
311+
}
312+
#endif***/
295313
// Original implementation
296314
std::vector<Real> e_temp_cpu(this->notconv, 0);
297315
Real* e_temp_hd = e_temp_cpu.data();
298316
if (this->device == base_device::GpuDevice)
299317
{
300318
e_temp_hd = nullptr;
301-
resmem_real_op()(this->ctx, e_temp_hd, this->notconv);
319+
try {
320+
resmem_real_op()(this->ctx, e_temp_hd, this->notconv);
321+
} catch (const std::exception& e) {
322+
std::cerr << "[Memory Debug] Failed to allocate e_temp_hd of size "
323+
<< (this->notconv * sizeof(Real)) << " bytes: " << e.what() << std::endl;
324+
throw;
325+
}
302326
}
303327

304328
for (int m = 0; m < this->notconv; m++)
@@ -371,10 +395,18 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
371395
#if defined(__CUDA) || defined(__ROCM)
372396
if (this->device == base_device::GpuDevice)
373397
{
398+
/*** size_t free_mem, total_mem;
399+
hipMemGetInfo(&free_mem, &total_mem);
400+
std::cout << "\n[Memory Debug] Before precondition op - Free GPU memory: " << free_mem / 1024.0 / 1024.0 << " MB" << std::endl;***/
374401
Real* eigenvalues_gpu = nullptr;
375-
resmem_real_op()(this->ctx, eigenvalues_gpu, notconv);
376-
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, eigenvalues_gpu, (*eigenvalue_iter).data(), notconv);
377-
402+
try {
403+
resmem_real_op()(this->ctx, eigenvalues_gpu, notconv);
404+
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, eigenvalues_gpu,(*eigenvalue_iter).data(), notconv);
405+
} catch (const std::exception& e) {
406+
std::cerr << "[Memory Debug] Failed to allocate eigenvalues_gpu of size "
407+
<< (notconv * sizeof(Real)) << " bytes: " << e.what() << std::endl;
408+
throw;
409+
}
378410
precondition_op<T, Device>()(this->ctx,
379411
this->dim,
380412
psi_iter,
@@ -433,9 +465,20 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
433465
#if defined(__CUDA) || defined(__ROCM)
434466
if (this->device == base_device::GpuDevice)
435467
{
468+
/***size_t free_mem, total_mem;
469+
hipMemGetInfo(&free_mem, &total_mem);
470+
std::cout << "\n[Memory Debug] Before normalize op - Free GPU memory: " << free_mem / 1024.0 / 1024.0 << " MB" << std::endl;***/
436471
Real* psi_norm = nullptr;
437-
resmem_real_op()(this->ctx, psi_norm, notconv);
438-
cudaMemset(psi_norm, 0, notconv * sizeof(Real));
472+
try {
473+
resmem_real_op()(this->ctx, psi_norm, notconv);
474+
using setmem_real_op = base_device::memory::set_memory_op<Real, Device>;
475+
setmem_real_op()(this->ctx, psi_norm, 0.0, notconv);
476+
//hipMemset(psi_norm, 0, notconv * sizeof(Real));
477+
} catch (const std::exception& e){
478+
std::cerr << "[Memory Debug] Failed to allocate psi_norm of size "
479+
<< (notconv * sizeof(Real)) << " bytes: " << e.what() << std::endl;
480+
throw;
481+
}
439482
normalize_op<T, Device>()(this->ctx,
440483
this->dim,
441484
psi_iter,

source/module_hsolver/hsolver_pw.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,4 +118,4 @@ class HSolverPW
118118

119119
} // namespace hsolver
120120

121-
#endif
121+
#endif

source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,15 @@
1212
#define FULL_MASK 0xffffffff
1313
#define THREAD_PER_BLOCK 256
1414

15+
template <>
16+
struct GetTypeReal<thrust::complex<float>> {
17+
using type = float; /**< The real type paired with thrust::complex<float>. */
18+
};
19+
template <>
20+
struct GetTypeReal<thrust::complex<double>> {
21+
using type = double; /**< The real type paired with thrust::complex<double>. */
22+
};
23+
1524
// Forward declarations for abs2
1625
template<typename T>
1726
__device__ typename GetTypeReal<T>::type abs2(const T& x);
@@ -54,14 +63,6 @@ __device__ double abs2(const std::complex<double>& x) {
5463
return tx->real() * tx->real() + tx->imag() * tx->imag();
5564
}
5665

57-
template <>
58-
struct GetTypeReal<thrust::complex<float>> {
59-
using type = float; /**< The return type specialization for std::complex<double>. */
60-
};
61-
template <>
62-
struct GetTypeReal<thrust::complex<double>> {
63-
using type = double; /**< The return type specialization for std::complex<double>. */
64-
};
6566

6667
namespace hsolver {
6768

source/module_psi/psi_init.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ void PSIInit<T, Device>::initialize_psi(Psi<std::complex<double>>* psi,
265265
{
266266
for (int ik = 0; ik < this->pw_wfc->nks; ++ik)
267267
{
268+
if(ik > 0) continue;
268269
//! Update Hamiltonian from other kpoint to the given one
269270
p_hamilt->updateHk(ik);
270271

@@ -316,4 +317,4 @@ template class PSIInit<std::complex<double>, base_device::DEVICE_CPU>;
316317
template class PSIInit<std::complex<float>, base_device::DEVICE_GPU>;
317318
template class PSIInit<std::complex<double>, base_device::DEVICE_GPU>;
318319
#endif
319-
} // namespace psi
320+
} // namespace psi

0 commit comments

Comments
 (0)