Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 27 additions & 15 deletions source/module_base/kernels/cuda/math_kernel_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -325,16 +325,23 @@ __global__ void vector_div_constant_kernel(
}

template <typename T>
__global__ void vector_mul_vector_kernel(
const int size,
T* result,
const T* vector1,
const typename GetTypeReal<T>::type* vector2)
__global__ void vector_mul_vector_kernel(const int size,
T* result,
const T* vector1,
const typename GetTypeReal<T>::type* vector2,
const bool add)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < size)
{
result[i] = vector1[i] * vector2[i];
if (add)
{
result[i] += vector1[i] * vector2[i];
}
else
{
result[i] = vector1[i] * vector2[i];
}
}
}

Expand Down Expand Up @@ -548,11 +555,12 @@ template <>
void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
double* result,
const double* vector1,
const double* vector2)
const double* vector2,
const bool& add)
{
int thread = thread_per_block;
int block = (dim + thread - 1) / thread;
vector_mul_vector_kernel<double> <<<block, thread >>> (dim, result, vector1, vector2);
vector_mul_vector_kernel<double> <<<block, thread >>> (dim, result, vector1, vector2, add);

cudaCheckOnDebug();
}
Expand All @@ -561,32 +569,35 @@ template <typename FPTYPE>
inline void vector_mul_vector_complex_wrapper(const int& dim,
std::complex<FPTYPE>* result,
const std::complex<FPTYPE>* vector1,
const FPTYPE* vector2)
const FPTYPE* vector2,
const bool& add)
{
thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
int thread = thread_per_block;
int block = (dim + thread - 1) / thread;
vector_mul_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector1_tmp, vector2);
vector_mul_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector1_tmp, vector2, add);

cudaCheckOnDebug();
}
template <>
void vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
std::complex<float>* result,
const std::complex<float>* vector1,
const float* vector2)
const float* vector2,
const bool& add)
{
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
}
template <>
void vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
const int& dim,
std::complex<double>* result,
const std::complex<double>* vector1,
const double* vector2)
const double* vector2,
const bool& add)
{
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
}

// vector operator: result[i] = vector1[i](not complex) / vector2[i](not complex)
Expand Down Expand Up @@ -1019,6 +1030,7 @@ template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
Expand All @@ -1029,6 +1041,7 @@ template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
Expand All @@ -1039,7 +1052,6 @@ template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
#ifdef __LCAO
template struct dot_real_op<double, base_device::DEVICE_GPU>;
template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
#endif
} // namespace ModuleBase
19 changes: 16 additions & 3 deletions source/module_base/kernels/math_kernel_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,14 +167,27 @@ template <typename T>
struct vector_mul_vector_op<T, base_device::DEVICE_CPU>
{
using Real = typename GetTypeReal<T>::type;
void operator()(const int& dim, T* result, const T* vector1, const Real* vector2)
void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add)
{
if (add)
{
#ifdef _OPENMP
#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
#endif
for (int i = 0; i < dim; i++)
for (int i = 0; i < dim; i++)
{
result[i] += vector1[i] * vector2[i];
}
}
else
{
result[i] = vector1[i] * vector2[i];
#ifdef _OPENMP
#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
#endif
for (int i = 0; i < dim; i++)
{
result[i] = vector1[i] * vector2[i];
}
}
}
};
Expand Down
7 changes: 3 additions & 4 deletions source/module_base/kernels/math_kernel_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,11 +143,11 @@ template <typename T, typename Device> struct vector_mul_vector_op {
/// \param dim : array size
/// \param vector1 : input array A
/// \param vector2 : input array B
/// \param add : flag to control whether to add the result to the output array
///
/// Output Parameters
/// \param result : output array
void operator()(const int &dim, T *result, const T *vector1,
const Real *vector2);
void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
};

// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
Expand Down Expand Up @@ -350,8 +350,7 @@ struct vector_div_constant_op<T, base_device::DEVICE_GPU> {
// vector operator: result[i] = vector1[i](complex) * vector2[i](not complex)
template <typename T> struct vector_mul_vector_op<T, base_device::DEVICE_GPU> {
using Real = typename GetTypeReal<T>::type;
void operator()(const int &dim, T *result,
const T *vector1, const Real *vector2);
void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
};

// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
Expand Down
35 changes: 24 additions & 11 deletions source/module_base/kernels/rocm/math_kernel_op.hip.cu
Original file line number Diff line number Diff line change
Expand Up @@ -248,12 +248,20 @@ __global__ void vector_mul_vector_kernel(
const int size,
T* result,
const T* vector1,
const typename GetTypeReal<T>::type* vector2)
const typename GetTypeReal<T>::type* vector2,
const bool add)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < size)
{
result[i] = vector1[i] * vector2[i];
if (add)
{
result[i] += vector1[i] * vector2[i];
}
else
{
result[i] = vector1[i] * vector2[i];
}
}
}

Expand Down Expand Up @@ -471,11 +479,12 @@ template <>
void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
double* result,
const double* vector1,
const double* vector2)
const double* vector2,
const bool& add)
{
int thread = 1024;
int block = (dim + thread - 1) / thread;
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<double>), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2);
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<double>), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2, add);

hipCheckOnDebug();
}
Expand All @@ -485,32 +494,35 @@ template <typename FPTYPE>
inline void vector_mul_vector_complex_wrapper(const int& dim,
std::complex<FPTYPE>* result,
const std::complex<FPTYPE>* vector1,
const FPTYPE* vector2)
const FPTYPE* vector2,
const bool& add)
{
thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
int thread = 1024;
int block = (dim + thread - 1) / thread;
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<thrust::complex<FPTYPE>>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2);
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<thrust::complex<FPTYPE>>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2, add);

hipCheckOnDebug();
}
template <>
void vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
std::complex<float>* result,
const std::complex<float>* vector1,
const float* vector2)
const float* vector2,
const bool& add)
{
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
}
template <>
void vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
const int& dim,
std::complex<double>* result,
const std::complex<double>* vector1,
const double* vector2)
const double* vector2,
const bool& add)
{
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
}
// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
template <>
Expand Down Expand Up @@ -931,6 +943,7 @@ template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
Expand All @@ -940,6 +953,7 @@ template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
Expand All @@ -948,7 +962,6 @@ template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
#ifdef __LCAO
template struct dot_real_op<double, base_device::DEVICE_GPU>;
template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
template struct matrixCopy<double, base_device::DEVICE_GPU>;
template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
Expand Down
2 changes: 1 addition & 1 deletion source/module_esolver/esolver_ks_pw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -943,7 +943,7 @@ void ESolver_KS_PW<T, Device>::after_all_runners(UnitCell& ucell)
//! 7) Use Kubo-Greenwood method to compute conductivities
if (PARAM.inp.cal_cond)
{
EleCond elec_cond(&ucell, &this->kv, this->pelec, this->pw_wfc, this->psi, &this->ppcell);
EleCond<Real, Device> elec_cond(&ucell, &this->kv, this->pelec, this->pw_wfc, this->kspw_psi, &this->ppcell);
elec_cond.KG(PARAM.inp.cond_smear,
PARAM.inp.cond_fwhm,
PARAM.inp.cond_wcut,
Expand Down
Loading
Loading