Skip to content

Commit 7f8ddce

Browse files
authored
Merge branch 'develop' into develop
2 parents 5318a96 + 8973c25 commit 7f8ddce

File tree

28 files changed

+678
-236
lines changed

28 files changed

+678
-236
lines changed

Dockerfile.intel

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ RUN source /opt/intel/oneapi/setvars.sh && \
4444
ln -s /usr/local/include/elpa_openmp-$ELPA_VER/elpa /usr/local/include/ && \
4545
cd /tmp && rm -rf elpa-$ELPA_VER
4646

47+
RUN cd /tmp && git clone https://github.com/Tencent/rapidjson.git && cp -r rapidjson/include/rapidjson /usr/include/ \
48+
&& rm -rf rapidjson
49+
4750
RUN wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcpu.zip \
4851
--no-check-certificate --quiet -O libtorch.zip && \
4952
unzip -q libtorch.zip -d /opt && rm libtorch.zip

docs/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
ABACUS Documentation
88
=================================================
99

10-
ABACUS (**A**tomic-orbital **B**ased **A**b-initio **C**omputation at **US**tc) is
10+
ABACUS (Atomic-orbital Based Ab-initio Computation at UStc) is
1111
an open-source computer code package based on density functional
1212
theory (DFT). The package utilizes both plane wave and numerical
1313
atomic basis sets with the usage of pseudopotentials

source/module_base/kernels/cuda/math_kernel_op.cu

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -325,16 +325,23 @@ __global__ void vector_div_constant_kernel(
325325
}
326326

327327
template <typename T>
328-
__global__ void vector_mul_vector_kernel(
329-
const int size,
330-
T* result,
331-
const T* vector1,
332-
const typename GetTypeReal<T>::type* vector2)
328+
__global__ void vector_mul_vector_kernel(const int size,
329+
T* result,
330+
const T* vector1,
331+
const typename GetTypeReal<T>::type* vector2,
332+
const bool add)
333333
{
334334
int i = blockIdx.x * blockDim.x + threadIdx.x;
335335
if (i < size)
336336
{
337-
result[i] = vector1[i] * vector2[i];
337+
if (add)
338+
{
339+
result[i] += vector1[i] * vector2[i];
340+
}
341+
else
342+
{
343+
result[i] = vector1[i] * vector2[i];
344+
}
338345
}
339346
}
340347

@@ -548,11 +555,12 @@ template <>
548555
void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
549556
double* result,
550557
const double* vector1,
551-
const double* vector2)
558+
const double* vector2,
559+
const bool& add)
552560
{
553561
int thread = thread_per_block;
554562
int block = (dim + thread - 1) / thread;
555-
vector_mul_vector_kernel<double> <<<block, thread >>> (dim, result, vector1, vector2);
563+
vector_mul_vector_kernel<double> <<<block, thread >>> (dim, result, vector1, vector2, add);
556564

557565
cudaCheckOnDebug();
558566
}
@@ -561,32 +569,35 @@ template <typename FPTYPE>
561569
inline void vector_mul_vector_complex_wrapper(const int& dim,
562570
std::complex<FPTYPE>* result,
563571
const std::complex<FPTYPE>* vector1,
564-
const FPTYPE* vector2)
572+
const FPTYPE* vector2,
573+
const bool& add)
565574
{
566575
thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
567576
const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
568577
int thread = thread_per_block;
569578
int block = (dim + thread - 1) / thread;
570-
vector_mul_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector1_tmp, vector2);
579+
vector_mul_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector1_tmp, vector2, add);
571580

572581
cudaCheckOnDebug();
573582
}
574583
template <>
575584
void vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
576585
std::complex<float>* result,
577586
const std::complex<float>* vector1,
578-
const float* vector2)
587+
const float* vector2,
588+
const bool& add)
579589
{
580-
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
590+
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
581591
}
582592
template <>
583593
void vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
584594
const int& dim,
585595
std::complex<double>* result,
586596
const std::complex<double>* vector1,
587-
const double* vector2)
597+
const double* vector2,
598+
const bool& add)
588599
{
589-
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
600+
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
590601
}
591602

592603
// vector operator: result[i] = vector1[i](not complex) / vector2[i](not complex)
@@ -1019,6 +1030,7 @@ template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
10191030
template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
10201031
template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
10211032
template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
1033+
template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
10221034
template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
10231035
template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
10241036
template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
@@ -1029,6 +1041,7 @@ template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
10291041
template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
10301042
template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
10311043
template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
1044+
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
10321045
template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
10331046
template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
10341047
template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
@@ -1039,7 +1052,6 @@ template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
10391052
#ifdef __LCAO
10401053
template struct dot_real_op<double, base_device::DEVICE_GPU>;
10411054
template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
1042-
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
10431055
template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
10441056
#endif
10451057
} // namespace ModuleBase

source/module_base/kernels/math_kernel_op.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -167,14 +167,27 @@ template <typename T>
167167
struct vector_mul_vector_op<T, base_device::DEVICE_CPU>
168168
{
169169
using Real = typename GetTypeReal<T>::type;
170-
void operator()(const int& dim, T* result, const T* vector1, const Real* vector2)
170+
void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add)
171171
{
172+
if (add)
173+
{
172174
#ifdef _OPENMP
173175
#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
174176
#endif
175-
for (int i = 0; i < dim; i++)
177+
for (int i = 0; i < dim; i++)
178+
{
179+
result[i] += vector1[i] * vector2[i];
180+
}
181+
}
182+
else
176183
{
177-
result[i] = vector1[i] * vector2[i];
184+
#ifdef _OPENMP
185+
#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
186+
#endif
187+
for (int i = 0; i < dim; i++)
188+
{
189+
result[i] = vector1[i] * vector2[i];
190+
}
178191
}
179192
}
180193
};

source/module_base/kernels/math_kernel_op.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -143,11 +143,11 @@ template <typename T, typename Device> struct vector_mul_vector_op {
143143
/// \param dim : array size
144144
/// \param vector1 : input array A
145145
/// \param vector2 : input array B
146+
/// \param add : flag to control whether to add the result to the output array
146147
///
147148
/// Output Parameters
148149
/// \param result : output array
149-
void operator()(const int &dim, T *result, const T *vector1,
150-
const Real *vector2);
150+
void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
151151
};
152152

153153
// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
@@ -350,8 +350,7 @@ struct vector_div_constant_op<T, base_device::DEVICE_GPU> {
350350
// vector operator: result[i] = vector1[i](complex) * vector2[i](not complex)
351351
template <typename T> struct vector_mul_vector_op<T, base_device::DEVICE_GPU> {
352352
using Real = typename GetTypeReal<T>::type;
353-
void operator()(const int &dim, T *result,
354-
const T *vector1, const Real *vector2);
353+
void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
355354
};
356355

357356
// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)

source/module_base/kernels/rocm/math_kernel_op.hip.cu

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -248,12 +248,20 @@ __global__ void vector_mul_vector_kernel(
248248
const int size,
249249
T* result,
250250
const T* vector1,
251-
const typename GetTypeReal<T>::type* vector2)
251+
const typename GetTypeReal<T>::type* vector2,
252+
const bool add)
252253
{
253254
int i = blockIdx.x * blockDim.x + threadIdx.x;
254255
if (i < size)
255256
{
256-
result[i] = vector1[i] * vector2[i];
257+
if (add)
258+
{
259+
result[i] += vector1[i] * vector2[i];
260+
}
261+
else
262+
{
263+
result[i] = vector1[i] * vector2[i];
264+
}
257265
}
258266
}
259267

@@ -471,11 +479,12 @@ template <>
471479
void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
472480
double* result,
473481
const double* vector1,
474-
const double* vector2)
482+
const double* vector2,
483+
const bool& add)
475484
{
476485
int thread = 1024;
477486
int block = (dim + thread - 1) / thread;
478-
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<double>), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2);
487+
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<double>), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2, add);
479488

480489
hipCheckOnDebug();
481490
}
@@ -485,32 +494,35 @@ template <typename FPTYPE>
485494
inline void vector_mul_vector_complex_wrapper(const int& dim,
486495
std::complex<FPTYPE>* result,
487496
const std::complex<FPTYPE>* vector1,
488-
const FPTYPE* vector2)
497+
const FPTYPE* vector2,
498+
const bool& add)
489499
{
490500
thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
491501
const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
492502
int thread = 1024;
493503
int block = (dim + thread - 1) / thread;
494-
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<thrust::complex<FPTYPE>>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2);
504+
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<thrust::complex<FPTYPE>>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2, add);
495505

496506
hipCheckOnDebug();
497507
}
498508
template <>
499509
void vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
500510
std::complex<float>* result,
501511
const std::complex<float>* vector1,
502-
const float* vector2)
512+
const float* vector2,
513+
const bool& add)
503514
{
504-
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
515+
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
505516
}
506517
template <>
507518
void vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
508519
const int& dim,
509520
std::complex<double>* result,
510521
const std::complex<double>* vector1,
511-
const double* vector2)
522+
const double* vector2,
523+
const bool& add)
512524
{
513-
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
525+
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
514526
}
515527
// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
516528
template <>
@@ -931,6 +943,7 @@ template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
931943
template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
932944
template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
933945
template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
946+
template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
934947
template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
935948
template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
936949
template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
@@ -940,6 +953,7 @@ template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
940953
template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
941954
template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
942955
template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
956+
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
943957
template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
944958
template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
945959
template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
@@ -948,7 +962,6 @@ template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
948962
#ifdef __LCAO
949963
template struct dot_real_op<double, base_device::DEVICE_GPU>;
950964
template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
951-
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
952965
template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
953966
template struct matrixCopy<double, base_device::DEVICE_GPU>;
954967
template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;

source/module_base/module_device/rocm/memory_op.hip.cu

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,7 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_devic
108108
// No need to cast the memory if the data types are the same.
109109
if (std::is_same<FPTYPE_out, FPTYPE_in>::value)
110110
{
111-
synchronize_memory_op<FPTYPE_out, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(dev_out,
112-
dev_in,
113-
arr_out,
111+
synchronize_memory_op<FPTYPE_out, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(arr_out,
114112
reinterpret_cast<const FPTYPE_out*>(arr_in),
115113
size);
116114
return;
@@ -135,9 +133,7 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_devic
135133
// No need to cast the memory if the data types are the same.
136134
if (std::is_same<FPTYPE_out, FPTYPE_in>::value)
137135
{
138-
synchronize_memory_op<FPTYPE_out, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(dev_out,
139-
dev_in,
140-
arr_out,
136+
synchronize_memory_op<FPTYPE_out, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(arr_out,
141137
reinterpret_cast<const FPTYPE_out*>(arr_in),
142138
size);
143139
return;

source/module_esolver/esolver_ks_pw.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -943,7 +943,7 @@ void ESolver_KS_PW<T, Device>::after_all_runners(UnitCell& ucell)
943943
//! 7) Use Kubo-Greenwood method to compute conductivities
944944
if (PARAM.inp.cal_cond)
945945
{
946-
EleCond elec_cond(&ucell, &this->kv, this->pelec, this->pw_wfc, this->psi, &this->ppcell);
946+
EleCond<Real, Device> elec_cond(&ucell, &this->kv, this->pelec, this->pw_wfc, this->kspw_psi, &this->ppcell);
947947
elec_cond.KG(PARAM.inp.cond_smear,
948948
PARAM.inp.cond_fwhm,
949949
PARAM.inp.cond_wcut,

source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@ void sparse_format::cal_dH(const UnitCell& ucell,
2626
fsr_dh.DHloc_fixedR_y = new double[nnr];
2727
fsr_dh.DHloc_fixedR_z = new double[nnr];
2828

29-
ModuleBase::GlobalFunc::ZEROS(fsr_dh.DHloc_fixedR_x, pv.nloc);
30-
ModuleBase::GlobalFunc::ZEROS(fsr_dh.DHloc_fixedR_y, pv.nloc);
31-
ModuleBase::GlobalFunc::ZEROS(fsr_dh.DHloc_fixedR_z, pv.nloc);
29+
ModuleBase::GlobalFunc::ZEROS(fsr_dh.DHloc_fixedR_x, nnr);
30+
ModuleBase::GlobalFunc::ZEROS(fsr_dh.DHloc_fixedR_y, nnr);
31+
ModuleBase::GlobalFunc::ZEROS(fsr_dh.DHloc_fixedR_z, nnr);
3232
// cal dT=<phi|kin|dphi> in LCAO
3333
// cal T + VNL(P1) in LCAO basis
3434
const bool cal_deri = true;

0 commit comments

Comments
 (0)