@@ -248,12 +248,20 @@ __global__ void vector_mul_vector_kernel(
248248 const int size,
249249 T* result,
250250 const T* vector1,
251- const typename GetTypeReal<T>::type* vector2)
251+ const typename GetTypeReal<T>::type* vector2,
252+ const bool add)
252253{
253254 int i = blockIdx .x * blockDim .x + threadIdx .x ;
254255 if (i < size)
255256 {
256- result[i] = vector1[i] * vector2[i];
257+ if (add)
258+ {
259+ result[i] += vector1[i] * vector2[i];
260+ }
261+ else
262+ {
263+ result[i] = vector1[i] * vector2[i];
264+ }
257265 }
258266}
259267
@@ -471,11 +479,12 @@ template <>
471479void vector_mul_vector_op<double , base_device::DEVICE_GPU>::operator ()(const int & dim,
472480 double * result,
473481 const double * vector1,
474- const double * vector2)
482+ const double * vector2,
483+ const bool & add)
475484{
476485 int thread = 1024 ;
477486 int block = (dim + thread - 1 ) / thread;
478- hipLaunchKernelGGL (HIP_KERNEL_NAME (vector_mul_vector_kernel<double >), dim3 (block), dim3 (thread), 0 , 0 , dim, result, vector1, vector2);
487+ hipLaunchKernelGGL (HIP_KERNEL_NAME (vector_mul_vector_kernel<double >), dim3 (block), dim3 (thread), 0 , 0 , dim, result, vector1, vector2, add );
479488
480489 hipCheckOnDebug ();
481490}
@@ -485,32 +494,35 @@ template <typename FPTYPE>
485494inline void vector_mul_vector_complex_wrapper (const int & dim,
486495 std::complex <FPTYPE>* result,
487496 const std::complex <FPTYPE>* vector1,
488- const FPTYPE* vector2)
497+ const FPTYPE* vector2,
498+ const bool & add)
489499{
490500 thrust::complex <FPTYPE>* result_tmp = reinterpret_cast <thrust::complex <FPTYPE>*>(result);
491501 const thrust::complex <FPTYPE>* vector1_tmp = reinterpret_cast <const thrust::complex <FPTYPE>*>(vector1);
492502 int thread = 1024 ;
493503 int block = (dim + thread - 1 ) / thread;
494- hipLaunchKernelGGL (HIP_KERNEL_NAME (vector_mul_vector_kernel<thrust::complex <FPTYPE>>), dim3 (block), dim3 (thread), 0 , 0 , dim, result_tmp, vector1_tmp, vector2);
504+ hipLaunchKernelGGL (HIP_KERNEL_NAME (vector_mul_vector_kernel<thrust::complex <FPTYPE>>), dim3 (block), dim3 (thread), 0 , 0 , dim, result_tmp, vector1_tmp, vector2, add );
495505
496506 hipCheckOnDebug ();
497507}
498508template <>
499509void vector_mul_vector_op<std::complex <float >, base_device::DEVICE_GPU>::operator ()(const int & dim,
500510 std::complex <float >* result,
501511 const std::complex <float >* vector1,
502- const float * vector2)
512+ const float * vector2,
513+ const bool & add)
503514{
504- vector_mul_vector_complex_wrapper (dim, result, vector1, vector2);
515+ vector_mul_vector_complex_wrapper (dim, result, vector1, vector2, add );
505516}
506517template <>
507518void vector_mul_vector_op<std::complex <double >, base_device::DEVICE_GPU>::operator ()(
508519 const int & dim,
509520 std::complex <double >* result,
510521 const std::complex <double >* vector1,
511- const double * vector2)
522+ const double * vector2,
523+ const bool & add)
512524{
513- vector_mul_vector_complex_wrapper (dim, result, vector1, vector2);
525+ vector_mul_vector_complex_wrapper (dim, result, vector1, vector2, add );
514526}
515527// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
516528template <>
@@ -931,6 +943,7 @@ template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
931943template struct calc_grad_with_block_op <std::complex <float >, base_device::DEVICE_GPU>;
932944template struct line_minimize_with_block_op <std::complex <float >, base_device::DEVICE_GPU>;
933945template struct vector_div_constant_op <std::complex <float >, base_device::DEVICE_GPU>;
946+ template struct vector_mul_vector_op <float , base_device::DEVICE_GPU>;
934947template struct vector_mul_vector_op <std::complex <float >, base_device::DEVICE_GPU>;
935948template struct vector_div_vector_op <std::complex <float >, base_device::DEVICE_GPU>;
936949template struct constantvector_addORsub_constantVector_op <std::complex <float >, base_device::DEVICE_GPU>;
@@ -940,6 +953,7 @@ template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
940953template struct calc_grad_with_block_op <std::complex <double >, base_device::DEVICE_GPU>;
941954template struct line_minimize_with_block_op <std::complex <double >, base_device::DEVICE_GPU>;
942955template struct vector_div_constant_op <std::complex <double >, base_device::DEVICE_GPU>;
956+ template struct vector_mul_vector_op <double , base_device::DEVICE_GPU>;
943957template struct vector_mul_vector_op <std::complex <double >, base_device::DEVICE_GPU>;
944958template struct vector_div_vector_op <std::complex <double >, base_device::DEVICE_GPU>;
945959template struct constantvector_addORsub_constantVector_op <std::complex <double >, base_device::DEVICE_GPU>;
@@ -948,7 +962,6 @@ template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
948962#ifdef __LCAO
949963template struct dot_real_op <double , base_device::DEVICE_GPU>;
950964template struct vector_div_constant_op <double , base_device::DEVICE_GPU>;
951- template struct vector_mul_vector_op <double , base_device::DEVICE_GPU>;
952965template struct vector_div_vector_op <double , base_device::DEVICE_GPU>;
953966template struct matrixCopy <double , base_device::DEVICE_GPU>;
954967template struct constantvector_addORsub_constantVector_op <double , base_device::DEVICE_GPU>;
0 commit comments