Skip to content

Commit 91e8dc2

Browse files
committed
Fix bugs
1 parent 7646974 commit 91e8dc2

File tree

2 files changed

+6
-144
lines changed

2 files changed

+6
-144
lines changed

source/module_base/blas_connector.cpp

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,10 @@
99
#ifdef __CUDA
1010
#include <base/macros/macros.h>
1111
#include <cuda_runtime.h>
12-
#include <thrust/complex.h>
13-
#include <thrust/execution_policy.h>
14-
#include <thrust/inner_product.h>
15-
#include "module_base/tool_quit.h"
16-
#include "module_base/kernels/cuda/math_op.cu"
17-
1812
#include "cublas_v2.h"
13+
#include "module_hsolver/kernels/math_kernel_op.h"
14+
#include "module_base/module_device/memory_op.h"
15+
1916

2017
namespace BlasUtils{
2118

@@ -671,7 +668,7 @@ void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vec
671668
}
672669
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
673670
#ifdef __CUDA
674-
vector_mul_vector_gpu(dim, result, vector1, vector2);
671+
hsolver::vector_mul_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
675672
#endif
676673
}
677674
}
@@ -691,7 +688,7 @@ void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vec
691688
}
692689
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
693690
#ifdef __CUDA
694-
vector_mul_vector_gpu(dim, result, vector1, vector2);
691+
hsolver::vector_div_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
695692
#endif
696693
}
697694
}

source/module_base/kernels/cuda/math_op.cu

Lines changed: 1 addition & 136 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,7 @@
1-
#include "cuda_runtime.h"
1+
#include <cuda_runtime.h>
22
#include "module_base/kernels/math_op.h"
3-
#include "module_base/macros.h"
43

54
#include <base/macros/macros.h>
6-
#include <cuda_runtime.h>
7-
#include <thrust/complex.h>
8-
#include <thrust/execution_policy.h>
9-
#include <thrust/inner_product.h>
105

116
namespace ModuleBase {
127

@@ -159,134 +154,4 @@ void cal_ylm_real_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
159154
template struct cal_ylm_real_op<float, base_device::DEVICE_GPU>;
160155
template struct cal_ylm_real_op<double, base_device::DEVICE_GPU>;
161156

162-
163-
// The next are kernels for new blas_connector
164-
165-
166-
template <typename T>
167-
__global__ void vector_mul_vector_kernel(
168-
const int size,
169-
T* result,
170-
const T* vector1,
171-
const typename GetTypeReal<T>::type* vector2)
172-
{
173-
int i = blockIdx.x * blockDim.x + threadIdx.x;
174-
if (i < size)
175-
{
176-
result[i] = vector1[i] * vector2[i];
177-
}
178-
}
179-
180-
template <typename T>
181-
__global__ void vector_div_vector_kernel(
182-
const int size,
183-
T* result,
184-
const T* vector1,
185-
const typename GetTypeReal<T>::type* vector2)
186-
{
187-
int i = blockIdx.x * blockDim.x + threadIdx.x;
188-
if (i < size)
189-
{
190-
result[i] = vector1[i] / vector2[i];
191-
}
192-
}
193-
194-
template <typename FPTYPE>
195-
inline void vector_div_vector_complex_wrapper(const int dim,
196-
std::complex<FPTYPE>* result,
197-
const std::complex<FPTYPE>* vector,
198-
const FPTYPE constant)
199-
{
200-
thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
201-
const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
202-
int thread = THREADS_PER_BLOCK;
203-
int block = (dim + thread - 1) / thread;
204-
vector_div_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector1_tmp, vector2);
205-
206-
cudaCheckOnDebug();
207-
}
208-
209-
template <typename FPTYPE>
210-
inline void vector_mul_vector_complex_wrapper(const int& dim,
211-
std::complex<FPTYPE>* result,
212-
const std::complex<FPTYPE>* vector1,
213-
const FPTYPE* vector2)
214-
{
215-
thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
216-
const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
217-
int thread = THREADS_PER_BLOCK;
218-
int block = (dim + thread - 1) / thread;
219-
vector_mul_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector1_tmp, vector2);
220-
221-
cudaCheckOnDebug();
222-
}
223-
224-
void vector_div_vector_gpu(const int& dim,
225-
double* result,
226-
const double* vector1,
227-
const double* vector2)
228-
{
229-
int thread = THREADS_PER_BLOCK;
230-
int block = (dim + thread - 1) / thread;
231-
vector_div_vector_kernel<double> <<<block, thread >>> (dim, result, vector1, vector2);
232-
233-
cudaCheckOnDebug();
234-
}
235-
236-
void vector_div_vector_gpu(const int& dim,
237-
float* result,
238-
const float* vector1,
239-
const float* vector2)
240-
{
241-
int thread = THREADS_PER_BLOCK;
242-
int block = (dim + thread - 1) / thread;
243-
vector_div_vector_kernel<float> <<<block, thread >>> (dim, result, vector1, vector2);
244-
245-
cudaCheckOnDebug();
246-
}
247-
248-
void vector_div_vector_gpu(const int& dim, std::complex<float>* result, const std::complex<float>* vector1, const float* vector2)
249-
{
250-
vector_div_vector_complex_wrapper(dim, result, vector1, vector2);
251-
}
252-
253-
void vector_div_vector_gpu(const int& dim, std::complex<double>* result, const std::complex<double>* vector1, const double* vector2)
254-
{
255-
vector_div_vector_complex_wrapper(dim, result, vector1, vector2);
256-
}
257-
258-
void vector_mul_vector_gpu(const int& dim,
259-
double* result,
260-
const double* vector1,
261-
const double* vector2)
262-
{
263-
int thread = THREADS_PER_BLOCK;
264-
int block = (dim + thread - 1) / thread;
265-
vector_mul_vector_kernel<double> <<<block, thread >>> (dim, result, vector1, vector2);
266-
267-
cudaCheckOnDebug();
268-
}
269-
270-
void vector_mul_vector_gpu(const int& dim,
271-
float* result,
272-
const float* vector1,
273-
const float* vector2)
274-
{
275-
int thread = THREADS_PER_BLOCK;
276-
int block = (dim + thread - 1) / thread;
277-
vector_mul_vector_kernel<float> <<<block, thread >>> (dim, result, vector1, vector2);
278-
279-
cudaCheckOnDebug();
280-
}
281-
282-
void vector_mul_vector_gpu(const int& dim, std::complex<float>* result, const std::complex<float>* vector1, const float* vector2)
283-
{
284-
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
285-
}
286-
287-
void vector_mul_vector_gpu(const int& dim, std::complex<double>* result, const std::complex<double>* vector1, const double* vector2)
288-
{
289-
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
290-
}
291-
292157
} // namespace ModuleBase

0 commit comments

Comments
 (0)