|
1 | | -#include "cuda_runtime.h" |
| 1 | +#include <cuda_runtime.h> |
2 | 2 | #include "module_base/kernels/math_op.h" |
3 | | -#include "module_base/macros.h" |
4 | 3 |
|
5 | 4 | #include <base/macros/macros.h> |
6 | | -#include <cuda_runtime.h> |
7 | | -#include <thrust/complex.h> |
8 | | -#include <thrust/execution_policy.h> |
9 | | -#include <thrust/inner_product.h> |
10 | 5 |
|
11 | 6 | namespace ModuleBase { |
12 | 7 |
|
@@ -159,134 +154,4 @@ void cal_ylm_real_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev |
159 | 154 | template struct cal_ylm_real_op<float, base_device::DEVICE_GPU>; |
160 | 155 | template struct cal_ylm_real_op<double, base_device::DEVICE_GPU>; |
161 | 156 |
|
162 | | - |
163 | | -// The next are kernels for new blas_connector |
164 | | - |
165 | | - |
166 | | -template <typename T> |
167 | | -__global__ void vector_mul_vector_kernel( |
168 | | - const int size, |
169 | | - T* result, |
170 | | - const T* vector1, |
171 | | - const typename GetTypeReal<T>::type* vector2) |
172 | | -{ |
173 | | - int i = blockIdx.x * blockDim.x + threadIdx.x; |
174 | | - if (i < size) |
175 | | - { |
176 | | - result[i] = vector1[i] * vector2[i]; |
177 | | - } |
178 | | -} |
179 | | - |
180 | | -template <typename T> |
181 | | -__global__ void vector_div_vector_kernel( |
182 | | - const int size, |
183 | | - T* result, |
184 | | - const T* vector1, |
185 | | - const typename GetTypeReal<T>::type* vector2) |
186 | | -{ |
187 | | - int i = blockIdx.x * blockDim.x + threadIdx.x; |
188 | | - if (i < size) |
189 | | - { |
190 | | - result[i] = vector1[i] / vector2[i]; |
191 | | - } |
192 | | -} |
193 | | - |
194 | | -template <typename FPTYPE> |
195 | | -inline void vector_div_vector_complex_wrapper(const int dim, |
196 | | - std::complex<FPTYPE>* result, |
197 | | - const std::complex<FPTYPE>* vector, |
198 | | - const FPTYPE constant) |
199 | | -{ |
200 | | - thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result); |
201 | | - const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1); |
202 | | - int thread = THREADS_PER_BLOCK; |
203 | | - int block = (dim + thread - 1) / thread; |
204 | | - vector_div_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector1_tmp, vector2); |
205 | | - |
206 | | - cudaCheckOnDebug(); |
207 | | -} |
208 | | - |
209 | | -template <typename FPTYPE> |
210 | | -inline void vector_mul_vector_complex_wrapper(const int& dim, |
211 | | - std::complex<FPTYPE>* result, |
212 | | - const std::complex<FPTYPE>* vector1, |
213 | | - const FPTYPE* vector2) |
214 | | -{ |
215 | | - thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result); |
216 | | - const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1); |
217 | | - int thread = THREADS_PER_BLOCK; |
218 | | - int block = (dim + thread - 1) / thread; |
219 | | - vector_mul_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector1_tmp, vector2); |
220 | | - |
221 | | - cudaCheckOnDebug(); |
222 | | -} |
223 | | - |
224 | | -void vector_div_vector_gpu(const int& dim, |
225 | | - double* result, |
226 | | - const double* vector1, |
227 | | - const double* vector2) |
228 | | -{ |
229 | | - int thread = THREADS_PER_BLOCK; |
230 | | - int block = (dim + thread - 1) / thread; |
231 | | - vector_div_vector_kernel<double> <<<block, thread >>> (dim, result, vector1, vector2); |
232 | | - |
233 | | - cudaCheckOnDebug(); |
234 | | -} |
235 | | - |
236 | | -void vector_div_vector_gpu(const int& dim, |
237 | | - float* result, |
238 | | - const float* vector1, |
239 | | - const float* vector2) |
240 | | -{ |
241 | | - int thread = THREADS_PER_BLOCK; |
242 | | - int block = (dim + thread - 1) / thread; |
243 | | - vector_div_vector_kernel<float> <<<block, thread >>> (dim, result, vector1, vector2); |
244 | | - |
245 | | - cudaCheckOnDebug(); |
246 | | -} |
247 | | - |
248 | | -void vector_div_vector_gpu(const int& dim, std::complex<float>* result, const std::complex<float>* vector1, const float* vector2) |
249 | | -{ |
250 | | - vector_div_vector_complex_wrapper(dim, result, vector1, vector2); |
251 | | -} |
252 | | - |
253 | | -void vector_div_vector_gpu(const int& dim, std::complex<double>* result, const std::complex<double>* vector1, const double* vector2) |
254 | | -{ |
255 | | - vector_div_vector_complex_wrapper(dim, result, vector1, vector2); |
256 | | -} |
257 | | - |
258 | | -void vector_mul_vector_gpu(const int& dim, |
259 | | - double* result, |
260 | | - const double* vector1, |
261 | | - const double* vector2) |
262 | | -{ |
263 | | - int thread = THREADS_PER_BLOCK; |
264 | | - int block = (dim + thread - 1) / thread; |
265 | | - vector_mul_vector_kernel<double> <<<block, thread >>> (dim, result, vector1, vector2); |
266 | | - |
267 | | - cudaCheckOnDebug(); |
268 | | -} |
269 | | - |
270 | | -void vector_mul_vector_gpu(const int& dim, |
271 | | - float* result, |
272 | | - const float* vector1, |
273 | | - const float* vector2) |
274 | | -{ |
275 | | - int thread = THREADS_PER_BLOCK; |
276 | | - int block = (dim + thread - 1) / thread; |
277 | | - vector_mul_vector_kernel<float> <<<block, thread >>> (dim, result, vector1, vector2); |
278 | | - |
279 | | - cudaCheckOnDebug(); |
280 | | -} |
281 | | - |
282 | | -void vector_mul_vector_gpu(const int& dim, std::complex<float>* result, const std::complex<float>* vector1, const float* vector2) |
283 | | -{ |
284 | | - vector_mul_vector_complex_wrapper(dim, result, vector1, vector2); |
285 | | -} |
286 | | - |
287 | | -void vector_mul_vector_gpu(const int& dim, std::complex<double>* result, const std::complex<double>* vector1, const double* vector2) |
288 | | -{ |
289 | | - vector_mul_vector_complex_wrapper(dim, result, vector1, vector2); |
290 | | -} |
291 | | - |
292 | 157 | } // namespace ModuleBase |
0 commit comments