forked from abacusmodeling/abacus-develop
-
Notifications
You must be signed in to change notification settings - Fork 221
Expand file tree
/
Copy pathmemory_op.h
More file actions
347 lines (306 loc) · 15.5 KB
/
memory_op.h
File metadata and controls
347 lines (306 loc) · 15.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
#ifndef MODULE_DEVICE_MEMORY_H_
#define MODULE_DEVICE_MEMORY_H_
#include "types.h"
#include "memory_op_dsp.h"
#include <complex>
#include <cstddef>
namespace base_device
{
namespace memory
{
template <typename FPTYPE, typename Device>
struct resize_memory_op
{
    /// @brief (Re)allocate the array referenced by \p arr for the given Device.
    ///
    /// As noted by the original author, this op frees the pointer first and
    /// then allocates the new storage.
    ///
    /// Input Parameters
    /// \param size : array size
    /// \param record_in : label for the memory record; optional, defaults to nullptr
    ///
    /// Output Parameters
    /// \param arr : allocated array (taken by reference so the new pointer reaches the caller)
    void operator()(FPTYPE*& arr,
                    const size_t size,
                    const char* record_in = nullptr);
};
template <typename FPTYPE, typename Device>
struct set_memory_op
{
    /// @brief Device-aware counterpart of memset.
    ///
    /// Input Parameters
    /// \param var : constant value used to initialize the array (an int, as with memset)
    /// \param size : array size
    ///
    /// Output Parameters
    /// \param arr : array that is initialized with \p var
    void operator()(FPTYPE* arr,
                    const int var,
                    const size_t size);
};
template <typename FPTYPE, typename Device>
struct set_memory_2d_op
{
    /// @brief Device-aware counterpart of memset2D.
    ///
    /// Input Parameters
    /// \param pitch : pitch of the 2D device memory, in elements
    /// \param var : constant value used to initialize the matrix
    /// \param width : number of columns to set, in elements
    /// \param height : number of rows to set
    ///
    /// Output Parameters
    /// \param arr : array that is initialized with \p var
    void operator()(FPTYPE* arr,
                    const size_t pitch,
                    const int var,
                    const size_t width,
                    const size_t height);
};
template <typename FPTYPE, typename Device_out, typename Device_in>
struct synchronize_memory_op
{
    /// @brief Device-aware counterpart of memcpy.
    ///
    /// Device_out is the device tag of the destination array and Device_in the
    /// device tag of the source array.
    ///
    /// Input Parameters
    /// \param arr_in : source array
    /// \param size : array size
    ///
    /// Output Parameters
    /// \param arr_out : destination array, filled from \p arr_in
    void operator()(FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size);
};
template <typename FPTYPE, typename Device_out, typename Device_in>
struct synchronize_memory_2d_op
{
    /// @brief Device-aware counterpart of memcpy2D.
    ///
    /// Device_out is the device tag of the destination array and Device_in the
    /// device tag of the source array.
    ///
    /// Input Parameters
    /// \param dpitch : pitch of the destination memory, in elements
    /// \param arr_in : source array
    /// \param spitch : pitch of the source memory, in elements
    /// \param width : number of columns to transfer, in elements
    /// \param height : number of rows to transfer
    ///
    /// Output Parameters
    /// \param arr_out : destination array, filled from \p arr_in
    void operator()(FPTYPE* arr_out, const size_t dpitch,
                    const FPTYPE* arr_in, const size_t spitch,
                    const size_t width, const size_t height);
};
template <typename FPTYPE_out, typename FPTYPE_in, typename Device_out, typename Device_in>
struct cast_memory_op
{
    /// @brief Device-aware copy whose source and destination element types differ.
    ///
    /// The source holds FPTYPE_in on Device_in, the destination holds
    /// FPTYPE_out on Device_out. How each element is converted is decided by
    /// the implementation, which is not part of this header.
    ///
    /// Input Parameters
    /// \param arr_in : source array
    /// \param size : array size
    ///
    /// Output Parameters
    /// \param arr_out : destination array, filled from \p arr_in
    void operator()(FPTYPE_out* arr_out, const FPTYPE_in* arr_in, const size_t size);
};
template <typename FPTYPE, typename Device>
struct delete_memory_op
{
    /// @brief Release memory that belongs to the given Device.
    ///
    /// Input Parameters
    /// \param arr : array to free; taken by value, so the caller's pointer
    ///              variable itself is not modified by this call
    void operator()(FPTYPE* arr);
};
/// @brief Free-function resize that takes the target device as a runtime value.
///
/// Presumably forwards to resize_memory_op for the selected device; the
/// definition is not visible in this header -- confirm.
///
/// \param arr : array pointer. NOTE(review): passed by value here, whereas
///              resize_memory_op takes FPTYPE*&; a pointer re-assigned inside
///              this function cannot reach the caller -- confirm against the definition.
/// \param size : array size
/// \param device_type : device selector, defaults to AbacusDevice_t::CpuDevice
template <typename FPTYPE>
void resize_memory(FPTYPE* arr,
                   const size_t size,
                   base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
/// @brief Free-function memset that takes the target device as a runtime value.
///
/// Presumably forwards to set_memory_op for the selected device; the
/// definition is not visible in this header -- confirm.
///
/// \param arr : array to initialize
/// \param var : constant value used for the initialization
/// \param size : array size
/// \param device_type : device selector, defaults to AbacusDevice_t::CpuDevice
template <typename FPTYPE>
void set_memory(FPTYPE* arr,
                const int var,
                const size_t size,
                base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
/// @brief Free-function memcpy that takes both devices as runtime values.
///
/// Presumably forwards to synchronize_memory_op for the selected device pair;
/// the definition is not visible in this header -- confirm.
///
/// \param arr_out : destination array
/// \param arr_in : source array
/// \param size : array size
/// \param device_type_out : device of the destination array (no default)
/// \param device_type_in : device of the source array (no default)
template <typename FPTYPE>
void synchronize_memory(FPTYPE* arr_out,
                        const FPTYPE* arr_in,
                        const size_t size,
                        base_device::AbacusDevice_t device_type_out,
                        base_device::AbacusDevice_t device_type_in);
/// @brief Free-function type-changing copy that takes both devices as runtime values.
///
/// Presumably forwards to cast_memory_op for the selected device pair; the
/// definition is not visible in this header -- confirm.
///
/// \param arr_out : destination array of FPTYPE_out
/// \param arr_in : source array of FPTYPE_in
/// \param size : array size
/// \param device_type_out : device of the destination array (no default)
/// \param device_type_in : device of the source array (no default)
template <typename FPTYPE_out, typename FPTYPE_in>
void cast_memory(FPTYPE_out* arr_out,
                 const FPTYPE_in* arr_in,
                 const size_t size,
                 base_device::AbacusDevice_t device_type_out,
                 base_device::AbacusDevice_t device_type_in);
/// @brief Free-function release that takes the target device as a runtime value.
///
/// Presumably forwards to delete_memory_op for the selected device; the
/// definition is not visible in this header -- confirm.
///
/// \param arr : array to free
/// \param device_type : device selector, defaults to AbacusDevice_t::CpuDevice
template <typename FPTYPE>
void delete_memory(FPTYPE* arr,
                   base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
// Partially specialize operator for base_device::GpuDevice.
template <typename FPTYPE>
struct resize_memory_op<FPTYPE, base_device::DEVICE_GPU>
{
    /// GPU variant of resize_memory_op; same parameters as the primary template
    /// (arr by reference, array size, optional record label).
    void operator()(FPTYPE*& arr, const size_t size, const char* record_in = nullptr);
};
template <typename FPTYPE>
struct set_memory_op<FPTYPE, base_device::DEVICE_GPU>
{
    /// GPU variant of set_memory_op; same parameters as the primary template
    /// (array, constant value, array size).
    void operator()(FPTYPE* arr,
                    const int var,
                    const size_t size);
};
template <typename FPTYPE>
struct set_memory_2d_op<FPTYPE, base_device::DEVICE_GPU>
{
    /// GPU variant of set_memory_2d_op; same parameters as the primary template
    /// (array, pitch, constant value, width, height).
    void operator()(FPTYPE* arr,
                    const size_t pitch,
                    const int var,
                    const size_t width,
                    const size_t height);
};
template <typename FPTYPE>
struct synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
{
    /// Copy with a CPU destination (Device_out) and a GPU source (Device_in).
    void operator()(FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size);
};
template <typename FPTYPE>
struct synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>
{
    /// Copy with a GPU destination (Device_out) and a CPU source (Device_in).
    void operator()(FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size);
};
template <typename FPTYPE>
struct synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
{
    /// Copy where both destination (Device_out) and source (Device_in) are GPU.
    void operator()(FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size);
};
template <typename FPTYPE>
struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
{
    /// 2D copy with a CPU destination (Device_out) and a GPU source (Device_in).
    /// Parameters are the same as in the primary template.
    void operator()(FPTYPE* arr_out, const size_t dpitch,
                    const FPTYPE* arr_in, const size_t spitch,
                    const size_t width, const size_t height);
};
template <typename FPTYPE>
struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>
{
    /// 2D copy with a GPU destination (Device_out) and a CPU source (Device_in).
    /// Parameters are the same as in the primary template.
    void operator()(FPTYPE* arr_out, const size_t dpitch,
                    const FPTYPE* arr_in, const size_t spitch,
                    const size_t width, const size_t height);
};
template <typename FPTYPE>
struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
{
    /// 2D copy where both destination (Device_out) and source (Device_in) are GPU.
    /// Parameters are the same as in the primary template.
    void operator()(FPTYPE* arr_out, const size_t dpitch,
                    const FPTYPE* arr_in, const size_t spitch,
                    const size_t width, const size_t height);
};
template <typename FPTYPE>
struct delete_memory_op<FPTYPE, base_device::DEVICE_GPU>
{
    /// GPU variant of delete_memory_op; takes the array to free.
    void operator()(FPTYPE* arr);
};
#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
} // end of namespace memory
} // end of namespace base_device
// Short names for resize_memory_op instantiations.
// First suffix letter = element type: s float, d double, c complex<float>, z complex<double>.
// Second suffix letter = device: h DEVICE_CPU, d DEVICE_GPU.
using resmem_sh_op
    = base_device::memory::resize_memory_op<float, base_device::DEVICE_CPU>;
using resmem_dh_op
    = base_device::memory::resize_memory_op<double, base_device::DEVICE_CPU>;
using resmem_ch_op
    = base_device::memory::resize_memory_op<std::complex<float>, base_device::DEVICE_CPU>;
using resmem_zh_op
    = base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_CPU>;
using resmem_sd_op
    = base_device::memory::resize_memory_op<float, base_device::DEVICE_GPU>;
using resmem_dd_op
    = base_device::memory::resize_memory_op<double, base_device::DEVICE_GPU>;
using resmem_cd_op
    = base_device::memory::resize_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
using resmem_zd_op
    = base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
// Short names for set_memory_op instantiations.
// First suffix letter = element type: s float, d double, c complex<float>, z complex<double>.
// Second suffix letter = device: h DEVICE_CPU, d DEVICE_GPU.
using setmem_sh_op
    = base_device::memory::set_memory_op<float, base_device::DEVICE_CPU>;
using setmem_dh_op
    = base_device::memory::set_memory_op<double, base_device::DEVICE_CPU>;
using setmem_ch_op
    = base_device::memory::set_memory_op<std::complex<float>, base_device::DEVICE_CPU>;
using setmem_zh_op
    = base_device::memory::set_memory_op<std::complex<double>, base_device::DEVICE_CPU>;
using setmem_sd_op
    = base_device::memory::set_memory_op<float, base_device::DEVICE_GPU>;
using setmem_dd_op
    = base_device::memory::set_memory_op<double, base_device::DEVICE_GPU>;
using setmem_cd_op
    = base_device::memory::set_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
using setmem_zd_op
    = base_device::memory::set_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
// Short names for set_memory_2d_op instantiations.
// First suffix letter = element type: s float, d double, c complex<float>, z complex<double>.
// Second suffix letter = device: h DEVICE_CPU, d DEVICE_GPU.
using setmem_sh_2d_op
    = base_device::memory::set_memory_2d_op<float, base_device::DEVICE_CPU>;
using setmem_dh_2d_op
    = base_device::memory::set_memory_2d_op<double, base_device::DEVICE_CPU>;
using setmem_ch_2d_op
    = base_device::memory::set_memory_2d_op<std::complex<float>, base_device::DEVICE_CPU>;
using setmem_zh_2d_op
    = base_device::memory::set_memory_2d_op<std::complex<double>, base_device::DEVICE_CPU>;
using setmem_sd_2d_op
    = base_device::memory::set_memory_2d_op<float, base_device::DEVICE_GPU>;
using setmem_dd_2d_op
    = base_device::memory::set_memory_2d_op<double, base_device::DEVICE_GPU>;
using setmem_cd_2d_op
    = base_device::memory::set_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU>;
using setmem_zd_2d_op
    = base_device::memory::set_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU>;
// Short names for delete_memory_op instantiations.
// First suffix letter = element type: s float, d double, c complex<float>, z complex<double>.
// Second suffix letter = device: h DEVICE_CPU, d DEVICE_GPU.
using delmem_sh_op
    = base_device::memory::delete_memory_op<float, base_device::DEVICE_CPU>;
using delmem_dh_op
    = base_device::memory::delete_memory_op<double, base_device::DEVICE_CPU>;
using delmem_ch_op
    = base_device::memory::delete_memory_op<std::complex<float>, base_device::DEVICE_CPU>;
using delmem_zh_op
    = base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_CPU>;
using delmem_sd_op
    = base_device::memory::delete_memory_op<float, base_device::DEVICE_GPU>;
using delmem_dd_op
    = base_device::memory::delete_memory_op<double, base_device::DEVICE_GPU>;
using delmem_cd_op
    = base_device::memory::delete_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
using delmem_zd_op
    = base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
// Short names for synchronize_memory_op instantiations.
// x2x names the element type (s float, d double, c complex<float>, z complex<double>).
// The last part names the direction as source-to-destination, h = DEVICE_CPU and
// d = DEVICE_GPU; e.g. h2d has Device_out = DEVICE_GPU and Device_in = DEVICE_CPU.
// Each alias is declared exactly once: the c2c/z2z group used to be repeated
// verbatim, which was legal but redundant.
using syncmem_s2s_h2h_op
    = base_device::memory::synchronize_memory_op<float, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
using syncmem_s2s_h2d_op
    = base_device::memory::synchronize_memory_op<float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
using syncmem_s2s_d2h_op
    = base_device::memory::synchronize_memory_op<float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
using syncmem_d2d_h2h_op
    = base_device::memory::synchronize_memory_op<double, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
using syncmem_d2d_h2d_op
    = base_device::memory::synchronize_memory_op<double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
using syncmem_d2d_d2h_op
    = base_device::memory::synchronize_memory_op<double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
using syncmem_c2c_h2h_op
    = base_device::memory::synchronize_memory_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
using syncmem_c2c_h2d_op
    = base_device::memory::synchronize_memory_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
using syncmem_c2c_d2h_op
    = base_device::memory::synchronize_memory_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
using syncmem_z2z_h2h_op
    = base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
using syncmem_z2z_h2d_op
    = base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
using syncmem_z2z_d2h_op
    = base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
// Short names for synchronize_memory_2d_op instantiations (complex types only).
// c2c = complex<float>, z2z = complex<double>; the direction is source-to-destination
// with h = DEVICE_CPU and d = DEVICE_GPU (template order is <type, Device_out, Device_in>).
using syncmem_c2c_h2h_2d_op = base_device::memory::
    synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
using syncmem_c2c_h2d_2d_op = base_device::memory::
    synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
using syncmem_c2c_d2h_2d_op = base_device::memory::
    synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
using syncmem_z2z_h2h_2d_op = base_device::memory::
    synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
using syncmem_z2z_h2d_2d_op = base_device::memory::
    synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
using syncmem_z2z_d2h_2d_op = base_device::memory::
    synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
// Short names for cast_memory_op instantiations.
// a2b names the element types as source-to-destination (s float, d double,
// c complex<float>, z complex<double>); e.g. s2d has FPTYPE_out = double, FPTYPE_in = float.
// The last part names the devices as source-to-destination, h = DEVICE_CPU, d = DEVICE_GPU.
using castmem_s2d_h2h_op = base_device::memory::
    cast_memory_op<double, float, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
using castmem_s2d_h2d_op = base_device::memory::
    cast_memory_op<double, float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
using castmem_s2d_d2h_op = base_device::memory::
    cast_memory_op<double, float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
using castmem_d2s_h2h_op = base_device::memory::
    cast_memory_op<float, double, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
using castmem_d2s_h2d_op = base_device::memory::
    cast_memory_op<float, double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
using castmem_d2s_d2h_op = base_device::memory::
    cast_memory_op<float, double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
using castmem_c2z_h2h_op
    = base_device::memory::cast_memory_op<std::complex<double>, std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
using castmem_c2z_h2d_op
    = base_device::memory::cast_memory_op<std::complex<double>, std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
using castmem_c2z_d2h_op
    = base_device::memory::cast_memory_op<std::complex<double>, std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
using castmem_z2c_h2h_op
    = base_device::memory::cast_memory_op<std::complex<float>, std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
using castmem_z2c_h2d_op
    = base_device::memory::cast_memory_op<std::complex<float>, std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
using castmem_z2c_d2h_op
    = base_device::memory::cast_memory_op<std::complex<float>, std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
// Null device-context pointers, one per device tag type. Being `static` at
// namespace scope in a header, every translation unit that includes this file
// gets its own copy. NOTE(review): presumably used only as tag arguments and
// never dereferenced -- confirm against the callers.
static base_device::DEVICE_CPU* cpu_ctx = nullptr;
static base_device::DEVICE_GPU* gpu_ctx = nullptr;
#endif // MODULE_DEVICE_MEMORY_H_