Skip to content

Commit 968e537

Browse files
committed
Perf: Replace loop-based 2D copy and memset with memcpy_2d_op, memset_2d_op
Signed-off-by:Tianxiang Wang<[email protected]>, Contributed under MetaX Integrated Circuits (Shanghai) Co., Ltd.
1 parent b9bb679 commit 968e537

File tree

5 files changed

+300
-17
lines changed

5 files changed

+300
-17
lines changed

source/source_base/module_device/cuda/memory_op.cu

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,16 @@ void set_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(FPTYPE* arr,
8585
cudaErrcheck(cudaMemset(arr, var, sizeof(FPTYPE) * size));
8686
}
8787

88+
template <typename FPTYPE>
89+
void set_memory_2d_op<FPTYPE, base_device::DEVICE_GPU>::operator()(FPTYPE* arr,
90+
const size_t pitch,
91+
const int var,
92+
const size_t width,
93+
const size_t height)
94+
{
95+
cudaErrcheck(cudaMemset2D(arr, sizeof(FPTYPE) * pitch , var, sizeof(FPTYPE) * width, height));
96+
}
97+
8898
template <typename FPTYPE>
8999
void synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>::operator()(
90100
FPTYPE* arr_out,
@@ -112,6 +122,42 @@ void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_
112122
cudaErrcheck(cudaMemcpy(arr_out, arr_in, sizeof(FPTYPE) * size, cudaMemcpyDeviceToDevice));
113123
}
114124

125+
template <typename FPTYPE>
126+
void synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>::operator()(
127+
FPTYPE* arr_out,
128+
const size_t dpitch,
129+
const FPTYPE* arr_in,
130+
const size_t spitch,
131+
const size_t width,
132+
const size_t height)
133+
{
134+
cudaErrcheck(cudaMemcpy2D(arr_out, dpitch * sizeof(FPTYPE), arr_in, spitch * sizeof(FPTYPE), width * sizeof(FPTYPE), height, cudaMemcpyDeviceToHost));
135+
}
136+
137+
template <typename FPTYPE>
138+
void synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>::operator()(
139+
FPTYPE* arr_out,
140+
const size_t dpitch,
141+
const FPTYPE* arr_in,
142+
const size_t spitch,
143+
const size_t width,
144+
const size_t height)
145+
{
146+
cudaErrcheck(cudaMemcpy2D(arr_out, dpitch * sizeof(FPTYPE), arr_in, spitch * sizeof(FPTYPE), width * sizeof(FPTYPE), height, cudaMemcpyHostToDevice));
147+
}
148+
149+
template <typename FPTYPE>
150+
void synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>::operator()(
151+
FPTYPE* arr_out,
152+
const size_t dpitch,
153+
const FPTYPE* arr_in,
154+
const size_t spitch,
155+
const size_t width,
156+
const size_t height)
157+
{
158+
cudaErrcheck(cudaMemcpy2D(arr_out, dpitch * sizeof(FPTYPE), arr_in, spitch * sizeof(FPTYPE), width * sizeof(FPTYPE), height, cudaMemcpyDeviceToDevice));
159+
}
160+
115161
template <typename FPTYPE_out, typename FPTYPE_in>
116162
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
117163
{
@@ -196,6 +242,12 @@ template struct set_memory_op<double, base_device::DEVICE_GPU>;
196242
template struct set_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
197243
template struct set_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
198244

245+
template struct set_memory_2d_op<int, base_device::DEVICE_GPU>;
246+
template struct set_memory_2d_op<float, base_device::DEVICE_GPU>;
247+
template struct set_memory_2d_op<double, base_device::DEVICE_GPU>;
248+
template struct set_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU>;
249+
template struct set_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU>;
250+
199251
template struct synchronize_memory_op<int, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
200252
template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
201253
template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
@@ -212,6 +264,22 @@ template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_
212264
template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
213265
template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
214266

267+
template struct synchronize_memory_2d_op<int, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
268+
template struct synchronize_memory_2d_op<int, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
269+
template struct synchronize_memory_2d_op<int, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
270+
template struct synchronize_memory_2d_op<float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
271+
template struct synchronize_memory_2d_op<float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
272+
template struct synchronize_memory_2d_op<float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
273+
template struct synchronize_memory_2d_op<double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
274+
template struct synchronize_memory_2d_op<double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
275+
template struct synchronize_memory_2d_op<double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
276+
template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
277+
template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
278+
template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
279+
template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
280+
template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
281+
template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
282+
215283
template struct cast_memory_op<float, float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
216284
template struct cast_memory_op<double, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
217285
template struct cast_memory_op<float, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;

source/source_base/module_device/memory_op.cpp

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,18 @@ struct set_memory_op<FPTYPE, base_device::DEVICE_CPU>
5555
}
5656
};
5757

58+
template <typename FPTYPE>
59+
struct set_memory_2d_op<FPTYPE, base_device::DEVICE_CPU>
60+
{
61+
void operator()(FPTYPE* arr, const size_t pitch, const int var, const size_t width, const size_t height)
62+
{
63+
for (size_t i = 0; i < height; i++){
64+
set_memory_op<FPTYPE, base_device::DEVICE_CPU>()(arr + i * pitch, var, width);
65+
}
66+
}
67+
};
68+
69+
5870
template <typename FPTYPE>
5971
struct synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_CPU>
6072
{
@@ -70,6 +82,23 @@ struct synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVIC
7082
}
7183
};
7284

85+
template <typename FPTYPE>
86+
struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_CPU>
87+
{
88+
void operator()(FPTYPE* arr_out,
89+
const size_t dpitch,
90+
const FPTYPE* arr_in,
91+
const size_t spitch,
92+
const size_t width,
93+
const size_t height)
94+
{
95+
for (int i = 0; i < height; i++){
96+
synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_CPU>()(
97+
arr_out + i * dpitch, arr_in + i * spitch, width);
98+
}
99+
}
100+
};
101+
73102
template <typename FPTYPE_out, typename FPTYPE_in>
74103
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_device::DEVICE_CPU>
75104
{
@@ -108,12 +137,24 @@ template struct set_memory_op<double, base_device::DEVICE_CPU>;
108137
template struct set_memory_op<std::complex<float>, base_device::DEVICE_CPU>;
109138
template struct set_memory_op<std::complex<double>, base_device::DEVICE_CPU>;
110139

140+
template struct set_memory_2d_op<int, base_device::DEVICE_CPU>;
141+
template struct set_memory_2d_op<float, base_device::DEVICE_CPU>;
142+
template struct set_memory_2d_op<double, base_device::DEVICE_CPU>;
143+
template struct set_memory_2d_op<std::complex<float>, base_device::DEVICE_CPU>;
144+
template struct set_memory_2d_op<std::complex<double>, base_device::DEVICE_CPU>;
145+
111146
template struct synchronize_memory_op<int, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
112147
template struct synchronize_memory_op<float, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
113148
template struct synchronize_memory_op<double, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
114149
template struct synchronize_memory_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
115150
template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
116151

152+
template struct synchronize_memory_2d_op<int, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
153+
template struct synchronize_memory_2d_op<float, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
154+
template struct synchronize_memory_2d_op<double, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
155+
template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
156+
template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
157+
117158
template struct cast_memory_op<float, float, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
118159
template struct cast_memory_op<double, double, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
119160
template struct cast_memory_op<float, double, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
@@ -167,6 +208,14 @@ struct set_memory_op<FPTYPE, base_device::DEVICE_GPU>
167208
}
168209
};
169210

211+
template <typename FPTYPE>
212+
struct set_memory_2d_op<FPTYPE, base_device::DEVICE_GPU>
213+
{
214+
void operator()(FPTYPE* arr, const size_t pitch, const int var, const size_t width, const size_t height)
215+
{
216+
}
217+
};
218+
170219
template <typename FPTYPE>
171220
struct synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
172221
{
@@ -197,6 +246,48 @@ struct synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVIC
197246
}
198247
};
199248

249+
template <typename FPTYPE>
250+
struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
251+
{
252+
void operator()(const Device_in* dev_in,
253+
FPTYPE* arr_out,
254+
const size_t dpitch,
255+
const FPTYPE* arr_in,
256+
const size_t spitch,
257+
const size_t width,
258+
const size_t height)
259+
{
260+
}
261+
};
262+
263+
template <typename FPTYPE>
264+
struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>
265+
{
266+
void operator()(const Device_in* dev_in,
267+
FPTYPE* arr_out,
268+
const size_t dpitch,
269+
const FPTYPE* arr_in,
270+
const size_t spitch,
271+
const size_t width,
272+
const size_t height)
273+
{
274+
}
275+
};
276+
277+
template <typename FPTYPE>
278+
struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
279+
{
280+
void operator()(const Device_in* dev_in,
281+
FPTYPE* arr_out,
282+
const size_t dpitch,
283+
const FPTYPE* arr_in,
284+
const size_t spitch,
285+
const size_t width,
286+
const size_t height)
287+
{
288+
}
289+
};
290+
200291
template <typename FPTYPE_out, typename FPTYPE_in>
201292
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
202293
{
@@ -247,6 +338,12 @@ template struct set_memory_op<double, base_device::DEVICE_GPU>;
247338
template struct set_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
248339
template struct set_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
249340

341+
template struct set_memory_2d_op<int, base_device::DEVICE_GPU>;
342+
template struct set_memory_2d_op<float, base_device::DEVICE_GPU>;
343+
template struct set_memory_2d_op<double, base_device::DEVICE_GPU>;
344+
template struct set_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU>;
345+
template struct set_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU>;
346+
250347
template struct synchronize_memory_op<int, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
251348
template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
252349
template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
@@ -263,6 +360,22 @@ template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_
263360
template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
264361
template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
265362

363+
template struct synchronize_memory_2d_op<int, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
364+
template struct synchronize_memory_2d_op<int, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
365+
template struct synchronize_memory_2d_op<int, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
366+
template struct synchronize_memory_2d_op<float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
367+
template struct synchronize_memory_2d_op<float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
368+
template struct synchronize_memory_2d_op<float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
369+
template struct synchronize_memory_2d_op<double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
370+
template struct synchronize_memory_2d_op<double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
371+
template struct synchronize_memory_2d_op<double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
372+
template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
373+
template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
374+
template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
375+
template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
376+
template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
377+
template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
378+
266379
template struct cast_memory_op<float, float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
267380
template struct cast_memory_op<double, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
268381
template struct cast_memory_op<float, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;

0 commit comments

Comments
 (0)