Skip to content

Commit 83573ac

Browse files
authored
Merge pull request #40 from morousg/37-move-benchmarks-that-do-not-require-opencv-from-cvgpuspeedup-to-th-fkl-repository
37 move benchmarks that do not require opencv from cvgpuspeedup to th fkl repository
2 parents ea35c57 + 9c574ec commit 83573ac

File tree

15 files changed

+1180
-30
lines changed

15 files changed

+1180
-30
lines changed

benchmarks/fkBenchmarksCommon.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515
#include <array>
1616

1717
#include <fused_kernel/core/utils/vlimits.h>
18+
#include <fused_kernel/core/data/ptr_nd.cuh>
19+
#include <fused_kernel/algorithms/basic_ops/logical.cuh>
20+
21+
#include <iostream>
1822

1923
template <size_t START_VALUE, size_t INCREMENT, std::size_t... Is>
2024
constexpr std::array<size_t, sizeof...(Is)> generate_sequence(std::index_sequence<Is...>) {
@@ -23,3 +27,21 @@ constexpr std::array<size_t, sizeof...(Is)> generate_sequence(std::index_sequenc
2327

2428
// Compile-time array of NUM_ELEMS size_t values produced by generate_sequence
// (presumably START_VALUE + i * INCREMENT for i in [0, NUM_ELEMS) — the
// generator body is not visible in this chunk; confirm before relying on it).
// NOTE(review): "Secuence" is a typo, kept because callers use this exact name.
template <size_t START_VALUE, size_t INCREMENT, size_t NUM_ELEMS>
constexpr std::array<size_t, NUM_ELEMS> arrayIndexSecuence = generate_sequence<START_VALUE, INCREMENT>(std::make_index_sequence<NUM_ELEMS>{});
30+
31+
// Element-wise comparison of two 2D host buffers.
// Returns true only when both buffers share the same width/height and every
// element compares equal through fk::Equal<T>; on the first difference it
// prints a diagnostic and stops.
template <typename T>
inline bool compareAndCheck(const fk::Ptr2D<T>& firstResult, const fk::Ptr2D<T>& secondResult) {
    const auto firstDims = firstResult.dims();
    const auto secondDims = secondResult.dims();
    // Guard clause: dimension mismatch makes element comparison meaningless.
    if (firstDims.width != secondDims.width || firstDims.height != secondDims.height) {
        std::cout << "Dimensions do not match: " << firstDims.width << "x" << firstDims.height
                  << " vs " << secondDims.width << "x" << secondDims.height << std::endl;
        return false;
    }
    for (uint row = 0; row < firstDims.height; ++row) {
        for (uint col = 0; col < firstDims.width; ++col) {
            const fk::Point location(col, row);
            const bool elementsEqual =
                fk::Equal<T>::exec(fk::make_tuple(firstResult.at(location), secondResult.at(location)));
            if (!elementsEqual) {
                std::cout << "Mismatch at (" << col << ", " << row << ") " << std::endl;
                return false;
            }
        }
    }
    return true;
}
Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
/* Copyright 2025 Grup Mediapro S.L.U (Oscar Amoros Huguet)
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License. */
14+
15+
#include <tests/main.h>
16+
17+
#include <benchmarks/fkBenchmarksCommon.h>
18+
#include <benchmarks/twoExecutionsBenchmark.h>
19+
20+
#include <fused_kernel/core/data/ptr_nd.cuh>
21+
#include <fused_kernel/algorithms/algorithms.cuh>
22+
#include <fused_kernel/fused_kernel.cuh>
23+
24+
// Human-readable name of the swept benchmark dimension (presumably consumed by
// the benchmark/reporting macros in twoExecutionsBenchmark.h — consumer not
// visible in this file; confirm).
constexpr char VARIABLE_DIMENSION_NAME[]{ "Batch size" };

// Sweep configuration: NUM_EXPERIMENTS batch sizes, starting at FIRST_VALUE
// and stepping by INCREMENT via arrayIndexSecuence.
constexpr size_t NUM_EXPERIMENTS = 10; // Used 100 in the paper
constexpr size_t FIRST_VALUE = 1;
constexpr size_t INCREMENT = 5;

// Concrete batch sizes used to instantiate the benchmark templates below.
constexpr std::array<size_t, NUM_EXPERIMENTS> variableDimensionValues = arrayIndexSecuence<FIRST_VALUE, INCREMENT, NUM_EXPERIMENTS>;
32+
/* Benchmarks the pipeline SaturateCast -> Mul -> Sub -> Div executed once per
   crop (iterated launches, including the per-launch CPU build overhead) versus
   a single horizontally-fused batch execution writing into one Tensor, then
   verifies both paths produce identical results via compareAndCheck.
   NUM_ELEMS_X / NUM_ELEMS_Y: dimensions of the source image the BATCH crops are
   taken from. Returns true when every fused plane matches its iterated twin.
   Fixes vs. original: removed an unused h_cvGSResults array (BATCH wasted
   pinned-host allocations), dropped a redundant std::stringstream used only to
   hold a constant name, and removed a stray '>' that made the exception
   message inconsistent with the non-exception branch. */
template <size_t BATCH>
bool benchmark_Horizontal_Fusion(const size_t& NUM_ELEMS_X, const size_t& NUM_ELEMS_Y, cudaStream_t stream) {
    // Referenced by the START/STOP benchmark macros to label each timed series.
    constexpr std::string_view FIRST_LABEL{ "Iterated Batch" };
    constexpr std::string_view SECOND_LABEL{ "Fused Batch" };
    std::stringstream error_s;
    bool passed = true;
    bool exception = false;

    using InputType = uchar;
    using OutputType = float;

    const InputType val_init = 10u;
    const OutputType val_alpha = 1.0f;
    const OutputType val_sub = 1.f;
    const OutputType val_div = 3.2f;
    try {
        const fk::Size cropSize(60, 120);
        fk::Ptr2D<InputType> d_input((int)NUM_ELEMS_Y, (int)NUM_ELEMS_X);
        fk::setTo(val_init, d_input, stream);
        std::array<fk::Ptr2D<OutputType>, BATCH> d_output_cv;
        std::array<fk::Ptr2D<OutputType>, BATCH> h_cvResults;

        fk::Tensor<OutputType> d_tensor_output(cropSize.width, cropSize.height, BATCH);
        fk::Tensor<OutputType> h_tensor_output(cropSize.width, cropSize.height, BATCH, 1, fk::MemType::HostPinned);

        std::array<fk::Ptr2D<InputType>, BATCH> crops;
        for (int crop_i = 0; crop_i < BATCH; crop_i++) {
            // Each crop starts at (i, i) so batch elements are distinct views of d_input.
            crops[crop_i] = d_input.crop(fk::Point(crop_i, crop_i), fk::PtrDims<fk::_2D>{static_cast<uint>(cropSize.width),
                                                                                         static_cast<uint>(cropSize.height),
                                                                                         static_cast<uint>(d_input.dims().pitch)});
            d_output_cv[crop_i].Alloc(cropSize, 0, fk::MemType::Device);
            h_cvResults[crop_i].Alloc(cropSize, 0, fk::MemType::HostPinned);
        }

        START_FIRST_BENCHMARK
        // Baseline: one executeOperations call (build + launch) per crop.
        for (int crop_i = 0; crop_i < BATCH; crop_i++) {
            fk::executeOperations(crops[crop_i], stream,
                                  fk::SaturateCast<InputType, OutputType>::build(),
                                  fk::Mul<OutputType>::build(val_alpha),
                                  fk::Sub<OutputType>::build(val_sub),
                                  fk::Div<OutputType>::build(val_div),
                                  fk::PerThreadWrite<fk::_2D, OutputType>::build(d_output_cv[crop_i]));
        }
        STOP_FIRST_START_SECOND_BENCHMARK
        // Fused: the whole batch of crops processed by a single execution.
        fk::executeOperations(crops, stream,
                              fk::SaturateCast<InputType, OutputType>::build(),
                              fk::Mul<OutputType>::build(val_alpha),
                              fk::Sub<OutputType>::build(val_sub),
                              fk::Div<OutputType>::build(val_div),
                              fk::TensorWrite<OutputType>::build(d_tensor_output));
        STOP_SECOND_BENCHMARK

        d_tensor_output.download(h_tensor_output, stream);

        // Verify results
        for (int crop_i = 0; crop_i < BATCH; crop_i++) {
            d_output_cv[crop_i].download(h_cvResults[crop_i], stream);
        }

        // Downloads above are asynchronous on 'stream'; wait before reading host data.
        gpuErrchk(cudaStreamSynchronize(stream));

        for (int crop_i = 0; crop_i < BATCH; crop_i++) {
            fk::Ptr2D<OutputType> cvRes = h_cvResults[crop_i];
            fk::Ptr2D<OutputType> cvGSRes = h_tensor_output.getPlane(crop_i);
            const bool passedThisTime = compareAndCheck(cvRes, cvGSRes);
            if (!passedThisTime) { std::cout << "Failed on crop idx=" << crop_i << std::endl; }
            passed &= passedThisTime;
        }
    } catch (const std::exception& e) {
        error_s << e.what();
        passed = false;
        exception = true;
    }

    if (!passed) {
        if (!exception) {
            std::cout << "benchmark_Horizontal_Fusion failed!! RESULT ERROR: Some results do not match baseline." << std::endl;
        } else {
            std::cout << "benchmark_Horizontal_Fusion failed!! EXCEPTION: " << error_s.str() << std::endl;
        }
    }

    return passed;
}
121+
122+
/* Same comparison as benchmark_Horizontal_Fusion, but all operations are built
   once BEFORE the timed regions, so the measurement excludes the CPU-side
   operation-building overhead and times only the launches/GPU work.
   NUM_ELEMS_X / NUM_ELEMS_Y: dimensions of the source image the BATCH crops are
   taken from. Returns true when every fused plane matches its iterated twin.
   Fixes vs. original: removed an unused h_cvGSResults array (BATCH wasted
   pinned-host allocations), dropped a redundant std::stringstream used only to
   hold a constant name, and removed a stray '>' that made the exception
   message inconsistent with the non-exception branch. */
template <size_t BATCH>
bool benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD(const size_t& NUM_ELEMS_X, const size_t& NUM_ELEMS_Y, cudaStream_t stream) {
    // Referenced by the START/STOP benchmark macros to label each timed series.
    constexpr std::string_view FIRST_LABEL{ "Iterated Batch" };
    constexpr std::string_view SECOND_LABEL{ "Fused Batch" };
    std::stringstream error_s;
    bool passed = true;
    bool exception = false;

    using InputType = uchar;
    using OutputType = float;

    const InputType val_init = 10u;
    const OutputType val_alpha = 1.0f;
    const OutputType val_sub = 1.f;
    const OutputType val_div = 3.2f;
    try {
        const fk::Size cropSize(60, 120);
        fk::Ptr2D<InputType> d_input((int)NUM_ELEMS_Y, (int)NUM_ELEMS_X);
        fk::setTo(val_init, d_input, stream);
        std::array<fk::Ptr2D<OutputType>, BATCH> d_output_cv;
        std::array<fk::Ptr2D<OutputType>, BATCH> h_cvResults;

        fk::Tensor<OutputType> d_tensor_output(cropSize.width, cropSize.height, BATCH);
        fk::Tensor<OutputType> h_tensor_output(cropSize.width, cropSize.height, BATCH, 1, fk::MemType::HostPinned);

        std::array<fk::Ptr2D<InputType>, BATCH> crops;
        for (int crop_i = 0; crop_i < BATCH; crop_i++) {
            // Each crop starts at (i, i) so batch elements are distinct views of d_input.
            crops[crop_i] = d_input.crop(fk::Point(crop_i, crop_i), fk::PtrDims<fk::_2D>{static_cast<uint>(cropSize.width),
                                                                                         static_cast<uint>(cropSize.height),
                                                                                         static_cast<uint>(d_input.dims().pitch)});
            d_output_cv[crop_i].Alloc(cropSize, 0, fk::MemType::Device);
            h_cvResults[crop_i].Alloc(cropSize, 0, fk::MemType::HostPinned);
        }

        // Read Ops — built outside the timed regions on purpose.
        const auto read_array = fk::PerThreadRead<fk::_2D, InputType>::build_batch(crops);
        const auto read = fk::PerThreadRead<fk::_2D, InputType>::build(crops);

        // Compute Ops
        const auto saturate = fk::SaturateCast<InputType, OutputType>::build();
        const auto mul = fk::Mul<OutputType>::build(val_alpha);
        const auto sub = fk::Sub<OutputType>::build(val_sub);
        const auto div = fk::Div<OutputType>::build(val_div);

        // Write Ops
        const auto write_array = fk::PerThreadWrite<fk::_2D, OutputType>::build_batch(d_output_cv);
        const auto write = fk::TensorWrite<OutputType>::build(d_tensor_output);

        START_FIRST_BENCHMARK
        // Baseline: one pre-built pipeline launch per crop.
        for (int crop_i = 0; crop_i < BATCH; crop_i++) {
            fk::executeOperations(stream, read_array[crop_i], saturate,
                                  mul, sub, div, write_array[crop_i]);
        }
        STOP_FIRST_START_SECOND_BENCHMARK
        // Fused: single pre-built batch execution.
        fk::executeOperations(stream, read, saturate, mul, sub, div, write);
        STOP_SECOND_BENCHMARK

        d_tensor_output.download(h_tensor_output, stream);

        // Verify results
        for (int crop_i = 0; crop_i < BATCH; crop_i++) {
            d_output_cv[crop_i].download(h_cvResults[crop_i], stream);
        }

        // Downloads above are asynchronous on 'stream'; wait before reading host data.
        gpuErrchk(cudaStreamSynchronize(stream));

        for (int crop_i = 0; crop_i < BATCH; crop_i++) {
            fk::Ptr2D<OutputType> cvRes = h_cvResults[crop_i];
            fk::Ptr2D<OutputType> cvGSRes = h_tensor_output.getPlane(crop_i);
            const bool passedThisTime = compareAndCheck(cvRes, cvGSRes);
            if (!passedThisTime) { std::cout << "Failed on crop idx=" << crop_i << std::endl; }
            passed &= passedThisTime;
        }
    } catch (const std::exception& e) {
        error_s << e.what();
        passed = false;
        exception = true;
    }

    if (!passed) {
        if (!exception) {
            std::cout << "benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD failed!! RESULT ERROR: Some results do not match baseline." << std::endl;
        } else {
            std::cout << "benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD failed!! EXCEPTION: " << error_s.str() << std::endl;
        }
    }

    return passed;
}
216+
217+
/* Instantiates and runs benchmark_Horizontal_Fusion for every batch size in
   variableDimensionValues. Returns true only if ALL experiments pass.
   Fix vs. original: the previous `(... && ...)` fold short-circuited, so the
   first failing batch size silently skipped every remaining experiment. The
   comma fold below runs all experiments left-to-right and accumulates the
   result. The index_sequence parameter only carries Is... and is deliberately
   unnamed to avoid an unused-parameter warning. */
template <size_t... Is>
bool launch_benchmark_Horizontal_Fusion(const size_t& NUM_ELEMS_X, const size_t& NUM_ELEMS_Y, const std::index_sequence<Is...>&, cudaStream_t stream) {
    bool passed = true;

    // Left operand of && is always evaluated, so every benchmark executes.
    ((passed = benchmark_Horizontal_Fusion<variableDimensionValues[Is]>(NUM_ELEMS_X, NUM_ELEMS_Y, stream) && passed), ...);

    return passed;
}
225+
226+
/* Instantiates and runs benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD for every
   batch size in variableDimensionValues. Returns true only if ALL pass.
   Fix vs. original: the `(... && ...)` fold short-circuited after the first
   failure, skipping the remaining experiments; the comma fold runs them all
   left-to-right and accumulates the result. The index_sequence parameter only
   carries Is... and is deliberately unnamed. */
template <size_t... Is>
bool launch_benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD(const size_t& NUM_ELEMS_X, const size_t& NUM_ELEMS_Y, const std::index_sequence<Is...>&, cudaStream_t stream) {
    bool passed = true;

    // Left operand of && is always evaluated, so every benchmark executes.
    ((passed = benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD<variableDimensionValues[Is]>(NUM_ELEMS_X, NUM_ELEMS_Y, stream) && passed), ...);

    return passed;
}
234+
235+
/* Benchmark entry point: creates a dedicated CUDA stream, runs a warmup pass
   of both horizontal-fusion benchmark sweeps, then the measured pass, and
   destroys the stream. Always returns 0. */
int launch() {
    constexpr size_t NUM_ELEMS_X = 3840;
    constexpr size_t NUM_ELEMS_Y = 2160;
    cudaStream_t stream;
    gpuErrchk(cudaStreamCreate(&stream));

    // Both sweeps share the same experiment indices and stream.
    const auto runBothSweeps = [&]() {
        constexpr auto experimentIndices = std::make_index_sequence<NUM_EXPERIMENTS>{};
        launch_benchmark_Horizontal_Fusion(NUM_ELEMS_X, NUM_ELEMS_Y, experimentIndices, stream);
        launch_benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD(NUM_ELEMS_X, NUM_ELEMS_Y, experimentIndices, stream);
    };

    // 'warmup' is a global defined elsewhere; presumably the benchmark macros
    // discard timings while it is set — confirm in twoExecutionsBenchmark.h.
    warmup = true;
    runBothSweeps();
    warmup = false;

    runBothSweeps();

    gpuErrchk(cudaStreamDestroy(stream));

    return 0;
}

0 commit comments

Comments
 (0)