Skip to content

Commit a5eec88

Browse files
committed
Merge branch 'main' into 38-fix-cmake-generation-stages
* main: Fixed test_thread_fusion Fixed comments typos Reduced the compilation times in the benchmark Fixed a typo Finished adding all thread fusion benchmarks Adding different types benchmark for thread fusion Added one thread fusion benchmark Adding thread fusion tests Fixing Equal for vector types Adding type_to_string.h Adding nvtx header Some fixes in common files for benchmarks Added benchmark with and without CPU overhead for horizontal fusion Fixed a stack overflow when compiling in Debug mode Added test and fixed a bug in Executor Adding convenience methods to Ptr class Work in progress Starting with the horizontal fusion test, WIP
2 parents 512db53 + 83573ac commit a5eec88

File tree

15 files changed

+1180
-30
lines changed

15 files changed

+1180
-30
lines changed

benchmarks/fkBenchmarksCommon.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515
#include <array>
1616

1717
#include <fused_kernel/core/utils/vlimits.h>
18+
#include <fused_kernel/core/data/ptr_nd.cuh>
19+
#include <fused_kernel/algorithms/basic_ops/logical.cuh>
20+
21+
#include <iostream>
1822

1923
template <size_t START_VALUE, size_t INCREMENT, std::size_t... Is>
2024
constexpr std::array<size_t, sizeof...(Is)> generate_sequence(std::index_sequence<Is...>) {
@@ -23,3 +27,21 @@ constexpr std::array<size_t, sizeof...(Is)> generate_sequence(std::index_sequenc
2327

2428
// Compile-time array of NUM_ELEMS values: START_VALUE, START_VALUE + INCREMENT, ...
// (an arithmetic sequence built via generate_sequence; note the identifier typo
// "Secuence" is kept because callers elsewhere depend on the name).
template <size_t START_VALUE, size_t INCREMENT, size_t NUM_ELEMS>
2529
constexpr std::array<size_t, NUM_ELEMS> arrayIndexSecuence = generate_sequence<START_VALUE, INCREMENT>(std::make_index_sequence<NUM_ELEMS>{});
30+
31+
// Compares two 2D host buffers element by element.
// Returns true when dimensions and every element match; otherwise reports the
// first discrepancy (dimension mismatch or first differing pixel) and returns false.
template <typename T>
inline bool compareAndCheck(const fk::Ptr2D<T>& firstResult, const fk::Ptr2D<T>& secondResult) {
    const auto firstDims = firstResult.dims();
    const auto secondDims = secondResult.dims();
    // Guard clause: differing shapes can never compare equal.
    if (firstDims.width != secondDims.width || firstDims.height != secondDims.height) {
        std::cout << "Dimensions do not match: " << firstDims.width << "x" << firstDims.height
                  << " vs " << secondDims.width << "x" << secondDims.height << std::endl;
        return false;
    }
    for (uint y = 0; y < firstDims.height; ++y) {
        for (uint x = 0; x < firstDims.width; ++x) {
            const fk::Point currentPoint(x, y);
            const bool elementsEqual = fk::Equal<T>::exec(
                fk::make_tuple(firstResult.at(currentPoint), secondResult.at(currentPoint)));
            if (!elementsEqual) {
                std::cout << "Mismatch at (" << x << ", " << y << ") " << std::endl;
                return false;
            }
        }
    }
    return true;
}
Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
/* Copyright 2025 Grup Mediapro S.L.U (Oscar Amoros Huguet)
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License. */
14+
15+
#include <tests/main.h>
16+
17+
#include <benchmarks/fkBenchmarksCommon.h>
18+
#include <benchmarks/twoExecutionsBenchmark.h>
19+
20+
#include <fused_kernel/core/data/ptr_nd.cuh>
21+
#include <fused_kernel/algorithms/algorithms.cuh>
22+
#include <fused_kernel/fused_kernel.cuh>
23+
24+
// Label reported by the benchmark machinery for the dimension being swept.
constexpr char VARIABLE_DIMENSION_NAME[]{ "Batch size" };
25+
26+
// Number of batch-size experiments in the sweep.
constexpr size_t NUM_EXPERIMENTS = 10; // Used 100 in the paper
27+
// First batch size tested.
constexpr size_t FIRST_VALUE = 1;
28+
// Step between consecutive batch sizes.
constexpr size_t INCREMENT = 5;
29+
30+
// Batch sizes under test: FIRST_VALUE, FIRST_VALUE + INCREMENT, ... (NUM_EXPERIMENTS values).
constexpr std::array<size_t, NUM_EXPERIMENTS> variableDimensionValues = arrayIndexSecuence<FIRST_VALUE, INCREMENT, NUM_EXPERIMENTS>;
31+
32+
/* Benchmarks horizontal fusion: times BATCH independent executeOperations calls
   (one kernel launch per crop) against a single fused batched call that writes
   all planes into one Tensor, using the START/STOP benchmark macros. Both
   result sets are downloaded and compared plane by plane.
   NUM_ELEMS_X / NUM_ELEMS_Y: width/height of the source image crops are taken from.
   stream: CUDA stream all work is enqueued on (synchronized before verification).
   Returns true when every fused plane matches its iterated baseline. */
template <size_t BATCH>
bool benchmark_Horizontal_Fusion(const size_t& NUM_ELEMS_X, const size_t& NUM_ELEMS_Y, cudaStream_t stream) {
    // Labels consumed by the benchmark macros below.
    constexpr std::string_view FIRST_LABEL{ "Iterated Batch" };
    constexpr std::string_view SECOND_LABEL{ "Fused Batch" };
    std::stringstream error_s;
    bool passed = true;
    bool exception = false;

    using InputType = uchar;
    using OutputType = float;

    const InputType val_init = 10u;
    const OutputType val_alpha = 1.0f;
    const OutputType val_sub = 1.f;
    const OutputType val_div = 3.2f;
    try {
        const fk::Size cropSize(60, 120);
        fk::Ptr2D<InputType> d_input((int)NUM_ELEMS_Y, (int)NUM_ELEMS_X);
        fk::setTo(val_init, d_input, stream);
        std::array<fk::Ptr2D<OutputType>, BATCH> d_output_cv;
        std::array<fk::Ptr2D<OutputType>, BATCH> h_cvResults;
        // NOTE: removed unused local h_cvGSResults (never written or read).

        fk::Tensor<OutputType> d_tensor_output(cropSize.width, cropSize.height, BATCH);
        fk::Tensor<OutputType> h_tensor_output(cropSize.width, cropSize.height, BATCH, 1, fk::MemType::HostPinned);

        // Each crop starts at a different (crop_i, crop_i) offset so the batch
        // is not trivially uniform.
        std::array<fk::Ptr2D<InputType>, BATCH> crops;
        for (int crop_i = 0; crop_i < BATCH; crop_i++) {
            crops[crop_i] = d_input.crop(fk::Point(crop_i, crop_i),
                                         fk::PtrDims<fk::_2D>{static_cast<uint>(cropSize.width),
                                                              static_cast<uint>(cropSize.height),
                                                              static_cast<uint>(d_input.dims().pitch)});
            d_output_cv[crop_i].Alloc(cropSize, 0, fk::MemType::Device);
            h_cvResults[crop_i].Alloc(cropSize, 0, fk::MemType::HostPinned);
        }

        START_FIRST_BENCHMARK
        // Baseline: one executeOperations call (one launch) per crop.
        for (int crop_i = 0; crop_i < BATCH; crop_i++) {
            fk::executeOperations(crops[crop_i], stream,
                                  fk::SaturateCast<InputType, OutputType>::build(),
                                  fk::Mul<OutputType>::build(val_alpha),
                                  fk::Sub<OutputType>::build(val_sub),
                                  fk::Div<OutputType>::build(val_div),
                                  fk::PerThreadWrite<fk::_2D, OutputType>::build(d_output_cv[crop_i]));
        }
        STOP_FIRST_START_SECOND_BENCHMARK
        // Fused: the whole batch in a single call, writing into the tensor.
        fk::executeOperations(crops, stream,
                              fk::SaturateCast<InputType, OutputType>::build(),
                              fk::Mul<OutputType>::build(val_alpha),
                              fk::Sub<OutputType>::build(val_sub),
                              fk::Div<OutputType>::build(val_div),
                              fk::TensorWrite<OutputType>::build(d_tensor_output));
        STOP_SECOND_BENCHMARK

        d_tensor_output.download(h_tensor_output, stream);

        // Verify results
        for (int crop_i = 0; crop_i < BATCH; crop_i++) {
            d_output_cv[crop_i].download(h_cvResults[crop_i], stream);
        }

        gpuErrchk(cudaStreamSynchronize(stream));

        for (int crop_i = 0; crop_i < BATCH; crop_i++) {
            fk::Ptr2D<OutputType> cvRes = h_cvResults[crop_i];
            fk::Ptr2D<OutputType> cvGSRes = h_tensor_output.getPlane(crop_i);
            bool passedThisTime = compareAndCheck(cvRes, cvGSRes);
            if (!passedThisTime) { std::cout << "Failed on crop idx=" << crop_i << std::endl; }
            passed &= passedThisTime;
        }
    } catch (const std::exception& e) {
        error_s << e.what();
        passed = false;
        exception = true;
    }

    if (!passed) {
        if (!exception) {
            std::cout << "benchmark_Horizontal_Fusion failed!! RESULT ERROR: Some results do not match baseline." << std::endl;
        } else {
            // Fixed: stray '>' removed so both failure messages share one format.
            std::cout << "benchmark_Horizontal_Fusion failed!! EXCEPTION: " << error_s.str() << std::endl;
        }
    }

    return passed;
}
121+
122+
/* Same benchmark as benchmark_Horizontal_Fusion, but all operation objects
   (reads, computes, writes) are built BEFORE the timed sections, so the
   measurement excludes the CPU-side cost of constructing the operation graphs.
   NUM_ELEMS_X / NUM_ELEMS_Y: width/height of the source image crops are taken from.
   stream: CUDA stream all work is enqueued on (synchronized before verification).
   Returns true when every fused plane matches its iterated baseline. */
template <size_t BATCH>
bool benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD(const size_t& NUM_ELEMS_X, const size_t& NUM_ELEMS_Y, cudaStream_t stream) {
    // Labels consumed by the benchmark macros below.
    constexpr std::string_view FIRST_LABEL{ "Iterated Batch" };
    constexpr std::string_view SECOND_LABEL{ "Fused Batch" };
    std::stringstream error_s;
    bool passed = true;
    bool exception = false;

    using InputType = uchar;
    using OutputType = float;

    const InputType val_init = 10u;
    const OutputType val_alpha = 1.0f;
    const OutputType val_sub = 1.f;
    const OutputType val_div = 3.2f;
    try {
        const fk::Size cropSize(60, 120);
        fk::Ptr2D<InputType> d_input((int)NUM_ELEMS_Y, (int)NUM_ELEMS_X);
        fk::setTo(val_init, d_input, stream);
        std::array<fk::Ptr2D<OutputType>, BATCH> d_output_cv;
        std::array<fk::Ptr2D<OutputType>, BATCH> h_cvResults;
        // NOTE: removed unused local h_cvGSResults (never written or read).

        fk::Tensor<OutputType> d_tensor_output(cropSize.width, cropSize.height, BATCH);
        fk::Tensor<OutputType> h_tensor_output(cropSize.width, cropSize.height, BATCH, 1, fk::MemType::HostPinned);

        // Each crop starts at a different (crop_i, crop_i) offset so the batch
        // is not trivially uniform.
        std::array<fk::Ptr2D<InputType>, BATCH> crops;
        for (int crop_i = 0; crop_i < BATCH; crop_i++) {
            crops[crop_i] = d_input.crop(fk::Point(crop_i, crop_i),
                                         fk::PtrDims<fk::_2D>{static_cast<uint>(cropSize.width),
                                                              static_cast<uint>(cropSize.height),
                                                              static_cast<uint>(d_input.dims().pitch)});
            d_output_cv[crop_i].Alloc(cropSize, 0, fk::MemType::Device);
            h_cvResults[crop_i].Alloc(cropSize, 0, fk::MemType::HostPinned);
        }

        // Read Ops (built ahead of time so the timed loop only launches work)
        const auto read_array = fk::PerThreadRead<fk::_2D, InputType>::build_batch(crops);
        const auto read = fk::PerThreadRead<fk::_2D, InputType>::build(crops);

        // Compute Ops
        const auto saturate = fk::SaturateCast<InputType, OutputType>::build();
        const auto mul = fk::Mul<OutputType>::build(val_alpha);
        const auto sub = fk::Sub<OutputType>::build(val_sub);
        const auto div = fk::Div<OutputType>::build(val_div);

        // Write Ops
        const auto write_array = fk::PerThreadWrite<fk::_2D, OutputType>::build_batch(d_output_cv);
        const auto write = fk::TensorWrite<OutputType>::build(d_tensor_output);

        START_FIRST_BENCHMARK
        // Baseline: one launch per crop, using the pre-built operations.
        for (int crop_i = 0; crop_i < BATCH; crop_i++) {
            fk::executeOperations(stream, read_array[crop_i], saturate,
                                  mul, sub, div, write_array[crop_i]);
        }
        STOP_FIRST_START_SECOND_BENCHMARK
        // Fused: the whole batch in a single call.
        fk::executeOperations(stream, read, saturate, mul, sub, div, write);
        STOP_SECOND_BENCHMARK

        d_tensor_output.download(h_tensor_output, stream);

        // Verify results
        for (int crop_i = 0; crop_i < BATCH; crop_i++) {
            d_output_cv[crop_i].download(h_cvResults[crop_i], stream);
        }

        gpuErrchk(cudaStreamSynchronize(stream));

        for (int crop_i = 0; crop_i < BATCH; crop_i++) {
            fk::Ptr2D<OutputType> cvRes = h_cvResults[crop_i];
            fk::Ptr2D<OutputType> cvGSRes = h_tensor_output.getPlane(crop_i);
            bool passedThisTime = compareAndCheck(cvRes, cvGSRes);
            if (!passedThisTime) { std::cout << "Failed on crop idx=" << crop_i << std::endl; }
            passed &= passedThisTime;
        }
    } catch (const std::exception& e) {
        error_s << e.what();
        passed = false;
        exception = true;
    }

    if (!passed) {
        if (!exception) {
            std::cout << "benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD failed!! RESULT ERROR: Some results do not match baseline." << std::endl;
        } else {
            // Fixed: stray '>' removed so both failure messages share one format.
            std::cout << "benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD failed!! EXCEPTION: " << error_s.str() << std::endl;
        }
    }

    return passed;
}
216+
217+
// Runs benchmark_Horizontal_Fusion once per batch size in variableDimensionValues.
// seq only carries the index pack Is for deduction.
// Fixed: the previous (expr && ...) fold short-circuited, so the first failing
// experiment silently skipped all remaining batch sizes; the comma fold below
// runs every experiment while still accumulating failures into `passed`.
template <size_t... Is>
bool launch_benchmark_Horizontal_Fusion(const size_t& NUM_ELEMS_X, const size_t& NUM_ELEMS_Y, const std::index_sequence<Is...>& seq, cudaStream_t stream) {
    bool passed = true;

    ((passed &= benchmark_Horizontal_Fusion<variableDimensionValues[Is]>(NUM_ELEMS_X, NUM_ELEMS_Y, stream)), ...);

    return passed;
}
225+
226+
// Runs benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD once per batch size in
// variableDimensionValues. seq only carries the index pack Is for deduction.
// Fixed: the previous (expr && ...) fold short-circuited, so the first failing
// experiment silently skipped all remaining batch sizes; the comma fold below
// runs every experiment while still accumulating failures into `passed`.
template <size_t... Is>
bool launch_benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD(const size_t& NUM_ELEMS_X, const size_t& NUM_ELEMS_Y, const std::index_sequence<Is...>& seq, cudaStream_t stream) {
    bool passed = true;

    ((passed &= benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD<variableDimensionValues[Is]>(NUM_ELEMS_X, NUM_ELEMS_Y, stream)), ...);

    return passed;
}
234+
235+
// Entry point invoked by the test harness (tests/main.h): warms up both
// benchmark families, runs the measured passes, and reports overall status.
// Fixed: the measured runs' pass/fail results were previously discarded and 0
// was returned unconditionally; verification failures now reach the exit code.
int launch() {
    constexpr size_t NUM_ELEMS_X = 3840;
    constexpr size_t NUM_ELEMS_Y = 2160;
    cudaStream_t stream;
    gpuErrchk(cudaStreamCreate(&stream));

    // Warmup pass: primes the GPU/caches; results are intentionally ignored.
    warmup = true;
    launch_benchmark_Horizontal_Fusion(NUM_ELEMS_X, NUM_ELEMS_Y, std::make_index_sequence<NUM_EXPERIMENTS>{}, stream);
    launch_benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD(NUM_ELEMS_X, NUM_ELEMS_Y, std::make_index_sequence<NUM_EXPERIMENTS>{}, stream);
    warmup = false;

    // Measured pass: accumulate verification results.
    bool passed = true;
    passed &= launch_benchmark_Horizontal_Fusion(NUM_ELEMS_X, NUM_ELEMS_Y, std::make_index_sequence<NUM_EXPERIMENTS>{}, stream);
    passed &= launch_benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD(NUM_ELEMS_X, NUM_ELEMS_Y, std::make_index_sequence<NUM_EXPERIMENTS>{}, stream);

    gpuErrchk(cudaStreamDestroy(stream));

    // 0 on success (unchanged); non-zero signals a verification failure.
    return passed ? 0 : -1;
}

0 commit comments

Comments
 (0)