1+ /* Copyright 2025 Grup Mediapro S.L.U (Oscar Amoros Huguet)
2+
3+ Licensed under the Apache License, Version 2.0 (the "License");
4+ you may not use this file except in compliance with the License.
5+ You may obtain a copy of the License at
6+
7+ http://www.apache.org/licenses/LICENSE-2.0
8+
9+ Unless required by applicable law or agreed to in writing, software
10+ distributed under the License is distributed on an "AS IS" BASIS,
11+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ See the License for the specific language governing permissions and
13+ limitations under the License. */
14+
#include <tests/main.h>

#include <benchmarks/fkBenchmarksCommon.h>
#include <benchmarks/twoExecutionsBenchmark.h>

#include <fused_kernel/core/data/ptr_nd.cuh>
#include <fused_kernel/algorithms/algorithms.cuh>
#include <fused_kernel/fused_kernel.cuh>

#include <array>
#include <cstddef>
#include <iostream>
#include <sstream>
#include <string_view>
#include <utility>
23+
// Name of the dimension being swept across experiments; consumed by the
// benchmark reporting machinery (see benchmarks/twoExecutionsBenchmark.h).
constexpr char VARIABLE_DIMENSION_NAME[]{ "Batch size" };

// Batch sizes tested: FIRST_VALUE, FIRST_VALUE + INCREMENT, ... (NUM_EXPERIMENTS values).
constexpr size_t NUM_EXPERIMENTS = 10; // Used 100 in the paper
constexpr size_t FIRST_VALUE = 1;
constexpr size_t INCREMENT = 5;

// Compile-time array of batch sizes, one per experiment.
constexpr std::array<size_t, NUM_EXPERIMENTS> variableDimensionValues = arrayIndexSecuence<FIRST_VALUE, INCREMENT, NUM_EXPERIMENTS>;
31+
// Benchmarks horizontal (batch) kernel fusion. The same pipeline
// (saturate-cast -> mul -> sub -> div -> write) is timed twice:
//   1) "Iterated Batch": one kernel launch per crop (BATCH launches).
//   2) "Fused Batch":    a single batched launch writing all planes into one tensor.
// Afterwards every fused plane is compared element-wise against the iterated
// baseline to validate that fusion preserves the results.
//
// BATCH:       compile-time number of crops processed.
// NUM_ELEMS_X: width of the source image the crops are taken from.
// NUM_ELEMS_Y: height of the source image.
// stream:      CUDA stream on which all work is enqueued.
// Returns true when all results match and no exception was thrown.
template <size_t BATCH>
bool benchmark_Horizontal_Fusion(const size_t& NUM_ELEMS_X, const size_t& NUM_ELEMS_Y, cudaStream_t stream) {
    // Labels consumed by the START/STOP benchmark macros below.
    constexpr std::string_view FIRST_LABEL{ "Iterated Batch" };
    constexpr std::string_view SECOND_LABEL{ "Fused Batch" };
    std::stringstream error_s;
    bool passed = true;
    bool exception = false;

    using InputType = uchar;
    using OutputType = float;

    // Pipeline constants: input fill value and per-element arithmetic operands.
    const InputType val_init = 10u;
    const OutputType val_alpha = 1.0f;
    const OutputType val_sub = 1.f;
    const OutputType val_div = 3.2f;
    try {
        const fk::Size cropSize(60, 120);
        fk::Ptr2D<InputType> d_input((int)NUM_ELEMS_Y, (int)NUM_ELEMS_X);
        fk::setTo(val_init, d_input, stream);
        std::array<fk::Ptr2D<OutputType>, BATCH> d_output_cv;
        std::array<fk::Ptr2D<OutputType>, BATCH> h_cvResults;

        fk::Tensor<OutputType> d_tensor_output(cropSize.width, cropSize.height, BATCH);
        fk::Tensor<OutputType> h_tensor_output(cropSize.width, cropSize.height, BATCH, 1, fk::MemType::HostPinned);

        // Each crop is a view into d_input shifted by (crop_i, crop_i); the
        // source pitch is reused so no data is copied.
        std::array<fk::Ptr2D<InputType>, BATCH> crops;
        for (int crop_i = 0; crop_i < static_cast<int>(BATCH); crop_i++) {
            crops[crop_i] = d_input.crop(fk::Point(crop_i, crop_i), fk::PtrDims<fk::_2D>{static_cast<uint>(cropSize.width),
                                                                                         static_cast<uint>(cropSize.height),
                                                                                         static_cast<uint>(d_input.dims().pitch)});
            d_output_cv[crop_i].Alloc(cropSize, 0, fk::MemType::Device);
            h_cvResults[crop_i].Alloc(cropSize, 0, fk::MemType::HostPinned);
        }

        START_FIRST_BENCHMARK
        // Baseline: one fused-vertical kernel launch per crop.
        for (int crop_i = 0; crop_i < static_cast<int>(BATCH); crop_i++) {
            fk::executeOperations(crops[crop_i], stream,
                                  fk::SaturateCast<InputType, OutputType>::build(),
                                  fk::Mul<OutputType>::build(val_alpha),
                                  fk::Sub<OutputType>::build(val_sub),
                                  fk::Div<OutputType>::build(val_div),
                                  fk::PerThreadWrite<fk::_2D, OutputType>::build(d_output_cv[crop_i]));
        }
        STOP_FIRST_START_SECOND_BENCHMARK
        // Horizontally fused: a single batched launch over all crops at once.
        fk::executeOperations(crops, stream,
                              fk::SaturateCast<InputType, OutputType>::build(),
                              fk::Mul<OutputType>::build(val_alpha),
                              fk::Sub<OutputType>::build(val_sub),
                              fk::Div<OutputType>::build(val_div),
                              fk::TensorWrite<OutputType>::build(d_tensor_output));
        STOP_SECOND_BENCHMARK

        d_tensor_output.download(h_tensor_output, stream);

        // Verify results: bring the per-crop baseline outputs to the host too.
        for (int crop_i = 0; crop_i < static_cast<int>(BATCH); crop_i++) {
            d_output_cv[crop_i].download(h_cvResults[crop_i], stream);
        }

        // All downloads are async on `stream`; wait before reading host memory.
        gpuErrchk(cudaStreamSynchronize(stream));

        for (int crop_i = 0; crop_i < static_cast<int>(BATCH); crop_i++) {
            fk::Ptr2D<OutputType> cvRes = h_cvResults[crop_i];
            fk::Ptr2D<OutputType> cvGSRes = h_tensor_output.getPlane(crop_i);
            bool passedThisTime = compareAndCheck(cvRes, cvGSRes);
            if (!passedThisTime) { std::cout << "Failed on crop idx=" << crop_i << std::endl; }
            passed &= passedThisTime;
        }
    } catch (const std::exception& e) {
        error_s << e.what();
        passed = false;
        exception = true;
    }

    if (!passed) {
        if (!exception) {
            std::cout << "benchmark_Horizontal_Fusion failed!! RESULT ERROR: Some results do not match baseline." << std::endl;
        } else {
            std::cout << "benchmark_Horizontal_Fusion> failed!! EXCEPTION: " << error_s.str() << std::endl;
        }
    }

    return passed;
}
121+
// Same benchmark as benchmark_Horizontal_Fusion, but all operation objects
// (reads, compute stages, writes) are built BEFORE the timed region, so the
// measurement excludes the CPU-side cost of constructing the operation
// pipeline and isolates the GPU-side benefit of horizontal fusion.
//
// BATCH:       compile-time number of crops processed.
// NUM_ELEMS_X: width of the source image the crops are taken from.
// NUM_ELEMS_Y: height of the source image.
// stream:      CUDA stream on which all work is enqueued.
// Returns true when all fused results match the iterated baseline.
template <size_t BATCH>
bool benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD(const size_t& NUM_ELEMS_X, const size_t& NUM_ELEMS_Y, cudaStream_t stream) {
    // Labels consumed by the START/STOP benchmark macros below.
    constexpr std::string_view FIRST_LABEL{ "Iterated Batch" };
    constexpr std::string_view SECOND_LABEL{ "Fused Batch" };
    std::stringstream error_s;
    bool passed = true;
    bool exception = false;

    using InputType = uchar;
    using OutputType = float;

    // Pipeline constants: input fill value and per-element arithmetic operands.
    const InputType val_init = 10u;
    const OutputType val_alpha = 1.0f;
    const OutputType val_sub = 1.f;
    const OutputType val_div = 3.2f;
    try {
        const fk::Size cropSize(60, 120);
        fk::Ptr2D<InputType> d_input((int)NUM_ELEMS_Y, (int)NUM_ELEMS_X);
        fk::setTo(val_init, d_input, stream);
        std::array<fk::Ptr2D<OutputType>, BATCH> d_output_cv;
        std::array<fk::Ptr2D<OutputType>, BATCH> h_cvResults;

        fk::Tensor<OutputType> d_tensor_output(cropSize.width, cropSize.height, BATCH);
        fk::Tensor<OutputType> h_tensor_output(cropSize.width, cropSize.height, BATCH, 1, fk::MemType::HostPinned);

        // Each crop is a view into d_input shifted by (crop_i, crop_i); the
        // source pitch is reused so no data is copied.
        std::array<fk::Ptr2D<InputType>, BATCH> crops;
        for (int crop_i = 0; crop_i < static_cast<int>(BATCH); crop_i++) {
            crops[crop_i] = d_input.crop(fk::Point(crop_i, crop_i), fk::PtrDims<fk::_2D>{static_cast<uint>(cropSize.width),
                                                                                         static_cast<uint>(cropSize.height),
                                                                                         static_cast<uint>(d_input.dims().pitch)});
            d_output_cv[crop_i].Alloc(cropSize, 0, fk::MemType::Device);
            h_cvResults[crop_i].Alloc(cropSize, 0, fk::MemType::HostPinned);
        }

        // Build every operation up-front so the timed sections only measure
        // kernel launches, not pipeline construction.
        // Read Ops
        const auto read_array = fk::PerThreadRead<fk::_2D, InputType>::build_batch(crops);
        const auto read = fk::PerThreadRead<fk::_2D, InputType>::build(crops);

        // Compute Ops
        const auto saturate = fk::SaturateCast<InputType, OutputType>::build();
        const auto mul = fk::Mul<OutputType>::build(val_alpha);
        const auto sub = fk::Sub<OutputType>::build(val_sub);
        const auto div = fk::Div<OutputType>::build(val_div);

        // Write Ops
        const auto write_array = fk::PerThreadWrite<fk::_2D, OutputType>::build_batch(d_output_cv);
        const auto write = fk::TensorWrite<OutputType>::build(d_tensor_output);

        START_FIRST_BENCHMARK
        // Baseline: one kernel launch per crop, using the prebuilt operations.
        for (int crop_i = 0; crop_i < static_cast<int>(BATCH); crop_i++) {
            fk::executeOperations(stream, read_array[crop_i], saturate,
                                  mul, sub, div, write_array[crop_i]);
        }
        STOP_FIRST_START_SECOND_BENCHMARK
        // Horizontally fused: a single batched launch over all crops at once.
        fk::executeOperations(stream, read, saturate, mul, sub, div, write);
        STOP_SECOND_BENCHMARK

        d_tensor_output.download(h_tensor_output, stream);

        // Verify results: bring the per-crop baseline outputs to the host too.
        for (int crop_i = 0; crop_i < static_cast<int>(BATCH); crop_i++) {
            d_output_cv[crop_i].download(h_cvResults[crop_i], stream);
        }

        // All downloads are async on `stream`; wait before reading host memory.
        gpuErrchk(cudaStreamSynchronize(stream));

        for (int crop_i = 0; crop_i < static_cast<int>(BATCH); crop_i++) {
            fk::Ptr2D<OutputType> cvRes = h_cvResults[crop_i];
            fk::Ptr2D<OutputType> cvGSRes = h_tensor_output.getPlane(crop_i);
            bool passedThisTime = compareAndCheck(cvRes, cvGSRes);
            if (!passedThisTime) { std::cout << "Failed on crop idx=" << crop_i << std::endl; }
            passed &= passedThisTime;
        }
    } catch (const std::exception& e) {
        error_s << e.what();
        passed = false;
        exception = true;
    }

    if (!passed) {
        if (!exception) {
            std::cout << "benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD failed!! RESULT ERROR: Some results do not match baseline." << std::endl;
        } else {
            std::cout << "benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD> failed!! EXCEPTION: " << error_s.str() << std::endl;
        }
    }

    return passed;
}
216+
// Expands the index sequence into one benchmark_Horizontal_Fusion
// instantiation per configured batch size. The && fold short-circuits:
// once one experiment fails, the remaining ones are skipped.
template <size_t... Is>
bool launch_benchmark_Horizontal_Fusion(const size_t& NUM_ELEMS_X, const size_t& NUM_ELEMS_Y, const std::index_sequence<Is...>& seq, cudaStream_t stream) {
    return (benchmark_Horizontal_Fusion<variableDimensionValues[Is]>(NUM_ELEMS_X, NUM_ELEMS_Y, stream) && ...);
}
225+
// Expands the index sequence into one benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD
// instantiation per configured batch size. The && fold short-circuits:
// once one experiment fails, the remaining ones are skipped.
template <size_t... Is>
bool launch_benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD(const size_t& NUM_ELEMS_X, const size_t& NUM_ELEMS_Y, const std::index_sequence<Is...>& seq, cudaStream_t stream) {
    return (benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD<variableDimensionValues[Is]>(NUM_ELEMS_X, NUM_ELEMS_Y, stream) && ...);
}
234+
// Entry point invoked by the test harness (tests/main.h). Runs every
// experiment twice on a dedicated stream: a warmup pass (results discarded)
// followed by the measured pass.
int launch() {
    // 4K source image dimensions.
    constexpr size_t NUM_ELEMS_X = 3840;
    constexpr size_t NUM_ELEMS_Y = 2160;

    cudaStream_t stream;
    gpuErrchk(cudaStreamCreate(&stream));

    const auto experiments = std::make_index_sequence<NUM_EXPERIMENTS>{};

    // Warmup pass: prime the GPU/caches so the measured pass is stable.
    warmup = true;
    launch_benchmark_Horizontal_Fusion(NUM_ELEMS_X, NUM_ELEMS_Y, experiments, stream);
    launch_benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD(NUM_ELEMS_X, NUM_ELEMS_Y, experiments, stream);
    warmup = false;

    // Measured pass.
    launch_benchmark_Horizontal_Fusion(NUM_ELEMS_X, NUM_ELEMS_Y, experiments, stream);
    launch_benchmark_Horizontal_Fusion_NO_CPU_OVERHEAD(NUM_ELEMS_X, NUM_ELEMS_Y, experiments, stream);

    gpuErrchk(cudaStreamDestroy(stream));

    return 0;
}