Skip to content

Commit 9369275

Browse files
pbalcer, Compute-Runtime-Automation
authored and committed
add support for normal cmdqueue dispatch for graphs
Signed-off-by: Piotr Balcer <[email protected]>
1 parent 7b34d81 commit 9369275

23 files changed

+1004
-336
lines changed

TESTS.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,14 @@ WriteTimestamp|measures time required to write a timestamp on GPU.|<ul><li>--mea
111111

112112

113113

114+
# graph_api_benchmark
115+
Graph Api Overhead Benchmark is a set of tests aimed at measuring CPU-side execution duration of SYCL Graphs API calls.
116+
| Test name | Description | Params | L0 | OCL |
117+
|-----------|-------------|--------|----|-----|
118+
SinKernelGraph|Benchmark running memory copy and kernel runs, with graphs and without graphs|<ul><li>--immediateAppendCmdList Use zeCommandListImmediateAppendCommandListsExp to submit graph (only valid for L0) (0 or 1)</li><li>--numKernels Number of kernel invocations</li><li>--withCopyOffload Enable driver copy offload (only valid for L0) (0 or 1)</li><li>--withGraphs Runs with or without graphs (0 or 1)</li></ul>|:heavy_check_mark:|:x:|
119+
120+
121+
114122
# memory_benchmark
115123
Memory Benchmark is a set of tests aimed at measuring bandwidth of memory transfers.
116124
| Test name | Description | Params | L0 | OCL |

source/benchmarks/graph_api_benchmark/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@
44
# SPDX-License-Identifier: MIT
55
#
66

7-
add_benchmark(graph_api_benchmark sycl all)
7+
add_benchmark(graph_api_benchmark sycl l0 ur all)

source/benchmarks/graph_api_benchmark/definitions/sin_kernel_graph.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,23 @@
99

1010
#include "framework/argument/basic_argument.h"
1111
#include "framework/test_case/test_case.h"
12+
#include "framework/test_case/test_result.h"
13+
#include "framework/utility/timer.h"
14+
15+
#include <math.h>
16+
#include <random>
1217

1318
struct SinKernelGraphArguments : TestCaseArgumentContainer {
1419
PositiveIntegerArgument numKernels;
1520
BooleanArgument withGraphs;
21+
BooleanArgument withCopyOffload;
22+
BooleanArgument immediateAppendCmdList;
1623

1724
SinKernelGraphArguments()
1825
: numKernels(*this, "numKernels", "Number of kernel invocations"),
19-
withGraphs(*this, "withGraphs", "Runs with or without graphs") {}
26+
withGraphs(*this, "withGraphs", "Runs with or without graphs"),
27+
withCopyOffload(*this, "withCopyOffload", "Enable driver copy offload (only valid for L0)"),
28+
immediateAppendCmdList(*this, "immediateAppendCmdList", "Use zeCommandListImmediateAppendCommandListsExp to submit graph (only valid for L0)") {}
2029
};
2130

2231
struct SinKernelGraph : TestCase<SinKernelGraphArguments> {
@@ -27,6 +36,6 @@ struct SinKernelGraph : TestCase<SinKernelGraphArguments> {
2736
}
2837

2938
std::string getHelp() const override {
30-
return "Benchmark calling sycl::sin kernel & doing mem alloc/dealloc, with graphs and without graphs";
39+
return "Benchmark running memory copy and kernel runs, with graphs and without graphs";
3140
}
3241
};
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
/*
2+
* Copyright (C) 2024-2025 Intel Corporation
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
*/
7+
8+
#pragma once
9+
10+
#include "framework/benchmark_info.h"
11+
12+
#include "sin_kernel_graph.h"
13+
14+
#include <cmath>
15+
#include <functional>
16+
#include <iostream>
17+
#include <memory>
18+
#include <random>
19+
#include <vector>
20+
21+
class SinKernelGraphBase {
22+
public:
23+
SinKernelGraphBase(const SinKernelGraphArguments &arguments)
24+
: numKernels(arguments.numKernels), size(65536), withGraphs(arguments.withGraphs), withCopyOffload(arguments.withCopyOffload), immediateAppendCmdList(arguments.immediateAppendCmdList), iterations(arguments.iterations), engine(0), distribution(-10.0, 10.0){};
25+
26+
using DataFloatPtr = std::unique_ptr<float, std::function<void(float *)>>;
27+
28+
virtual DataFloatPtr allocDevice(uint32_t count) = 0;
29+
virtual DataFloatPtr allocHost(uint32_t count) = 0;
30+
31+
virtual TestResult init() = 0;
32+
33+
virtual TestResult recordGraph() = 0;
34+
virtual TestResult readResults(float *output_h) = 0;
35+
36+
virtual TestResult runGraph(float *input_h) = 0;
37+
virtual TestResult runEager(float *input_h) = 0;
38+
virtual TestResult waitCompletion() = 0;
39+
40+
TestResult calcRefResults(float *input_h, float *golden_h) {
41+
std::vector<float> buffer0(size);
42+
std::vector<float> buffer1(size);
43+
44+
// assign action
45+
for (uint32_t i = 0; i < size; ++i)
46+
buffer0[i] = input_h[i];
47+
48+
// repeat sin action
49+
for (size_t k = 0; k < numKernels; ++k) {
50+
std::swap(buffer0, buffer1);
51+
for (uint32_t i = 0; i < size; ++i)
52+
buffer0[i] = sin(buffer1[i]);
53+
}
54+
55+
if (numKernels % 2 != 0)
56+
std::swap(buffer0, buffer1);
57+
58+
for (uint32_t i = 0; i < size; ++i)
59+
golden_h[i] = buffer0[i];
60+
61+
return TestResult::Success;
62+
}
63+
64+
bool checkResults(float *output_h, float *golden_h) {
65+
bool ret = true;
66+
for (uint32_t idx = 0; idx < size; ++idx) {
67+
if ((fabs(output_h[idx] - golden_h[idx]) > 0.00001f) ||
68+
(output_h[idx]) == 0.0f) {
69+
ret = false;
70+
std::cout << "at (" << idx << "), expected " << golden_h[idx]
71+
<< ", but got " << output_h[idx] << std::endl;
72+
return ret;
73+
}
74+
}
75+
return ret;
76+
}
77+
78+
float randFloat() {
79+
return distribution(engine);
80+
}
81+
82+
TestResult run(
83+
Statistics &statistics) {
84+
MeasurementFields typeSelector(MeasurementUnit::Microseconds,
85+
MeasurementType::Cpu);
86+
if (isNoopRun()) {
87+
statistics.pushUnitAndType(typeSelector.getUnit(),
88+
typeSelector.getType());
89+
return TestResult::Nooped;
90+
}
91+
92+
init();
93+
94+
TestResult result = TestResult::Success;
95+
96+
Timer timer;
97+
98+
DataFloatPtr inputData = allocHost(size);
99+
DataFloatPtr refResult = allocHost(size);
100+
DataFloatPtr outputData = allocHost(size);
101+
102+
graphInputData = allocDevice(size);
103+
graphOutputData = allocDevice(size);
104+
105+
for (uint32_t i = 0; i < size; ++i) {
106+
inputData.get()[i] = randFloat();
107+
}
108+
109+
// reference results
110+
ASSERT_TEST_RESULT_SUCCESS(calcRefResults(inputData.get(), refResult.get()));
111+
112+
if (withGraphs)
113+
ASSERT_TEST_RESULT_SUCCESS(recordGraph());
114+
115+
// warm-up & results verification
116+
{
117+
if (withGraphs) {
118+
ASSERT_TEST_RESULT_SUCCESS(runGraph(inputData.get()));
119+
} else {
120+
ASSERT_TEST_RESULT_SUCCESS(runEager(inputData.get()));
121+
}
122+
ASSERT_TEST_RESULT_SUCCESS(waitCompletion());
123+
ASSERT_TEST_RESULT_SUCCESS(readResults(outputData.get()));
124+
125+
// if results don't match, fail the benchmark
126+
if (!checkResults(outputData.get(), refResult.get())) {
127+
std::cout << "Check FAILED" << std::endl;
128+
return TestResult::Error;
129+
}
130+
}
131+
132+
for (uint32_t i = 0; i < iterations; ++i) {
133+
timer.measureStart();
134+
135+
if (withGraphs) {
136+
ASSERT_TEST_RESULT_SUCCESS(runGraph(inputData.get()));
137+
} else {
138+
ASSERT_TEST_RESULT_SUCCESS(runEager(inputData.get()));
139+
}
140+
141+
ASSERT_TEST_RESULT_SUCCESS(waitCompletion());
142+
143+
timer.measureEnd();
144+
statistics.pushValue(timer.get(), typeSelector.getUnit(),
145+
typeSelector.getType());
146+
}
147+
148+
return result;
149+
}
150+
151+
~SinKernelGraphBase() = default;
152+
153+
protected:
154+
size_t numKernels;
155+
uint32_t size;
156+
bool withGraphs;
157+
bool withCopyOffload;
158+
bool immediateAppendCmdList;
159+
160+
size_t iterations;
161+
162+
float pattern = 123.4567f;
163+
164+
// device memory
165+
DataFloatPtr graphInputData;
166+
DataFloatPtr graphOutputData;
167+
168+
std::mt19937 engine;
169+
std::uniform_real_distribution<float> distribution;
170+
};

source/benchmarks/graph_api_benchmark/gtest/sin_kernel_graph.cpp

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,22 +14,26 @@
1414

1515
[[maybe_unused]] static const inline RegisterTestCase<SinKernelGraph> registerTestCase{};
1616

17-
class SinKernelGraphTest : public ::testing::TestWithParam<std::tuple<std::size_t, bool>> {
18-
};
17+
class SinKernelGraphTest
18+
: public ::testing::TestWithParam<std::tuple<Api, uint32_t, bool, bool, bool>> {};
1919

2020
TEST_P(SinKernelGraphTest, Test) {
2121
SinKernelGraphArguments args{};
22-
args.api = Api::SYCL;
23-
args.numKernels = std::get<0>(GetParam());
24-
args.withGraphs = std::get<1>(GetParam());
22+
args.api = std::get<0>(GetParam());
23+
args.numKernels = std::get<1>(GetParam());
24+
args.immediateAppendCmdList = std::get<2>(GetParam());
25+
args.withCopyOffload = std::get<3>(GetParam());
26+
args.withGraphs = std::get<4>(GetParam());
2527

2628
SinKernelGraph test;
2729
test.run(args);
2830
}
2931

3032
INSTANTIATE_TEST_SUITE_P(
31-
SinKernelGraphTest,
32-
SinKernelGraphTest,
33-
::testing::Combine(
34-
::testing::Values(20, 50, 100, 500),
35-
::testing::Values(false, true)));
33+
SinKernelGraphTest, SinKernelGraphTest,
34+
::testing::Combine(::testing::Values(Api::SYCL, Api::UR, Api::L0),
35+
::testing::Values(3, 10, 50, 100),
36+
// FIXME: immediateAppendCmdList is currently broken, add 'true' to enable it once the driver is fixed.
37+
::testing::Values(false),
38+
::testing::Values(false, true),
39+
::testing::Values(false, true)));

source/benchmarks/graph_api_benchmark/gtest/submit_exec_graph.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,5 +33,5 @@ INSTANTIATE_TEST_SUITE_P(
3333
SubmitExecGraphTest,
3434
::testing::Combine(
3535
::testing::Values(false, true),
36-
::testing::Values(50, 100, 500),
36+
::testing::Values(20, 100),
3737
::testing::Values(false, true)));
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/*
2+
* Copyright (C) 2024-2025 Intel Corporation
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
*/
7+
8+
#include "framework/test_case/register_test_case.h"
9+
10+
#include "definitions/sin_kernel_graph.h"
11+
#include "sin_kernel_impl_l0.h"
12+
13+
// Level Zero entry point for the SinKernelGraph test case: builds the L0
// implementation from the parsed arguments and returns the result of its run().
static TestResult run(const SinKernelGraphArguments &arguments,
                      Statistics &statistics) {
    SinKernelGraphL0 benchmark(arguments);
    return benchmark.run(statistics);
}
18+
19+
// File-local registration object constructed with run() and Api::L0 for the
// SinKernelGraph test case.
[[maybe_unused]] static RegisterTestCaseImplementation<SinKernelGraph> registerTestCase(run, Api::L0);

0 commit comments

Comments
 (0)