Skip to content

Commit 0c65ffa

Browse files
Add barrier benchmark
Multiple enqueues with barriers inbetween, wait for the event from last barrier. Related-To: NEO-8147 Signed-off-by: Dominik Dabek <[email protected]>
1 parent b329c12 commit 0c65ffa

File tree

4 files changed

+161
-0
lines changed

4 files changed

+161
-0
lines changed

TESTS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ CompletionLatency|enqueues system memory write and measures time between the mom
223223
CopySubmissionEvents|enqueues 4 byte copy to copy engine and return submission delta which is time between host API call and copy engine start|<ul><li>--engine Engine used for copying (RCS or CCS0 or CCS1 or CCS2 or CCS3 or BCS or BCS1 or BCS2 or BCS3 or BCS4 or BCS5 or BCS6 or BCS7 or BCS8)</li></ul>|:heavy_check_mark:|:heavy_check_mark:|
224224
EmptyKernel|enqueues empty kernel and measures time to launch it and wait for it on CPU, thus measuring walker spawn time.|<ul><li>--wgc Workgroup count</li><li>--wgs Workgroup size (aka local work size)</li></ul>|:heavy_check_mark:|:heavy_check_mark:|
225225
EmptyKernelImmediate|enqueues empty kernel and measures time to launch it using immediate command list and wait for it on CPU, thus measuring walker spawn time.|<ul><li>--UseEventForHostSync If true, use events to synchronize with host.If false, use zeCommandListHostSynchronize (0 or 1)</li><li>--wgc Workgroup count</li><li>--wgs Workgroup size (aka local work size)</li></ul>|:heavy_check_mark:|:x:|
226+
EnqueueBarrierWithEmptyWaitlist|enqueues kernel with barriers with empty waitlists inbetween, waiting on the last barriers event|<ul><li>--enqueueCount Number of enqueues</li><li>--outOfOrderQueue Use out of order queue (0 or 1)</li></ul>|:x:|:heavy_check_mark:|
226227
KernelSwitchLatency|measures time from end of one kernel till start of next kernel|<ul><li>--barrier synchronization with barrier instead of events (0 or 1)</li><li>--flush Flush between kernels (0 or 1)</li><li>--hostVisible events are with host visible flag (0 or 1)</li><li>--kernelCount Count of kernels</li><li>--kernelExecutionTime Approximately how long a single kernel executes, in us</li></ul>|:heavy_check_mark:|:heavy_check_mark:|
227228
KernelSwitchLatencyImmediate|measures time from end of one kernel till start of next kernel using immediate command lists|<ul><li>--barrier synchronization with barrier instead of events (0 or 1)</li><li>--hostVisible events are with host visible flag (0 or 1)</li><li>--kernelCount Count of kernels</li><li>--kernelExecutionTime Approximately how long a single kernel executes, in us</li></ul>|:heavy_check_mark:|:x:|
228229
KernelWithWork|measures time required to run a GPU kernel which assigns constant values to elements of a buffer. Each thread assigns one value.|<ul><li>--usedIds Which of the get_global_id() and get_local_id() calls will be used in the kernel (None or Global or Local or AtomicPerWkg)</li><li>--wgc Workgroup count</li><li>--wgs Workgroup size (aka local work size)</li></ul>|:heavy_check_mark:|:heavy_check_mark:|
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
* Copyright (C) 2023 Intel Corporation
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
*/
7+
8+
#pragma once
9+
10+
#include "framework/argument/basic_argument.h"
11+
#include "framework/test_case/test_case.h"
12+
#include "framework/utility/common_help_message.h"
13+
14+
struct EnqueueBarrierWithEmptyWaitlistArguments : TestCaseArgumentContainer {
15+
PositiveIntegerArgument enqueueCount;
16+
BooleanArgument outOfOrderQueue;
17+
18+
EnqueueBarrierWithEmptyWaitlistArguments()
19+
: enqueueCount(*this, "enqueueCount", "Number of enqueues"),
20+
outOfOrderQueue(*this, "outOfOrderQueue", "Use out of order queue") {}
21+
};
22+
23+
struct EnqueueBarrierWithEmptyWaitlist : TestCase<EnqueueBarrierWithEmptyWaitlistArguments> {
24+
using TestCase<EnqueueBarrierWithEmptyWaitlistArguments>::TestCase;
25+
26+
std::string getTestCaseName() const override {
27+
return "EnqueueBarrierWithEmptyWaitlist";
28+
}
29+
30+
std::string getHelp() const override {
31+
return "enqueues kernel with barriers with empty waitlists inbetween, waiting on the last barriers event";
32+
}
33+
};
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/*
2+
* Copyright (C) 2023 Intel Corporation
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
*/
7+
8+
#include "definitions/enqueue_barrier_with_empty_waitlist.h"
9+
10+
#include "framework/test_case/register_test_case.h"
11+
#include "framework/utility/common_gtest_args.h"
12+
13+
#include <gtest/gtest.h>
14+
15+
[[maybe_unused]] static const inline RegisterTestCase<EnqueueBarrierWithEmptyWaitlist> registerTestCase{};
16+
17+
class EnqueueBarrierWithEmptyWaitlistTest : public ::testing::TestWithParam<std::tuple<size_t, bool>> {
18+
};
19+
20+
TEST_P(EnqueueBarrierWithEmptyWaitlistTest, Test) {
21+
EnqueueBarrierWithEmptyWaitlistArguments args{};
22+
args.api = Api::OpenCL;
23+
args.enqueueCount = std::get<0>(GetParam());
24+
args.outOfOrderQueue = std::get<1>(GetParam());
25+
26+
EnqueueBarrierWithEmptyWaitlist test;
27+
test.run(args);
28+
}
29+
30+
INSTANTIATE_TEST_SUITE_P(
31+
EnqueueBarrierWithEmptyWaitlistTest,
32+
EnqueueBarrierWithEmptyWaitlistTest,
33+
::testing::Combine(
34+
::testing::Values(16, 32, 64),
35+
::testing::Bool()));
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
/*
2+
* Copyright (C) 2023 Intel Corporation
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
*/
7+
8+
#include "framework/ocl/opencl.h"
9+
#include "framework/ocl/utility/program_helper_ocl.h"
10+
#include "framework/test_case/register_test_case.h"
11+
#include "framework/utility/timer.h"
12+
13+
#include "definitions/enqueue_barrier_with_empty_waitlist.h"
14+
15+
#include <cstring>
16+
#include <gtest/gtest.h>
17+
18+
static TestResult run(const EnqueueBarrierWithEmptyWaitlistArguments &arguments, Statistics &statistics) {
19+
MeasurementFields typeSelector(MeasurementUnit::Microseconds, MeasurementType::Cpu);
20+
21+
if (isNoopRun()) {
22+
statistics.pushUnitAndType(typeSelector.getUnit(), typeSelector.getType());
23+
return TestResult::Nooped;
24+
}
25+
26+
// Setup
27+
QueueProperties queueProperties = QueueProperties::create().setOoq(arguments.outOfOrderQueue);
28+
Opencl opencl(queueProperties);
29+
Timer timer{};
30+
cl_int retVal{};
31+
32+
// Prepare data
33+
const size_t workgroupCount = 128;
34+
const size_t lws = 128;
35+
const size_t gws = lws * workgroupCount;
36+
const size_t enqueueCount = arguments.enqueueCount;
37+
if (enqueueCount == 0) {
38+
return TestResult::InvalidArgs;
39+
}
40+
41+
// Create kernel
42+
cl_program program = nullptr;
43+
const char *programName = "ulls_benchmark_eat_time.cl";
44+
const char *kernelName = "eat_time";
45+
if (auto result = ProgramHelperOcl::buildProgramFromSourceFile(opencl.context, opencl.device, programName, nullptr, program); result != TestResult::Success) {
46+
return result;
47+
}
48+
cl_kernel kernel = clCreateKernel(program, kernelName, &retVal);
49+
ASSERT_CL_SUCCESS(retVal);
50+
const cl_int operationsCount = 1;
51+
ASSERT_CL_SUCCESS(clSetKernelArg(kernel, 0, sizeof(size_t), &operationsCount));
52+
53+
// Warmup
54+
{
55+
cl_event event{};
56+
for (auto j = 0u; j < enqueueCount - 1; ++j) {
57+
ASSERT_CL_SUCCESS(clEnqueueNDRangeKernel(opencl.commandQueue, kernel, 1, nullptr, &gws, &lws, 0, nullptr, nullptr));
58+
ASSERT_CL_SUCCESS(clEnqueueBarrierWithWaitList(opencl.commandQueue, 0, nullptr, nullptr));
59+
}
60+
ASSERT_CL_SUCCESS(clEnqueueNDRangeKernel(opencl.commandQueue, kernel, 1, nullptr, &gws, &lws, 0, nullptr, nullptr));
61+
ASSERT_CL_SUCCESS(clEnqueueBarrierWithWaitList(opencl.commandQueue, 0, nullptr, &event));
62+
63+
ASSERT_CL_SUCCESS(clWaitForEvents(1, &event));
64+
ASSERT_CL_SUCCESS(clReleaseEvent(event));
65+
}
66+
67+
// Benchmark
68+
for (auto i = 0u; i < arguments.iterations; i++) {
69+
timer.measureStart();
70+
{
71+
cl_event event{};
72+
for (auto j = 0u; j < enqueueCount - 1; ++j) {
73+
ASSERT_CL_SUCCESS(clEnqueueNDRangeKernel(opencl.commandQueue, kernel, 1, nullptr, &gws, &lws, 0, nullptr, nullptr));
74+
ASSERT_CL_SUCCESS(clEnqueueBarrierWithWaitList(opencl.commandQueue, 0, nullptr, nullptr));
75+
}
76+
ASSERT_CL_SUCCESS(clEnqueueNDRangeKernel(opencl.commandQueue, kernel, 1, nullptr, &gws, &lws, 0, nullptr, nullptr));
77+
ASSERT_CL_SUCCESS(clEnqueueBarrierWithWaitList(opencl.commandQueue, 0, nullptr, &event));
78+
79+
ASSERT_CL_SUCCESS(clWaitForEvents(1, &event));
80+
ASSERT_CL_SUCCESS(clReleaseEvent(event));
81+
}
82+
timer.measureEnd();
83+
statistics.pushValue(timer.get(), typeSelector.getUnit(), typeSelector.getType());
84+
}
85+
86+
// Cleanup
87+
ASSERT_CL_SUCCESS(clReleaseKernel(kernel));
88+
ASSERT_CL_SUCCESS(clReleaseProgram(program));
89+
return TestResult::Success;
90+
}
91+
92+
static RegisterTestCaseImplementation<EnqueueBarrierWithEmptyWaitlist> registerTestCase(run, Api::OpenCL);

0 commit comments

Comments
 (0)