add vector size concept to stream benchmarks

MichalMrozek · Compute-Runtime-Automation · commit 3b7d71e2b7b5 · 2025-02-14T11:50:23.000+01:00
Signed-off-by: Michal Mrozek <michal.mrozek@intel.com>
diff --git a/TESTS.md b/TESTS.md
@@ -137,7 +137,7 @@ RemoteAccessMemoryMaxSaturation|Uses stream memory write to measure max data bus
 SLM_DataAccessLatency|generates SLM local memory transactions inside thread group to measure latency between reads (uses Intel only private intel_get_cycle_counter() )|<ul><li>--direction write or read mode (0 or 1)</li><li>--occupancyDiv H/W load divider by 8, 4, 2, full occupancy</li><li>--size SLM Size</li></ul>|:x:|:heavy_check_mark:|
 SlmSwitchLatency|Enqueues 2 kernels with different SLM size. Measures switch time between these kernels.|<ul><li>--firstSlmSize Size of the shared local memory per thread group. First kernel.</li><li>--secondSlmSize Size of the shared local memory per thread group. Second kernel.</li><li>--wgs Size of the work group.</li></ul>|:heavy_check_mark:|:x:|
 StreamAfterTransfer|Goal of this test is to measure how stream kernels perform right after host to device transfer populating the data. Test does clean caches, then emits transfers and then follows with stream kernel and measures GPU execution time of it.|<ul><li>--size Size of the memory to stream. Must be divisible by datatype size.</li><li>--type Memory streaming type (Read or Write or Scale or Triad)</li><li>--useEvents Perform GPU-side measurements using events (0 or 1)</li></ul>|:x:|:heavy_check_mark:|
-StreamMemory|Streams memory inside of kernel in a fashion described by 'type'. Copy means one memory location is read from and the second one is written to. Triad means two buffers are read and one is written to. In read and write memory is only read or written to.|<ul><li>--contents Buffer contents zeros/random (Zeros or Random)</li><li>--memoryPlacement Memory type used for stream (Device or Host or Shared or non-USM-mapped or non-USMmisaligned or non-USM4KBAligned or non-USM2MBAligned or non-USMmisaligned-imported or non-USM4KBAligned-imported or non-USM2MBAligned-imported)</li><li>--multiplier multiplies id used for accessing the resources to simulate partials</li><li>--size Size of the memory to stream. Must be divisible by datatype size.</li><li>--type Memory streaming type (Read or Write or Scale or Triad)</li><li>--useEvents Perform GPU-side measurements using events (0 or 1)</li></ul>|:heavy_check_mark:|:heavy_check_mark:|
+StreamMemory|Streams memory inside of kernel in a fashion described by 'type'. Copy means one memory location is read from and the second one is written to. Triad means two buffers are read and one is written to. In read and write memory is only read or written to.|<ul><li>--contents Buffer contents zeros/random (Zeros or Random)</li><li>--memoryPlacement Memory type used for stream (Device or Host or Shared or non-USM-mapped or non-USMmisaligned or non-USM4KBAligned or non-USM2MBAligned or non-USMmisaligned-imported or non-USM4KBAligned-imported or non-USM2MBAligned-imported)</li><li>--multiplier multiplies id used for accessing the resources to simulate partials</li><li>--size Size of the memory to stream. Must be divisible by datatype size.</li><li>--type Memory streaming type (Read or Write or Scale or Triad)</li><li>--useEvents Perform GPU-side measurements using events (0 or 1)</li><li>--vectorSize size of uint vector type 1/2/4/8/16</li></ul>|:heavy_check_mark:|:heavy_check_mark:|
 StreamMemoryImmediate|Streams memory inside of kernel in a fashion described by 'type' using immediate command list. Copy means one memory location is read from and the second one is written to. Triad means two buffers are read and one is written to. In read and write memory is only read or written to.|<ul><li>--size Size of the memory to stream. Must be divisible by datatype size.</li><li>--type Memory streaming type (Read or Write or Scale or Triad)</li><li>--useEvents Perform GPU-side measurements using events (0 or 1)</li></ul>|:heavy_check_mark:|:x:|
 UnmapBuffer|allocates an OpenCL buffer and measures unmap bandwidth. Unmapping operation meansmemory transfer from CPU to GPU or a no-op, depending on map flags.|<ul><li>--compressed Select if the buffer is to be compressed. Will be skipped, if device does not support compression (0 or 1)</li><li>--contents Contents of the buffer (Zeros or Random)</li><li>--mapFlags OpenCL map flags passed during memory mapping (Read or Write or WriteInvalidate)</li><li>--size Size of the buffer</li><li>--useEvents Perform GPU-side measurements using events (0 or 1)</li></ul>|:x:|:heavy_check_mark:|
 UsmConcurrentCopy|allocates four unified shared memory buffers, 2 in device memory and 2 in host memory. Measures concurrent copy bandwidth between them.|<ul><li>--d2hEngine Engine used for device to host copy (RCS or CCS0 or CCS1 or CCS2 or CCS3 or BCS or BCS1 or BCS2 or BCS3 or BCS4 or BCS5 or BCS6 or BCS7 or BCS8)</li><li>--h2dEngine Engine used for host to device copy (RCS or CCS0 or CCS1 or CCS2 or CCS3 or BCS or BCS1 or BCS2 or BCS3 or BCS4 or BCS5 or BCS6 or BCS7 or BCS8)</li><li>--size Size of the buffer</li></ul>|:heavy_check_mark:|:x:|
diff --git a/source/benchmarks/memory_benchmark/definitions/stream_memory.h b/source/benchmarks/memory_benchmark/definitions/stream_memory.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2022-2024 Intel Corporation
+ * Copyright (C) 2022-2025 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -20,14 +20,16 @@ struct StreamMemoryArguments : TestCaseArgumentContainer {
     BufferContentsArgument contents;
     UsmMemoryPlacementArgument memoryPlacement;
     PositiveIntegerArgument partialMultiplier;
+    PositiveIntegerArgument vectorSize;
 
     StreamMemoryArguments()
         : type(*this, "type", "Memory streaming type"),
           size(*this, "size", "Size of the memory to stream. Must be divisible by datatype size."),
           useEvents(*this, "useEvents", CommonHelpMessage::useEvents()),
           contents(*this, "contents", "Buffer contents zeros/random"),
           memoryPlacement(*this, "memoryPlacement", "Memory type used for stream"),
-          partialMultiplier(*this, "multiplier", "multiplies id used for accessing the resources to simulate partials") {}
+          partialMultiplier(*this, "multiplier", "multiplies id used for accessing the resources to simulate partials"),
+          vectorSize(*this, "vectorSize", "size of uint vector type 1/2/4/8/16") {}
 };
 
 struct StreamMemory : TestCase<StreamMemoryArguments> {
diff --git a/source/benchmarks/memory_benchmark/gtest/stream_memory.cpp b/source/benchmarks/memory_benchmark/gtest/stream_memory.cpp
@@ -15,7 +15,7 @@
 
 [[maybe_unused]] static const inline RegisterTestCase<StreamMemory> registerTestCase{};
 
-class StreamMemoryTest : public ::testing::TestWithParam<std::tuple<Api, StreamMemoryType, size_t, bool, BufferContents, UsmMemoryPlacement, size_t>> {
+class StreamMemoryTest : public ::testing::TestWithParam<std::tuple<Api, StreamMemoryType, size_t, bool, BufferContents, UsmMemoryPlacement, size_t, size_t>> {
 };
 
 TEST_P(StreamMemoryTest, Test) {
@@ -27,6 +27,7 @@ TEST_P(StreamMemoryTest, Test) {
     args.contents = std::get<4>(GetParam());
     args.memoryPlacement = std::get<5>(GetParam());
     args.partialMultiplier = std::get<6>(GetParam());
+    args.vectorSize = std::get<7>(GetParam());
 
     StreamMemory test;
     test.run(args);
@@ -43,29 +44,30 @@ INSTANTIATE_TEST_SUITE_P(
         ::testing::Values(false, true),
         ::testing::Values(BufferContents::Zeros, BufferContents::Random),
         ::testing::ValuesIn(UsmMemoryPlacementArgument::deviceAndHost),
-        ::testing::Values(1u)));
+        ::testing::Values(1u),
+        ::testing::Values(1, 2, 4)));
 
 INSTANTIATE_TEST_SUITE_P(
     StreamMemoryTestLIMITED,
     StreamMemoryTest,
     ::testing::ValuesIn([] {
-        std::vector<std::tuple<Api, StreamMemoryType, size_t, bool, BufferContents, UsmMemoryPlacement, size_t>> testCases;
-        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Read, 1 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Device, 1u);
-        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Read, 512 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Device, 1u);
-        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Read, 512 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Host, 1u);
-        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Read, 512 * megaByte, true, BufferContents::Zeros, UsmMemoryPlacement::Device, 1u);
-        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Read, 512 * megaByte, true, BufferContents::Zeros, UsmMemoryPlacement::Host, 1u);
-        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Scale, 512 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Device, 1u);
-        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Scale, 512 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Host, 1u);
-        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Scale, 512 * megaByte, true, BufferContents::Zeros, UsmMemoryPlacement::Device, 1u);
-        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Scale, 512 * megaByte, true, BufferContents::Zeros, UsmMemoryPlacement::Host, 1u);
-        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Triad, 512 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Device, 1u);
-        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Triad, 512 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Host, 1u);
-        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Triad, 512 * megaByte, true, BufferContents::Zeros, UsmMemoryPlacement::Device, 1u);
-        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Triad, 512 * megaByte, true, BufferContents::Zeros, UsmMemoryPlacement::Host, 1u);
-        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Write, 512 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Device, 1u);
-        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Write, 512 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Host, 1u);
-        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Write, 512 * megaByte, true, BufferContents::Zeros, UsmMemoryPlacement::Device, 1u);
-        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Write, 512 * megaByte, true, BufferContents::Zeros, UsmMemoryPlacement::Host, 1u);
+        std::vector<std::tuple<Api, StreamMemoryType, size_t, bool, BufferContents, UsmMemoryPlacement, size_t, size_t>> testCases;
+        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Read, 1 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Device, 1u, 1u);
+        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Read, 512 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Device, 1u, 1u);
+        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Read, 512 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Host, 1u, 1u);
+        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Read, 512 * megaByte, true, BufferContents::Zeros, UsmMemoryPlacement::Device, 1u, 1u);
+        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Read, 512 * megaByte, true, BufferContents::Zeros, UsmMemoryPlacement::Host, 1u, 1u);
+        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Scale, 512 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Device, 1u, 1u);
+        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Scale, 512 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Host, 1u, 1u);
+        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Scale, 512 * megaByte, true, BufferContents::Zeros, UsmMemoryPlacement::Device, 1u, 1u);
+        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Scale, 512 * megaByte, true, BufferContents::Zeros, UsmMemoryPlacement::Host, 1u, 1u);
+        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Triad, 512 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Device, 1u, 1u);
+        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Triad, 512 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Host, 1u, 1u);
+        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Triad, 512 * megaByte, true, BufferContents::Zeros, UsmMemoryPlacement::Device, 1u, 1u);
+        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Triad, 512 * megaByte, true, BufferContents::Zeros, UsmMemoryPlacement::Host, 1u, 1u);
+        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Write, 512 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Device, 1u, 1u);
+        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Write, 512 * megaByte, true, BufferContents::Random, UsmMemoryPlacement::Host, 1u, 1u);
+        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Write, 512 * megaByte, true, BufferContents::Zeros, UsmMemoryPlacement::Device, 1u, 1u);
+        testCases.emplace_back(Api::OpenCL, StreamMemoryType::Write, 512 * megaByte, true, BufferContents::Zeros, UsmMemoryPlacement::Host, 1u, 1u);
         return testCases;
     }()));
diff --git a/source/benchmarks/memory_benchmark/implementations/l0/stream_memory_l0.cpp b/source/benchmarks/memory_benchmark/implementations/l0/stream_memory_l0.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2022-2024 Intel Corporation
+ * Copyright (C) 2022-2025 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,9 @@ static TestResult run(const StreamMemoryArguments &arguments, Statistics &statis
     if (arguments.partialMultiplier > 1u) {
         return TestResult::NoImplementation;
     }
+    if (arguments.vectorSize > 1u) {
+        return TestResult::NoImplementation;
+    }
 
     if (isNoopRun()) {
         statistics.pushUnitAndType(typeSelector.getUnit(), typeSelector.getType());
diff --git a/source/benchmarks/memory_benchmark/implementations/ocl/stream_memory_ocl.cpp b/source/benchmarks/memory_benchmark/implementations/ocl/stream_memory_ocl.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2022-2024 Intel Corporation
+ * Copyright (C) 2022-2025 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,10 +42,9 @@ static TestResult run(const StreamMemoryArguments &arguments, Statistics &statis
     QueueProperties queueProperties = QueueProperties::create().setProfiling(true).setOoq(0);
     Opencl opencl(queueProperties);
     Timer timer;
-    bool useDoubles = opencl.getExtensions().areDoublesSupported();
 
-    size_t elementSize = useDoubles ? 8u : 4u;
-    const int64_t scalarValue = -999;
+    size_t elementSize = arguments.vectorSize * sizeof(uint32_t);
+    unsigned int scalarValue[16] = {9999999u};
     bool setScalarArgument = true;
     const bool printBuildInfo = true;
 
@@ -94,7 +93,11 @@ static TestResult run(const StreamMemoryArguments &arguments, Statistics &statis
 
     // Create kernel
     CompilerOptionsBuilder compilerOptions;
-    compilerOptions.addDefinitionKeyValue("STREAM_TYPE", useDoubles ? "double" : "float");
+    std::string streamType = "uint";
+    if (arguments.vectorSize > 1) {
+        streamType += std::to_string(arguments.vectorSize);
+    }
+    compilerOptions.addDefinitionKeyValue("STREAM_TYPE", streamType.c_str());
     const char *programName = "memory_benchmark_stream_memory.cl";
     cl_program program{};
     if (auto result = ProgramHelperOcl::buildProgramFromSourceFile(opencl.context, opencl.device, programName, compilerOptions.str().c_str(), program); result != TestResult::Success) {
diff --git a/source/benchmarks/memory_benchmark/kernels/memory_benchmark_stream_memory.cl b/source/benchmarks/memory_benchmark/kernels/memory_benchmark_stream_memory.cl
@@ -1,34 +1,24 @@
 /*
- * Copyright (C) 2022-2024 Intel Corporation
+ * Copyright (C) 2022-2025 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
  */
 
 // #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
-__kernel void readWithMultiplier(const __global STREAM_TYPE *restrict x, __global STREAM_TYPE *restrict dummyOutput, STREAM_TYPE scalar, int multiplier) {
+__kernel void readWithMultiplier(const __global volatile STREAM_TYPE *restrict x, __global STREAM_TYPE *restrict dummyOutput, STREAM_TYPE scalar, int multiplier) {
     int i = get_global_id(0);
     if(multiplier > 1){
         i = i * multiplier;
         if(i >= get_global_size(0)) return;
     }
     STREAM_TYPE value = x[i];
-
-    // A trick to ensure compiler won't optimize away the read
-    if (value == 0.37221) {
-        *dummyOutput = value;
-    }
 }
 
-__kernel void read(const __global STREAM_TYPE *restrict x, __global STREAM_TYPE *restrict dummyOutput, STREAM_TYPE scalar) {
+__kernel void read(const __global volatile STREAM_TYPE *restrict x, __global STREAM_TYPE *restrict dummyOutput, STREAM_TYPE scalar) {
     const int i = get_global_id(0);
     STREAM_TYPE value = x[i];
-
-    // A trick to ensure compiler won't optimize away the read
-    if (value == 0.37221) {
-        *dummyOutput = value;
-    }
 }
 
 __kernel void writeWithMultiplier(__global STREAM_TYPE *restrict x, STREAM_TYPE scalar, int multiplier) {
@@ -107,7 +97,7 @@ __kernel void remote_triad(const __global STREAM_TYPE *restrict x, const __globa
     z[g_id] = x[g_id] + y[g_id];
 }
 
-__kernel void remote_read(const __global STREAM_TYPE *restrict x, __global STREAM_TYPE *restrict dummyOutput, uint workItemGroupSize, const int remoteAccessFraction) {
+__kernel void remote_read(const __global volatile STREAM_TYPE *restrict x, __global STREAM_TYPE *restrict dummyOutput, uint workItemGroupSize, const int remoteAccessFraction) {
     int g_id = get_global_id(0);
     if (remoteAccessFraction != 0) {
         const size_t gws = get_global_size(0);
@@ -118,9 +108,6 @@ __kernel void remote_read(const __global STREAM_TYPE *restrict x, __global STREA
     }
 
     STREAM_TYPE value = x[g_id];
-    if (value == 37) {
-        *dummyOutput = value;
-    }
 }
 
 #ifdef ELEMENT_SIZE
@@ -200,7 +187,7 @@ __kernel void full_remote_block_read_xe_cores_distributed(const __global STREAM_
 }
 #endif
 
-__kernel void full_remote_scatter_read(const __global STREAM_TYPE *restrict x, __global STREAM_TYPE *restrict dummyOutput, const uint bufferLength, const uint iterations) {
+__kernel void full_remote_scatter_read(const __global volatile STREAM_TYPE *restrict x, __global STREAM_TYPE *restrict dummyOutput, const uint bufferLength, const uint iterations) {
     const uint gid = get_global_id(0);
     const size_t gws = get_global_size(0);
     // First half of workitems access memory starting from middle of the buffer
@@ -214,9 +201,6 @@ __kernel void full_remote_scatter_read(const __global STREAM_TYPE *restrict x, _
     for (uint i = 0; i < iterations; i++) {
         // Fold up calculated offset to prevent exceeding buffer length
         STREAM_TYPE value = x[startIndex + ((i * cachelineGap) & (bufferLength / 2 - 1))];
-        if (value == 33) {
-            *dummyOutput = value;
-        }
     }
 }
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1,5 +1,5 @@` |
| `1` | `1` | `/*` |
| `2` | | `- * Copyright (C) 2022-2024 Intel Corporation` |
| | `2` | `+ * Copyright (C) 2022-2025 Intel Corporation` |
| `3` | `3` | `*` |
| `4` | `4` | `* SPDX-License-Identifier: MIT` |
| `5` | `5` | `*` |
| | | `@@ -25,6 +25,9 @@ static TestResult run(const StreamMemoryArguments &arguments, Statistics &statis` |
| `25` | `25` | `if (arguments.partialMultiplier > 1u) {` |
| `26` | `26` | `return TestResult::NoImplementation;` |
| `27` | `27` | `}` |
| | `28` | `+ if (arguments.vectorSize > 1u) {` |
| | `29` | `+ return TestResult::NoImplementation;` |
| | `30` | `+ }` |
| `28` | `31` | |
| `29` | `32` | `if (isNoopRun()) {` |
| `30` | `33` | `statistics.pushUnitAndType(typeSelector.getUnit(), typeSelector.getType());` |