Skip to content

Commit d2eeab2

Browse files
authored
Merge pull request #2475 from VeenaGayathri12/development
Add Sample for PSTL Functionality Demonstrating Performance of Various C++ STL Algorithms
2 parents 70aa25f + 6fe28dc commit d2eeab2

File tree

3 files changed

+214
-4
lines changed

3 files changed

+214
-4
lines changed
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
cmake_minimum_required(VERSION 3.12)

# The compiler has to be selected before project(), because project() is the
# call that detects and caches the toolchain. A compiler supplied on the
# command line (-DCMAKE_CXX_COMPILER=...) is respected; otherwise default to
# the Intel oneAPI DPC++/C++ driver.
if(NOT DEFINED CMAKE_CXX_COMPILER)
  set(CMAKE_CXX_COMPILER icpx)
endif()

project(ParSTLTests)

# main.cpp uses <execution> policies and std::reduce, which need C++17.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

find_package(TBB REQUIRED)

if(GPU)
  #To build for Intel® Data Center GPU Max 1550 or 1100
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -fsycl-pstl-offload=gpu -D GPU")
endif()
if(CPU)
  #To build for Intel® UHD Graphics, Intel® Gen9, Gen11, Xeon CPU
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D CPU ")
endif()

add_executable(ParSTLTest main.cpp)

# Link TBB through the imported target that find_package(TBB) provides rather
# than a raw "-ltbb" in CMAKE_EXE_LINKER_FLAGS: the target carries the correct
# library path and is placed after the object files on the link line.
target_link_libraries(ParSTLTest PRIVATE TBB::tbb)
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
2+
3+
#include <algorithm>
4+
#include <chrono>
5+
#include <execution>
6+
#include <iostream>
7+
#include <numeric>
8+
#include <vector>
9+
// Runs `func` exactly once, measures its wall-clock duration, and prints a
// single report line to std::cout in the form:
//
//   <title>: <elapsed milliseconds> ms, res <value returned by func>
//
// title: label printed at the start of the report line.
// func:  callable taking no arguments; its return value must be streamable
//        to std::ostream (it is printed after "res").
template <typename TFunc>
void RunAndMeasure(const char* title, TFunc func) {
  // steady_clock is monotonic, so the interval cannot be distorted by
  // system clock adjustments made while the workload is running.
  const auto start = std::chrono::steady_clock::now();
  const auto ret = func();
  const auto end = std::chrono::steady_clock::now();

  // Report fractional milliseconds rather than an integer tick count.
  const std::chrono::duration<double, std::milli> elapsed_ms = end - start;
  std::cout << title << ": " << elapsed_ms.count() << " ms, res " << ret
            << "\n";
}
18+
19+
// Benchmarks a selection of standard algorithms under the seq, par and
// par_unseq execution policies and prints one timing/result line per run.
//
// Working set: every vector below holds kElementCount doubles (about 8 GB
// each), so the machine needs enough memory for four of them at once.
int main() {
  // Single source of truth for the problem size.
  constexpr std::vector<double>::size_type kElementCount = 1024000000;

  // v:      constant input (all 0.5) used by most algorithms.
  // result: pre-allocated output buffer shared by copy_if and inclusive_scan,
  //         so that no allocation happens inside a timed region.
  // v1:     strictly increasing input (1.0, 2.0, ...) for the min/max and
  //         bound searches.
  std::vector<double> v(kElementCount, 0.5);
  std::vector<double> result(v.size());

  std::vector<double> v1(kElementCount);
  std::iota(v1.begin(), v1.end(), 1.0);

  // Untimed-in-spirit first pass: touches all of v so later runs are not
  // charged for first-access costs.
  RunAndMeasure("std::warm up", [&v] {
    return std::reduce(std::execution::seq, v.begin(), v.end(), 0.0);
  });

  // --- summation -----------------------------------------------------------
  RunAndMeasure("std::accumulate",
                [&v] { return std::accumulate(v.begin(), v.end(), 0.0); });

  RunAndMeasure("std::reduce, seq", [&v] {
    return std::reduce(std::execution::seq, v.begin(), v.end(), 0.0);
  });

  RunAndMeasure("std::reduce, par", [&v] {
    return std::reduce(std::execution::par, v.begin(), v.end(), 0.0);
  });

  RunAndMeasure("std::reduce, par_unseq", [&v] {
    return std::reduce(std::execution::par_unseq, v.begin(), v.end(), 0.0);
  });

  // --- find: 0.6 is absent, so every run scans the whole range -------------
  RunAndMeasure("std::find, seq", [&v] {
    const auto pos = std::find(std::execution::seq, std::begin(v), std::end(v), 0.6);
    return pos == std::end(v) ? 0.0 : 1.0;
  });

  RunAndMeasure("std::find, par", [&v] {
    const auto pos = std::find(std::execution::par, std::begin(v), std::end(v), 0.6);
    return pos == std::end(v) ? 0.0 : 1.0;
  });

  RunAndMeasure("std::find, par_unseq", [&v] {
    const auto pos = std::find(std::execution::par_unseq, std::begin(v), std::end(v), 0.6);
    return pos == std::end(v) ? 0.0 : 1.0;
  });

  // --- copy_if: reports how many elements were copied ----------------------
  RunAndMeasure("std::copy_if, seq", [&v, &result] {
    const auto new_end = std::copy_if(std::execution::seq, v.begin(), v.end(), result.begin(),
                                      [](double value) { return value > 0.4; });
    return std::distance(result.begin(), new_end);
  });

  RunAndMeasure("std::copy_if, par", [&v, &result] {
    const auto new_end = std::copy_if(std::execution::par, v.begin(), v.end(), result.begin(),
                                      [](double value) { return value > 0.4; });
    return std::distance(result.begin(), new_end);
  });

  RunAndMeasure("std::copy_if, par_unseq", [&v, &result] {
    const auto new_end = std::copy_if(std::execution::par_unseq, v.begin(), v.end(), result.begin(),
                                      [](double value) { return value > 0.4; });
    return std::distance(result.begin(), new_end);
  });

  // --- inclusive_scan: writes into the pre-allocated `result` buffer so the
  // timing covers the scan itself, not an 8 GB allocation + zero-fill -------
  RunAndMeasure("std::inclusive_scan, seq", [&v, &result] {
    std::inclusive_scan(std::execution::seq, v.begin(), v.end(), result.begin());
    return result.back();
  });

  RunAndMeasure("std::inclusive_scan, par", [&v, &result] {
    std::inclusive_scan(std::execution::par, v.begin(), v.end(), result.begin());
    return result.back();
  });

  RunAndMeasure("std::inclusive_scan, par_unseq", [&v, &result] {
    std::inclusive_scan(std::execution::par_unseq, v.begin(), v.end(), result.begin());
    return result.back();
  });

  // --- min / max / minmax on the increasing sequence -----------------------
  // v1 is non-empty, so the returned iterators are always dereferenceable.
  RunAndMeasure("std::min_element, seq", [&v1] {
    return *std::min_element(std::execution::seq, v1.begin(), v1.end());
  });

  RunAndMeasure("std::min_element, par", [&v1] {
    return *std::min_element(std::execution::par, v1.begin(), v1.end());
  });

  RunAndMeasure("std::min_element, par_unseq", [&v1] {
    return *std::min_element(std::execution::par_unseq, v1.begin(), v1.end());
  });

  RunAndMeasure("std::max_element, seq", [&v1] {
    return *std::max_element(std::execution::seq, v1.begin(), v1.end());
  });

  RunAndMeasure("std::max_element, par", [&v1] {
    return *std::max_element(std::execution::par, v1.begin(), v1.end());
  });

  RunAndMeasure("std::max_element, par_unseq", [&v1] {
    return *std::max_element(std::execution::par_unseq, v1.begin(), v1.end());
  });

  RunAndMeasure("std::minmax_element, seq", [&v1] {
    const auto extremes = std::minmax_element(std::execution::seq, v1.begin(), v1.end());
    return *extremes.first + *extremes.second;
  });

  RunAndMeasure("std::minmax_element, par", [&v1] {
    const auto extremes = std::minmax_element(std::execution::par, v1.begin(), v1.end());
    return *extremes.first + *extremes.second;
  });

  RunAndMeasure("std::minmax_element, par_unseq", [&v1] {
    const auto extremes = std::minmax_element(std::execution::par_unseq, v1.begin(), v1.end());
    return *extremes.first + *extremes.second;
  });

  // --- is_partitioned: every element satisfies the predicate ---------------
  RunAndMeasure("std::is_partitioned, seq", [&v] {
    return std::is_partitioned(std::execution::seq, v.begin(), v.end(), [](double n) { return n < 1.0; });
  });

  RunAndMeasure("std::is_partitioned, par", [&v] {
    return std::is_partitioned(std::execution::par, v.begin(), v.end(), [](double n) { return n < 1.0; });
  });

  RunAndMeasure("std::is_partitioned, par_unseq", [&v] {
    return std::is_partitioned(std::execution::par_unseq, v.begin(), v.end(), [](double n) { return n < 1.0; });
  });

  // --- lexicographical_compare against an identical sequence ---------------
  // The second operand is built once, outside the timed lambdas; equal
  // ranges force a full-length comparison in every run.
  {
    const std::vector<double> v2(kElementCount, 0.5);

    RunAndMeasure("std::lexicographical_compare, seq", [&v, &v2] {
      return std::lexicographical_compare(std::execution::seq, v.begin(), v.end(), v2.begin(), v2.end());
    });

    RunAndMeasure("std::lexicographical_compare, par", [&v, &v2] {
      return std::lexicographical_compare(std::execution::par, v.begin(), v.end(), v2.begin(), v2.end());
    });

    RunAndMeasure("std::lexicographical_compare, par_unseq", [&v, &v2] {
      return std::lexicographical_compare(std::execution::par_unseq, v.begin(), v.end(), v2.begin(), v2.end());
    });
  }

  // --- sorted-range searches (no execution-policy overloads exist) ---------
  RunAndMeasure("std::binary_search", [&v] {
    return std::binary_search(v.begin(), v.end(), 0.5);
  });

  // 0.5 is smaller than every element of v1 (which starts at 1.0), so both
  // bounds resolve to v1.begin() and the dereference is valid.
  RunAndMeasure("std::lower_bound", [&v1] {
    return *std::lower_bound(v1.begin(), v1.end(), 0.5);
  });

  RunAndMeasure("std::upper_bound", [&v1] {
    return *std::upper_bound(v1.begin(), v1.end(), 0.5);
  });

  return 0;
}

Libraries/oneDPL/pSTL_offload/README.md

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,13 @@ The `pSTL_offload` sample demonstrates the offloading of C++ standard parallel a
1414

1515
Offloading the C++ standard parallel STL code (par-unseq policy) to GPU and CPU without any code changes when using the `-fsycl-pstl-offload` compiler option with the Intel® oneAPI DPC++/C++ Compiler. It is an experimental feature of oneDPL.
1616

17-
This folder contains two sample examples in the following folders:
17+
This folder contains three sample examples in the following folders:
1818

1919
| Folder Name | Description
2020
|:--- |:---
2121
| `FileWordCount` | Counting Words in Files Example
2222
| `WordCount` | Counting Words generated Example
23+
| `ParSTLTests` | Examples of Various STL Algorithms with Execution Policies
2324

2425
> **Note**: For more information refer to [Get Started with Parallel STL](https://www.intel.com/content/www/us/en/developer/articles/guide/get-started-with-parallel-stl.html).
2526
@@ -34,8 +35,8 @@ This folder contains two sample examples in the following folders:
3435

3536
## Key Implementation Details
3637

37-
The example includes two samples `FileWordCount` and `WordCount` which count the number of words in files and the number of words generated respectively using the standard C++17 Parallel Algorithm [transfor_reduce](https://en.cppreference.com/w/cpp/algorithm/transform_reduce). This computation can be offloaded to the GPU device with the help of `-fsycl-pstl-offload` compiler option and standard <algorithm> header inclusion is explicitly required for PSTL Offload to work.
38-
FileWordCount sample also demonstrates the use of transform, copy, copy_if, and for_each standard C++17 Parallel Algorithms.
38+
The example includes three samples: `FileWordCount`, `WordCount`, and `ParSTLTests`. `FileWordCount` and `WordCount` count the number of words in files and the number of words generated, respectively, using the standard C++17 Parallel Algorithm [transform_reduce](https://en.cppreference.com/w/cpp/algorithm/transform_reduce). `ParSTLTests` demonstrates the use of various STL algorithms with different execution policies (seq, par, par_unseq). It applies these algorithms to large datasets and prints the results for each execution. This computation can be offloaded to the GPU device with the help of the `-fsycl-pstl-offload` compiler option, and inclusion of the standard `<algorithm>` header is explicitly required for PSTL Offload to work.
39+
The `FileWordCount` sample also demonstrates the use of the transform, copy, copy_if, and for_each standard C++17 Parallel Algorithms. The `ParSTLTests` sample uses STL algorithms such as reduce, accumulate, find, copy_if, inclusive_scan, min_element, max_element, minmax_element, is_partitioned, lexicographical_compare, binary_search, lower_bound, and upper_bound. These algorithms perform tasks like summing elements, finding values, copying based on conditions, scanning, and searching within large datasets.
3940
The `-fsycl-pstl-offload` option enables the offloading of C++ standard parallel algorithms that were only called with `std::execution::par_unseq` policy to a SYCL device. The offloaded algorithms are implemented via the oneAPI Data Parallel C++ Library (oneDPL). This option is an experimental feature. If the argument is not specified, the compiler offloads to the default SYCL device.
4041
The performance of memory allocations may be improved by using the `SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR` environment variable.
4142

@@ -106,7 +107,19 @@ When working with the command-line interface (CLI), you should configure the one
106107
$ make run_fwc1 //for PAR Policy
107108
$ unset ONEAPI_DEVICE_SELECTOR
108109
```
109-
110+
Run `pSTL_offload-ParSTLTest` on GPU.
111+
```
112+
$ export ONEAPI_DEVICE_SELECTOR=level_zero:gpu
113+
$ ./ParSTLTest
114+
$ unset ONEAPI_DEVICE_SELECTOR
115+
```
116+
Run `pSTL_offload-ParSTLTest` on CPU.
117+
```
118+
$ export ONEAPI_DEVICE_SELECTOR=*:cpu
119+
$ ./ParSTLTest
120+
$ unset ONEAPI_DEVICE_SELECTOR
121+
```
122+
110123
#### Troubleshooting
111124
112125
If an error occurs, you can get more details by running `make` with the `VERBOSE=1` argument:

0 commit comments

Comments
 (0)