Merge pull request #2475 from VeenaGayathri12/development

jimmytwei · web-flow · commit d2eeab27b8dc · 2024-09-16T21:22:05.000-07:00
Add Sample for PSTL Functionality Demonstrating Performance of Various C++ STL Algorithms
diff --git a/Libraries/oneDPL/pSTL_offload/ParSTLTests/CMakeLists.txt b/Libraries/oneDPL/pSTL_offload/ParSTLTests/CMakeLists.txt
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 3.12)
+# The compiler has to be chosen before project() runs its compiler detection.
+set(CMAKE_CXX_COMPILER icpx)
+
+project(ParSTLTests)
+
+find_package(TBB REQUIRED)
+
+if(GPU)
+  # To build for Intel® Data Center GPU Max 1550 or 1100
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -fsycl-pstl-offload=gpu -D GPU")
+endif()
+if(CPU)
+  # To build for Intel® UHD Graphics, Intel® Gen9, Gen11, Xeon CPU
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D CPU ")
+endif()
+
+add_executable(ParSTLTest main.cpp)
+target_link_libraries(ParSTLTest PRIVATE TBB::tbb)  # target from find_package(TBB), not a raw -ltbb
diff --git a/Libraries/oneDPL/pSTL_offload/ParSTLTests/main.cpp b/Libraries/oneDPL/pSTL_offload/ParSTLTests/main.cpp
@@ -0,0 +1,178 @@
+
+
+#include <algorithm>
+#include <chrono>
+#include <execution>
+#include <iostream>
+#include <numeric>
+#include <vector>
+// Calls func once and prints "<title>: <elapsed> ms, res <value>" to stdout.
+template <typename TFunc>
+void RunAndMeasure(const char* title, TFunc func) {
+  using Millis = std::chrono::duration<double, std::milli>;
+  const auto t0 = std::chrono::steady_clock::now();
+  auto value = func();
+  const Millis elapsed = std::chrono::steady_clock::now() - t0;
+  std::cout << title << ": " << elapsed.count() << " ms, res " << value << "\n";
+}
+
+// Benchmarks a set of standard algorithms on 1,024,000,000 doubles under the
+// seq, par and par_unseq execution policies, printing one timing line per run.
+// Scratch buffers are allocated up front so that only the algorithm is timed.
+int main() {
+  constexpr std::vector<double>::size_type kElementCount = 1024000000;
+  std::vector<double> v(kElementCount, 0.5);
+  std::vector<double> result(v.size());
+
+  std::vector<double> v1(kElementCount);
+  std::iota(v1.begin(), v1.end(), 1.0);
+
+  RunAndMeasure("std::warm up", [&v] {
+    return std::reduce(std::execution::seq, v.begin(), v.end(), 0.0);
+  });
+
+  RunAndMeasure("std::accumulate",
+                [&v] { return std::accumulate(v.begin(), v.end(), 0.0); });
+
+  RunAndMeasure("std::reduce, seq", [&v] {
+    return std::reduce(std::execution::seq, v.begin(), v.end(), 0.0);
+  });
+
+  RunAndMeasure("std::reduce, par", [&v] {
+    return std::reduce(std::execution::par, v.begin(), v.end(), 0.0);
+  });
+
+  RunAndMeasure("std::reduce, par_unseq", [&v] {
+    return std::reduce(std::execution::par_unseq, v.begin(), v.end(), 0.0);
+  });
+
+  RunAndMeasure("std::find, seq", [&v] {
+    auto res = std::find(std::execution::seq, std::begin(v), std::end(v), 0.6);
+    return res == std::end(v) ? 0.0 : 1.0;
+  });
+
+  RunAndMeasure("std::find, par", [&v] {
+    auto res = std::find(std::execution::par, std::begin(v), std::end(v), 0.6);
+    return res == std::end(v) ? 0.0 : 1.0;
+  });
+
+  RunAndMeasure("std::find, par_unseq", [&v] {
+    auto res = std::find(std::execution::par_unseq, std::begin(v), std::end(v), 0.6);
+    return res == std::end(v) ? 0.0 : 1.0;
+  });
+
+  RunAndMeasure("std::copy_if, seq", [&v, &result] {
+    auto new_end = std::copy_if(std::execution::seq, v.begin(), v.end(), result.begin(),
+                                [](double value) { return value > 0.4; });
+    return std::distance(result.begin(), new_end);
+  });
+
+  RunAndMeasure("std::copy_if, par", [&v, &result] {
+    auto new_end = std::copy_if(std::execution::par, v.begin(), v.end(), result.begin(),
+                                [](double value) { return value > 0.4; });
+    return std::distance(result.begin(), new_end);
+  });
+
+  RunAndMeasure("std::copy_if, par_unseq", [&v, &result] {
+    auto new_end = std::copy_if(std::execution::par_unseq, v.begin(), v.end(), result.begin(),
+                                [](double value) { return value > 0.4; });
+    return std::distance(result.begin(), new_end);
+  });
+
+  // The scans write into `result`, so no allocation falls inside the timed region.
+  RunAndMeasure("std::inclusive_scan, seq", [&v, &result] {
+    std::inclusive_scan(std::execution::seq, v.begin(), v.end(), result.begin());
+    return result.back();
+  });
+
+  RunAndMeasure("std::inclusive_scan, par", [&v, &result] {
+    std::inclusive_scan(std::execution::par, v.begin(), v.end(), result.begin());
+    return result.back();
+  });
+
+  RunAndMeasure("std::inclusive_scan, par_unseq", [&v, &result] {
+    std::inclusive_scan(std::execution::par_unseq, v.begin(), v.end(), result.begin());
+    return result.back();
+  });
+
+  RunAndMeasure("std::min_element, seq", [&v1] {
+    return *std::min_element(std::execution::seq, v1.begin(), v1.end());
+  });
+
+  RunAndMeasure("std::min_element, par", [&v1] {
+    return *std::min_element(std::execution::par, v1.begin(), v1.end());
+  });
+
+  RunAndMeasure("std::min_element, par_unseq", [&v1] {
+    return *std::min_element(std::execution::par_unseq, v1.begin(), v1.end());
+  });
+
+  RunAndMeasure("std::max_element, seq", [&v1] {
+    return *std::max_element(std::execution::seq, v1.begin(), v1.end());
+  });
+
+  RunAndMeasure("std::max_element, par", [&v1] {
+    return *std::max_element(std::execution::par, v1.begin(), v1.end());
+  });
+
+  RunAndMeasure("std::max_element, par_unseq", [&v1] {
+    return *std::max_element(std::execution::par_unseq, v1.begin(), v1.end());
+  });
+
+  RunAndMeasure("std::minmax_element, seq", [&v1] {
+    auto extremes = std::minmax_element(std::execution::seq, v1.begin(), v1.end());
+    return *extremes.first + *extremes.second;
+  });
+
+  RunAndMeasure("std::minmax_element, par", [&v1] {
+    auto extremes = std::minmax_element(std::execution::par, v1.begin(), v1.end());
+    return *extremes.first + *extremes.second;
+  });
+
+  RunAndMeasure("std::minmax_element, par_unseq", [&v1] {
+    auto extremes = std::minmax_element(std::execution::par_unseq, v1.begin(), v1.end());
+    return *extremes.first + *extremes.second;
+  });
+
+  RunAndMeasure("std::is_partitioned, seq", [&v] {
+    return std::is_partitioned(std::execution::seq, v.begin(), v.end(), [](double n) { return n < 1.0; });
+  });
+
+  RunAndMeasure("std::is_partitioned, par", [&v] {
+    return std::is_partitioned(std::execution::par, v.begin(), v.end(), [](double n) { return n < 1.0; });
+  });
+
+  RunAndMeasure("std::is_partitioned, par_unseq", [&v] {
+    return std::is_partitioned(std::execution::par_unseq, v.begin(), v.end(), [](double n) { return n < 1.0; });
+  });
+
+  // Built once, outside the lambdas, so constructing it is not part of any timing.
+  const std::vector<double> v2(v.size(), 0.5);
+  RunAndMeasure("std::lexicographical_compare,  seq", [&v, &v2] {
+    return std::lexicographical_compare(std::execution::seq, v.begin(), v.end(), v2.begin(), v2.end());
+  });
+
+  RunAndMeasure("std::lexicographical_compare, par", [&v, &v2] {
+    return std::lexicographical_compare(std::execution::par, v.begin(), v.end(), v2.begin(), v2.end());
+  });
+
+  RunAndMeasure("std::lexicographical_compare, par_unseq", [&v, &v2] {
+    return std::lexicographical_compare(std::execution::par_unseq, v.begin(), v.end(), v2.begin(), v2.end());
+  });
+
+  // The searches below need sorted input: v is constant and v1 is ascending.
+  RunAndMeasure("std::binary_search", [&v] {
+    return std::binary_search(v.begin(), v.end(), 0.5);
+  });
+
+  // 0.5 is below every element of v1, so both bounds land on v1.front().
+  RunAndMeasure("std::lower_bound", [&v1] {
+    return *std::lower_bound(v1.begin(), v1.end(), 0.5);
+  });
+
+  RunAndMeasure("std::upper_bound", [&v1] {
+    return *std::upper_bound(v1.begin(), v1.end(), 0.5);
+  });
+
+  return 0;
+}
diff --git a/Libraries/oneDPL/pSTL_offload/README.md b/Libraries/oneDPL/pSTL_offload/README.md
@@ -14,12 +14,13 @@ The `pSTL_offload` sample demonstrates the offloading of C++ standard parallel a
 
 Offloading the C++ standard parallel STL code (par-unseq policy) to GPU and CPU  without any code changes when using the `-fsycl-pstl-offload` compiler option with Intel® DPC+/C+ compiler. It is an experimental feature of oneDPL.
 
-This folder contains two sample examples in the following folders:
+This folder contains three sample examples in the following folders:
 
 | Folder Name                           | Description
 |:---                                   |:---
 | `FileWordCount`                       | Counting Words in Files Example
 | `WordCount`                           | Counting Words generated Example
+| `ParSTLTests`                         | Examples of Various STL Algorithms with Execution Policies
 
 > **Note**: For more information refer to [Get Started with Parallel STL](https://www.intel.com/content/www/us/en/developer/articles/guide/get-started-with-parallel-stl.html).
 
@@ -34,8 +35,8 @@ This folder contains two sample examples in the following folders:
 
 ## Key Implementation Details
 
-The example includes two samples `FileWordCount` and `WordCount` which count the number of words in files and the number of words generated respectively using the standard C++17 Parallel Algorithm [transfor_reduce](https://en.cppreference.com/w/cpp/algorithm/transform_reduce). This computation can be offloaded to the GPU device with the help of `-fsycl-pstl-offload` compiler option and standard <algorithm> header inclusion is explicitly required for PSTL Offload to work.
-FileWordCount sample also demonstrates the use of transform, copy, copy_if, and for_each standard C++17 Parallel Algorithms.
+The example includes three samples: `FileWordCount`, `WordCount`, and `ParSTLTests`. `FileWordCount` and `WordCount` count the number of words in files and the number of words generated, respectively, using the standard C++17 Parallel Algorithm [transform_reduce](https://en.cppreference.com/w/cpp/algorithm/transform_reduce). `ParSTLTests` demonstrates the use of various STL algorithms with different execution policies (seq, par, par_unseq). It applies these algorithms to large datasets and prints the results for each execution. This computation can be offloaded to the GPU device with the help of the `-fsycl-pstl-offload` compiler option; explicit inclusion of the standard `<algorithm>` header is required for PSTL Offload to work.
+The `FileWordCount` sample also demonstrates the use of the transform, copy, copy_if, and for_each standard C++17 Parallel Algorithms. The `ParSTLTests` sample uses STL algorithms such as reduce, accumulate, find, copy_if, inclusive_scan, min_element, max_element, minmax_element, is_partitioned, lexicographical_compare, binary_search, lower_bound, and upper_bound. These algorithms perform tasks like summing elements, finding values, copying based on conditions, scanning, and searching within large datasets.
 The `-fsycl-pstl-offload` option enables the offloading of C++ standard parallel algorithms that were only called with `std::execution::par_unseq` policy to a SYCL device. The offloaded algorithms are implemented via the oneAPI Data Parallel C++ Library (oneDPL). This option is an experimental feature. If the argument is not specified, the compiler offloads to the default SYCL device.
 The performance of memory allocations may be improved by using the `SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR` environment variable.
 
@@ -106,7 +107,19 @@ When working with the command-line interface (CLI), you should configure the one
     $ make run_fwc1              //for PAR Policy
     $ unset ONEAPI_DEVICE_SELECTOR
     ```
-    
+   Run `pSTL_offload-ParSTLTest` on GPU.
+    ```
+    $ export ONEAPI_DEVICE_SELECTOR=level_zero:gpu
+    $ ./ParSTLTest
+    $ unset ONEAPI_DEVICE_SELECTOR
+    ```
+   Run `pSTL_offload-ParSTLTest` on CPU.
+    ```
+    $ export ONEAPI_DEVICE_SELECTOR=*:cpu
+    $ ./ParSTLTest
+    $ unset ONEAPI_DEVICE_SELECTOR
+    ```
+
 #### Troubleshooting
 
 If an error occurs, you can get more details by running `make` with the `VERBOSE=1` argument: