From c9b30318780a793be08535901d3fd31f066eedb3 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 26 Mar 2025 18:03:46 +0000 Subject: [PATCH 1/2] initial commit --- .../cpp-loop-size-context/Example.md | 65 ++++++++++++++ .../cpp-loop-size-context/Introduction.md | 25 ++++++ .../cpp-loop-size-context/_index.md | 41 +++++++++ .../cpp-loop-size-context/_next-steps.md | 8 ++ .../providing-inside-knowledge.md | 89 +++++++++++++++++++ 5 files changed, 228 insertions(+) create mode 100644 content/learning-paths/cross-platform/cpp-loop-size-context/Example.md create mode 100644 content/learning-paths/cross-platform/cpp-loop-size-context/Introduction.md create mode 100644 content/learning-paths/cross-platform/cpp-loop-size-context/_index.md create mode 100644 content/learning-paths/cross-platform/cpp-loop-size-context/_next-steps.md create mode 100644 content/learning-paths/cross-platform/cpp-loop-size-context/providing-inside-knowledge.md diff --git a/content/learning-paths/cross-platform/cpp-loop-size-context/Example.md b/content/learning-paths/cross-platform/cpp-loop-size-context/Example.md new file mode 100644 index 0000000000..dc0bd8ec8b --- /dev/null +++ b/content/learning-paths/cross-platform/cpp-loop-size-context/Example.md @@ -0,0 +1,65 @@ +--- +title: Example +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Example + +The following `C++` snippet takes user input as the loop size so that the loop size, `max_loop_size`, is only known at runtime. This initialises an array of size, , `max_loop_size` with the value for each element corresponding to the index position. The function, `foo`, loops of each element to print out the sum of all elements. + +Copy the snippet below into a file named, `no-context.cpp`. 
+ +```cpp +#include <iostream> +#include <chrono> + +void foo(const int* x, int max_loop_size) +{ + int sum = 0; + for (int k = 0; k < max_loop_size; k++) { + sum += x[k]; + } + std::cout << "Sum: " << sum << std::endl; +} + +int main() { + int max_loop_size; + std::cout << "Enter a value for max_loop_size (must be a multiple of 4): "; + std::cin >> max_loop_size; + + int x[max_loop_size]; + // Initialise test data + for(int i = 0; i < max_loop_size; ++i) x[i] = i; + + // Start timing + auto start = std::chrono::high_resolution_clock::now(); + foo(x, max_loop_size); + // Stop timing + auto end = std::chrono::high_resolution_clock::now(); + + // Calculate and display the elapsed time + auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count(); + std::cout << "Time taken by foo: " << duration << " nanoseconds" << std::endl; + + return 0; +} +``` + +Compiling using the following command. + +```bash +g++ -O3 -march=armv8-a+simd -o no_context +``` + +Running the example with the number 4000 leads to the following results. Naturally you will see variability depending on which platform you run this on. + +```output +./no_context +Enter a value for max_loop_size (must be a multiple of 4): 40000 +Sum: 799980000 +Time taken by foo: 138100 nanoseconds +``` + diff --git a/content/learning-paths/cross-platform/cpp-loop-size-context/Introduction.md b/content/learning-paths/cross-platform/cpp-loop-size-context/Introduction.md new file mode 100644 index 0000000000..d247b487e6 --- /dev/null +++ b/content/learning-paths/cross-platform/cpp-loop-size-context/Introduction.md @@ -0,0 +1,25 @@ +--- +title: Setup +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Introduction + +Often the programmer will have a better understanding of their software and the inputs than the compiler. For example, if the loop size is calculated at runtime, the compiler will have to account for a variable size. 
However, a developer may have knowledge of the runtime profile, for example if the loop size is always a multiple of a specific number. + +To provide this context to the compiler we will use a simple example written in C++. + +## Setup + +In this learning path I will be using an Arm-based `r7g.large` instance from AWS but any Arm-based machine can be used. + +Install the `g++` compiler with the following commands. Adjust to the appropriate commands for your operating system. + +```bash +sudo apt update +sudo apt install g++ +``` + diff --git a/content/learning-paths/cross-platform/cpp-loop-size-context/_index.md b/content/learning-paths/cross-platform/cpp-loop-size-context/_index.md new file mode 100644 index 0000000000..ee196037bb --- /dev/null +++ b/content/learning-paths/cross-platform/cpp-loop-size-context/_index.md @@ -0,0 +1,41 @@ +--- +title: Learn to improve for loop run time with loop size context + +minutes_to_complete: 15 + +who_is_this_for: C++ developers + +learning_objectives: + - Learn how to add preexisting knowledge of loop sizes to for loops + +prerequisites: + - Access to an Arm-based machine / instance + - Basic understanding of C++ + +author: Kieran Hejmadi + +### Tags +skilllevels: Introductory +subjects: C++ +armips: + - Neoverse +tools_software_languages: + - C++ +operatingsystems: + - Linux + + + +further_reading: + - resource: + title: PLACEHOLDER MANUAL + link: PLACEHOLDER MANUAL LINK + type: documentation + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. 
+--- diff --git a/content/learning-paths/cross-platform/cpp-loop-size-context/_next-steps.md b/content/learning-paths/cross-platform/cpp-loop-size-context/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/cross-platform/cpp-loop-size-context/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/cross-platform/cpp-loop-size-context/providing-inside-knowledge.md b/content/learning-paths/cross-platform/cpp-loop-size-context/providing-inside-knowledge.md new file mode 100644 index 0000000000..36d2d9111e --- /dev/null +++ b/content/learning-paths/cross-platform/cpp-loop-size-context/providing-inside-knowledge.md @@ -0,0 +1,89 @@ +--- +title: Adding Inside Knowledge +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Adding Inside Knowledge + +To make the compiler aware that the input will be a multiple of 4 we will rewrite our loop size as the following. + +```output +((max_loop_size/4)*4) +``` + +Mathematically this may seem redundant. However since `(max_loop_size/4)` will be truncated to an integer this guarantees `(max_loop_size/4)*4` is a multiple of 4. + +As slightly easier to read method that avoids confusion when arguments are passed in is dividing the variable before passing it in. For example. 
+ +```output +(max_loop_size_div_4 * 4) +``` + +## Adding Insider Knowledge + +```cpp +#include <iostream> +#include <chrono> + +void foo(const int* x, int max_loop_size_div_4) +{ + int sum = 0; + for (int k = 0; k < max_loop_size_div_4 * 4; k++) { + sum += x[k]; + } + std::cout << "Sum: " << sum << std::endl; +} + +int main() { + int max_loop_size; + std::cout << "Enter a value for max_loop_size (must be a multiple of 4): "; + std::cin >> max_loop_size; + + int max_loop_size_div_4 = max_loop_size / 4; + int x[max_loop_size]; + // Initialise test data + for(int i = 0; i < max_loop_size; ++i) x[i] = i; + + // Start timing + auto start = std::chrono::high_resolution_clock::now(); + foo(x, max_loop_size_div_4); + // Stop timing + auto end = std::chrono::high_resolution_clock::now(); + + // Calculate and display the elapsed time + auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count(); + std::cout << "Time taken by foo: " << duration << " nanoseconds" << std::endl; + + return 0; +} +``` + +Again compile with the same compiler flags. + +```bash +g++ -O3 -march=armv8-a+simd -o context +``` + +```output +./context +Enter a value for max_loop_size (must be a multiple of 4): 40000 +Sum: 799980000 +Time taken by foo: 24650 nanoseconds +``` + +## Comparison + +To compare we will use compiler explorer to see the assembly. + +First, looking at the example without context [here](https://godbolt.org/z/qPaW5Kjxa). +Second, looking at the example with context [here](https://godbolt.org/z/rhj65Pe4v). + + +[Here](https://godbolt.org/z/nvx4j1vTK). + +As the assembly shows we have fewer lines of assembly corresponding to the function `foo` as there is less setup code to account given the insider knowledge. 
+ + From 762f3aebc25e439f12b7634a02bea4efc1f96a59 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 26 Mar 2025 18:19:26 +0000 Subject: [PATCH 2/2] final check --- .../cpp-loop-size-context/Example.md | 6 ++--- .../cpp-loop-size-context/Introduction.md | 8 +++---- .../cpp-loop-size-context/_index.md | 6 ++--- .../providing-inside-knowledge.md | 24 ++++++++----------- 4 files changed, 20 insertions(+), 24 deletions(-) diff --git a/content/learning-paths/cross-platform/cpp-loop-size-context/Example.md b/content/learning-paths/cross-platform/cpp-loop-size-context/Example.md index dc0bd8ec8b..285460da55 100644 --- a/content/learning-paths/cross-platform/cpp-loop-size-context/Example.md +++ b/content/learning-paths/cross-platform/cpp-loop-size-context/Example.md @@ -8,7 +8,7 @@ layout: learningpathall ## Example -The following `C++` snippet takes user input as the loop size so that the loop size, `max_loop_size`, is only known at runtime. This initialises an array of size, , `max_loop_size` with the value for each element corresponding to the index position. The function, `foo`, loops of each element to print out the sum of all elements. +The following `C++` snippet takes user input as the loop size so that the loop size, `max_loop_size`, is only known at runtime. This initialises an array of size `max_loop_size` with the value for each element corresponding to the index position. The function, `foo`, loops through each element to print out the sum of all elements. Copy the snippet below into a file named, `no-context.cpp`. @@ -51,10 +51,10 @@ int main() { Compiling using the following command. ```bash -g++ -O3 -march=armv8-a+simd -o no_context +g++ -O3 -march=armv8-a+simd no-context.cpp -o no_context ``` -Running the example with the number 4000 leads to the following results. Naturally you will see variability depending on which platform you run this on. +Running the example with the number 40000 leads to the following results. 
You will see runtime variability depending on which platform you run this on. ```output ./no_context diff --git a/content/learning-paths/cross-platform/cpp-loop-size-context/Introduction.md b/content/learning-paths/cross-platform/cpp-loop-size-context/Introduction.md index d247b487e6..6be3f649c5 100644 --- a/content/learning-paths/cross-platform/cpp-loop-size-context/Introduction.md +++ b/content/learning-paths/cross-platform/cpp-loop-size-context/Introduction.md @@ -8,15 +8,15 @@ layout: learningpathall ## Introduction -Often the programmer will have a better understanding of their software and the inputs than the compiler. For example, if the loop size is calculated at runtime, the compiler will have to account for a variable size. However, a developer may have knowledge of the runtime profile, for example if the loop size is always a multiple of a specific number. +Often, the programmer has deeper insights into their software's behavior and its inputs than the compiler does. For instance, if a loop's size is determined at runtime, the compiler must conservatively handle the possibility of variable sizes, potentially limiting optimization opportunities. However, a developer might know more about the application's runtime characteristics—such as the fact that the loop size always adheres to specific constraints, like being a multiple of a particular number. -To provide this context to the compiler we will use a simple example written in C++. +To illustrate how you can explicitly provide this valuable context to the compiler, we'll walk through a simple C++ example. ## Setup -In this learning path I will be using an Arm-based `r7g.large` instance from AWS but any Arm-based machine can be used. +In this learning path, I will be demonstrating the examples using an Arm-based `r7g.large` instance from AWS; however, you're welcome to follow along using any Arm-based machine that suits your environment or preference. 
-Install the `g++` compiler with the following commands. Adjust to the appropriate commands for your operating system. +To get started, you'll first need to install the `g++` compiler on your system. Use the following commands as a guide, adjusting them accordingly based on the operating system or distribution you're working with. ```bash sudo apt update diff --git a/content/learning-paths/cross-platform/cpp-loop-size-context/_index.md b/content/learning-paths/cross-platform/cpp-loop-size-context/_index.md index ee196037bb..d4bdf5007d 100644 --- a/content/learning-paths/cross-platform/cpp-loop-size-context/_index.md +++ b/content/learning-paths/cross-platform/cpp-loop-size-context/_index.md @@ -1,9 +1,9 @@ --- -title: Learn to improve for loop run time with loop size context +title: Learn to Optimize C++ Loops with Size Context minutes_to_complete: 15 -who_is_this_for: C++ developers +who_is_this_for: C++ developers who want to improve the runtime of for loops with basic insider knowledge of the loop size learning_objectives: - Learn how to add preexisting knowledge of loop sizes to for loops @@ -16,7 +16,7 @@ author: Kieran Hejmadi ### Tags skilllevels: Introductory -subjects: C++ +subjects: ML armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/cross-platform/cpp-loop-size-context/providing-inside-knowledge.md b/content/learning-paths/cross-platform/cpp-loop-size-context/providing-inside-knowledge.md index 36d2d9111e..bac5012d63 100644 --- a/content/learning-paths/cross-platform/cpp-loop-size-context/providing-inside-knowledge.md +++ b/content/learning-paths/cross-platform/cpp-loop-size-context/providing-inside-knowledge.md @@ -8,21 +8,23 @@ layout: learningpathall ## Adding Inside Knowledge -To make the compiler aware that the input will be a multiple of 4 we will rewrite our loop size as the following. 
+To explicitly inform the compiler that our input will always be a multiple of 4, we can rewrite the loop size calculation as follows: ```output ((max_loop_size/4)*4) ``` -Mathematically this may seem redundant. However since `(max_loop_size/4)` will be truncated to an integer this guarantees `(max_loop_size/4)*4` is a multiple of 4. +At first glance, this calculation might seem mathematically redundant. However, since the expression `(max_loop_size/4)` is an integer division, it truncates the result, effectively guaranteeing that `(max_loop_size/4)*4` will always yield a number divisible by 4. The compiler can pick up on this information and optimise accordingly. -As slightly easier to read method that avoids confusion when arguments are passed in is dividing the variable before passing it in. For example. +A slightly easier-to-read method that avoids confusion when passing arguments is to divide the variable and rename it before it is passed in. For example: ```output (max_loop_size_div_4 * 4) ``` -## Adding Insider Knowledge +## Improved Example + +Copy the snippet below and paste it into a file named `context.cpp`. ```cpp #include <iostream> @@ -64,7 +66,7 @@ int main() { Again compile with the same compiler flags. ```bash -g++ -O3 -march=armv8-a+simd -o context +g++ -O3 -march=armv8-a+simd context.cpp -o context ``` ```output @@ -73,17 +75,11 @@ Enter a value for max_loop_size (must be a multiple of 4): 40000 Sum: 799980000 Time taken by foo: 24650 nanoseconds ``` +In this particular run, the time taken has significantly reduced compared to our previous example. ## Comparison -To compare we will use compiler explorer to see the assembly. - -First, looking at the example without context [here](https://godbolt.org/z/qPaW5Kjxa). -Second, looking at the example with context [here](https://godbolt.org/z/rhj65Pe4v). - - -[Here](https://godbolt.org/z/nvx4j1vTK). 
- -As the assembly shows we have fewer lines of assembly corresponding to the function `foo` as there is less setup code to account given the insider knowledge. +To compare, we will use Compiler Explorer to view the assembly [here](https://godbolt.org/z/nvx4j1vTK). +As the assembly shows, we have fewer lines of assembly corresponding to the function `foo` when context is added. This is because the compiler can optimise the conditional checking and any clean-up code given the context.