diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index f88caac4b2..d0c76ac6e9 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -21,6 +21,8 @@ on: required: true HUGO_AUDIO_API: required: true + HUGO_PHI_ONNX_LLM_API: + required: true HUGO_DEV_PROG_SIGNIUP_FORM_MUNCHKIN_ID: required: true HUGO_FORM_ID_FOR_PROGRAM_SIGNUP: @@ -73,6 +75,7 @@ jobs: HUGO_LLM_API: ${{ secrets.HUGO_LLM_API }} HUGO_RAG_API: ${{ secrets.HUGO_RAG_API }} HUGO_AUDIO_API: ${{ secrets.HUGO_AUDIO_API }} + HUGO_PHI_ONNX_LLM_API: ${{ secrets.HUGO_PHI_ONNX_LLM_API }} HUGO_DEV_PROG_SIGNIUP_FORM_MUNCHKIN_ID: ${{ secrets.HUGO_DEV_PROG_SIGNIUP_FORM_MUNCHKIN_ID }} HUGO_FORM_ID_FOR_PROGRAM_SIGNUP: ${{ secrets.HUGO_FORM_ID_FOR_PROGRAM_SIGNUP }} diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4e3dad94e4..72c9b0f0e9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -24,5 +24,6 @@ jobs: HUGO_LLM_API: ${{ secrets.HUGO_LLM_API }} HUGO_RAG_API: ${{ secrets.HUGO_RAG_API }} HUGO_AUDIO_API: ${{ secrets.HUGO_AUDIO_API }} + HUGO_PHI_ONNX_LLM_API: ${{ secrets.HUGO_PHI_ONNX_LLM_API }} HUGO_DEV_PROG_SIGNIUP_FORM_MUNCHKIN_ID: ${{ secrets.HUGO_DEV_PROG_SIGNIUP_FORM_MUNCHKIN_ID }} HUGO_FORM_ID_FOR_PROGRAM_SIGNUP: ${{ secrets.HUGO_FORM_ID_FOR_PROGRAM_SIGNUP }} diff --git a/.github/workflows/production.yml b/.github/workflows/production.yml index 1fbe6be03b..b3b393fb83 100644 --- a/.github/workflows/production.yml +++ b/.github/workflows/production.yml @@ -24,5 +24,6 @@ jobs: HUGO_LLM_API: ${{ secrets.HUGO_LLM_API }} HUGO_RAG_API: ${{ secrets.HUGO_RAG_API }} HUGO_AUDIO_API: ${{ secrets.HUGO_AUDIO_API }} + HUGO_PHI_ONNX_LLM_API: ${{ secrets.HUGO_PHI_ONNX_LLM_API }} HUGO_DEV_PROG_SIGNIUP_FORM_MUNCHKIN_ID: ${{ secrets.HUGO_DEV_PROG_SIGNIUP_FORM_MUNCHKIN_ID }} HUGO_FORM_ID_FOR_PROGRAM_SIGNUP: ${{ secrets.HUGO_FORM_ID_FOR_PROGRAM_SIGNUP }} diff --git a/.wordlist.txt b/.wordlist.txt index 6620632450..00a6d1f24e 100644 --- 
a/.wordlist.txt +++ b/.wordlist.txt @@ -4323,4 +4323,129 @@ taskset unicast wrk's yy -zenoh \ No newline at end of file +zenoh +AFM +AOR +AWSEC +Agrawal +Arcee +Atheros +ChenYing +Colima +Corellium +Corestone +Croci +DBAREMETAL +Denormal +Docker's +Dpls +ETHOSU +FVP's +Gopalakrishnan +Gorman +HVM +Higham +Huai +ICML +IIR +IIoT +ImageStreams +Joana +Kuo +LANs +Liang +Libmath +MACC +MACCs +MachineSets +MobileNet +OpenShift's +PMLR +PipelineRun +PreserveOriginal +Queryable +Reimport +SPRA +STRINGIFY +Sram +TMS +Tekton +VPs +VSCode +WANs +WR +Waheed +Weidmann +Wikitest +XQuartz +Xiu +ZGVnN +Zenon +afm +aot +arXiv +arcee +armpl +ath +bitbake +bootloaders +cntr +cosf +cpp's +datadir +dddd +denormal +denormalized +edgeimpulse +ethosu +expf +geo +gupta +iMac +instrinsics +ish +keypair +learnable +libcurl +lldb +mL +mM +mR +mlr +nbc +nbr +nodeAffinity +nordic +oc +oe +openshift +pcs +podTemplate +podTemplate's +postinstallation +prepending +prog +queryable +redhat +reimport +shadergraph +sheel +spra +stefanalfbo +subnormals +taskrun +tg +thisunrolling +tokio +topologies +umax +varg +vexp +vgetq +vres +wifi +wlan +wlp +wlx +xquartz +zenohd \ No newline at end of file diff --git a/assets/contributors.csv b/assets/contributors.csv index 8de0eb0454..6df4525ab1 100644 --- a/assets/contributors.csv +++ b/assets/contributors.csv @@ -91,3 +91,6 @@ Gian Marco Iodice,Arm,,,, Aude Vuilliomenet,Arm,,,, Andrew Kilroy,Arm,,,, Peter Harris,Arm,,,, +Chenying Kuo,Adlink,evshary,evshary,, +William Liang,,wyliang,,, +Waheed Brown,Arm,https://github.com/armwaheed,https://www.linkedin.com/in/waheedbrown/,, diff --git a/content/learning-paths/automotive/_index.md b/content/learning-paths/automotive/_index.md index f25f1f9cce..8a002e10db 100644 --- a/content/learning-paths/automotive/_index.md +++ b/content/learning-paths/automotive/_index.md @@ -12,15 +12,17 @@ title: Automotive weight: 4 subjects_filter: - Containers and Virtualization: 3 -- Performance and Architecture: 1 +- Performance and 
Architecture: 2 operatingsystems_filter: - Baremetal: 1 -- Linux: 3 +- Linux: 4 - RTOS: 1 tools_software_languages_filter: - Automotive: 1 +- C: 1 - Docker: 2 - Python: 2 +- Raspberry Pi: 1 - ROS 2: 1 -- ROS2: 1 +- ROS2: 2 --- diff --git a/content/learning-paths/automotive/openadkit2_safetyisolation/4_multiinstance_executing.md b/content/learning-paths/automotive/openadkit2_safetyisolation/4_multiinstance_executing.md index 226a0d6542..6cca5ad53f 100644 --- a/content/learning-paths/automotive/openadkit2_safetyisolation/4_multiinstance_executing.md +++ b/content/learning-paths/automotive/openadkit2_safetyisolation/4_multiinstance_executing.md @@ -1,5 +1,6 @@ --- -title: Executing the Multi-Instance with DDS-Based Communication +title: Executing OpenAD Kit in Distributed ROS 2 Instances + weight: 5 ### FIXED, DO NOT MODIFY diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/1-get-started.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/1-get-started.md index 932afa453c..be13e4de9f 100644 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/1-get-started.md +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/1-get-started.md @@ -1,93 +1,162 @@ --- -title: Set up your Environment +title: Set up your SME2 development environment weight: 3 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Installing software for this Learning Path +## Choose your SME2 setup: native or emulated -To follow this Learning Path, you will need to set up an environment to develop with SME2. +{{< notice Note>}} +This Learning Path demonstrates how to use SME2 on macOS on a device with an M4 chip. It does not provide instructions for using SME on iPhone or iPad, even though they have SME2 support. +{{< /notice >}} -You will require: +To build or run SME2-accelerated code, first set up your development environment. 
+This section walks you through the required tools and two supported setup options: - - A compiler with support for SME2 instructions. You can use [Clang](https://www.llvm.org/) - version 18 or later, or [GCC](https://gcc.gnu.org/) version 14, or later. This Learning - Path uses ``Clang``. +* [**Native SME2 hardware**](#native-sme2) - build and run directly on a system with SME2 support, see [Devices with native SME2 support](#devices) - - An emulator to execute code with the SME2 instructions. This Learning - Path uses [Arm's Fixed Virtual Platform (FVP) model](https://developer.arm.com/Tools%20and%20Software/Fixed%20Virtual%20Platforms). +* [**Docker-based emulation**](#docker-sme2) - use a container to emulate SME2 in bare metal mode (without an OS) -You will also require Git and Docker installed on your machine. +## Download and explore the code examples -### Set up Git +To get started, begin by [downloading the code examples](https://gitlab.arm.com/learning-cde-examples/code-examples/-/archive/main/code-examples-main.tar.gz?path=learning-paths/cross-platform/multiplying-matrices-with-sme2). -To check if Git is already installed on your machine, use the following command line in a terminal: +Now extract the archive, and change directory to: +``code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2.`` -```BASH { output_lines=2 } -git --version -git version 2.47.1 +```BASH +tar xfz code-examples-main-learning-paths-cross-platform-multiplying-matrices-with-sme2.tar.gz -s /code-examples-main-learning-paths-cross-platform-multiplying-matrices-with-sme2/code-examples/ +cd code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2 ``` -If the above command line fails with a message similar to "``git: command not found``", then install Git following the steps for your machine's OS. 
+The directory structure should look like this: + +```TXT +code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2/ +├── .clang-format +├── .devcontainer/ +│ └── devcontainer.json +├── .git/ +├── .gitignore +├── Makefile +├── README.rst +├── docker/ +│ ├── assets.source_me +│ ├── build-all-containers.sh +│ ├── build-my-container.sh +│   └── sme2-environment.docker +├── hello.c +├── main.c +├── matmul.h +├── matmul_asm.c +├── matmul_asm_impl.S +├── matmul_intr.c +├── matmul_vanilla.c +├── misc.c +├── misc.h +├── preprocess_l_asm.S +├── preprocess_vanilla.c +├── run-fvp.sh +└── sme2_check.c +``` + +Among other files, it includes: +- Code examples. +- A `Makefile` to build the code. +- `run-fvp.sh` to run the FVP model. +- A `docker` directory containing: + - `assets.source_me` to provide toolchain paths. + - `build-my-container.sh`, a script that automates building the Docker image from the `sme2-environment.docker` file. It runs the Docker build command with the correct arguments so you don’t have to remember them. + - `sme2-environment.docker`, a custom Docker file that defines the steps to build the SME2 container image. It installs all the necessary dependencies, including the SME2-compatible compiler and Arm FVP emulator. + - `build-all-containers.sh`, a script to build multi-architecture images. +- `.devcontainer/devcontainer.json` for VS Code container support. + +{{% notice Note %}} +From this point, all instructions assume that your current directory is +``code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2``, so ensure that you are in the correct directory before proceeding. +{{% /notice %}} + +## Set up a system with native SME2 support {#native-sme2} + +To run SME2 code natively, ensure your system includes SME2 hardware and uses a compiler version that supports SME2. + +For the compiler, you can use [Clang](https://www.llvm.org/) version 18 or later, or [GCC](https://gcc.gnu.org/) version 14 or later. 
This Learning Path uses ``clang``. + +{{% notice Note %}} +At the time of writing, macOS ships with `clang` version 17.0.0, which doesn't support SME2. Use a newer version, such as 20.1.7, available through Homebrew.{{% /notice%}} + +You can check your compiler version using the command:``clang --version`` + +### Install Clang + +Install Clang using the instructions below, selecting either macOS or Linux/Ubuntu, depending on your setup: {{< tabpane code=true >}} + {{< tab header="Linux/Ubuntu" language="bash">}} -sudo apt install git + sudo apt install clang {{< /tab >}} + {{< tab header="macOS" language="bash">}} -brew install git + brew install llvm {{< /tab >}} + {{< /tabpane >}} -### Docker +You are now all set to start hacking with SME2. + +## Set up a system using SME2 emulation with Docker {#docker-sme2} + +If your machine doesn't support SME2, or you want to emulate it, you can use the Docker-based environment. -To enable you to get started easily and with the tools that you need, you can fetch a Docker container with the required compiler and FVP. Alternatively, if you do wish to build the container yourself, the ``Dockerfile`` is also available. +The Docker container includes both a compiler and [Arm's Fixed Virtual Platform (FVP) +model](https://developer.arm.com/Tools%20and%20Software/Fixed%20Virtual%20Platforms) +for emulating code that uses SME2 instructions. You can either run the prebuilt container image provided in this Learning Path or build it yourself using the Docker file that is included. +If building manually, follow the instructions in the ``sme2-environment.docker`` file to install the required tools on your machine. + +### Install and verify Docker {{% notice Note %}} -This Learning Path works without ``docker``, but the compiler and the FVP must be available in your search path. +Docker is optional, but if you don’t use it, you must manually install the compiler and FVP, and ensure they’re in your `PATH`. 
{{% /notice %}} -Start by checking that ``docker`` is installed on your machine by typing the following -command line in a terminal: +To begin, start by checking that Docker is installed on your machine: ```BASH { output_lines="2" } docker --version Docker version 27.3.1, build ce12230 ``` -If the above command fails with a message similar to "``docker: command not found``" -then follow the steps from the [Docker Install Guide](https://learn.arm.com/install-guides/docker/). +If the above command fails with an error message similar to "``docker: command not found``", then follow the steps from the [Docker install guide](/install-guides/docker/) to install Docker. {{% notice Note %}} -You might need to login again or restart your machine for the changes to take effect. +You might need to log out and back in again or restart your machine for the changes to take +effect. {{% /notice %}} -Once you have confirmed that Docker is installed on your machine, you can check that it is operating normally with the following: +Once you have confirmed that Docker is installed on your machine, you can check +that it is working with the following: ```BASH { output_lines="2-27" } docker run hello-world Unable to find image 'hello-world:latest' locally latest: Pulling from library/hello-world -478afc919002: Pull complete -Digest: sha256:305243c734571da2d100c8c8b3c3167a098cab6049c9a5b066b6021a60fcb966 +c9c5fd25a1bd: Pull complete +Digest: sha256:940c619fbd418f9b2b1b63e25d8861f9cc1b46e3fc8b018ccfe8b78f19b8cc4f Status: Downloaded newer image for hello-world:latest Hello from Docker! This message shows that your installation appears to be working correctly. -To generate this message, Docker followed these steps: - +To generate this message, Docker took the following steps: 1. The Docker client contacted the Docker daemon. - 2. The Docker daemon pulled the "hello-world" image from the Docker Hub. (arm64v8) - 3. 
The Docker daemon created a new container from that image which runs the executable that produces the output you are currently reading. - 4. The Docker daemon streamed that output to the Docker client, which sent it to your terminal. @@ -101,109 +170,90 @@ For more examples and ideas, visit: https://docs.docker.com/get-started/ ``` -## Environment +You can use Docker in the following ways: +- [Directly from the command line](#run-commands-from-a-terminal-using-docker) - for example, when you are working from a terminal on your local machine. -Now, [download the code examples](https://gitlab.arm.com/learning-code-examples/code-examples/-/archive/main/code-examples-main.tar.gz?path=learning-paths/cross-platform/multiplying-matrices-with-sme2) -for this learning path, expand the archive and change your current directory to -``code-examples/learning-paths/cross-platform/sme2`` : +- [Within a containerized environment](#use-an-interactive-docker-shell) - by configuring VS Code to execute all the commands inside a Docker container, allowing you to work seamlessly within the +Docker environment. -```BASH -tar xfz code-examples-main-learning-paths-cross-platform-multiplying-matrices-with-sme2.tar.gz -s /code-examples-main-learning-paths-cross-platform-multiplying-matrices-with-sme2/code-examples/ -cd code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2 -``` +### Run commands from a terminal using Docker -This list of content in this directory should look like this : +When a command is executed in the Docker container environment, you must prepend +it with instructions on the command line so that your shell executes it within +the container. 
-```TXT -code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2/ -├── .clang-format -├── .devcontainer/ -│ └── devcontainer.json -├── .git/ -├── .gitignore -├── Makefile -├── README.rst -├── docker/ -│ ├── assets.source_me -│ ├── build-all-containers.sh -│ ├── build-my-container.sh -│   └── sme2-environment.docker -├── hello.c -├── main.c -├── matmul.h -├── matmul_asm.c -├── matmul_asm_impl.S -├── matmul_intr.c -├── matmul_vanilla.c -├── misc.c -├── misc.h -├── preprocess_l_asm.S -├── preprocess_vanilla.c -├── run-fvp.sh -└── sme2_check.c -``` +For example, to execute ``COMMAND ARGUMENTS`` in the SME2 Docker container, the +command line looks like this: -It contains: -- Code examples. -- A ``Makefile`` that builds the code examples. -- A shell script called ``run-fvp.sh`` that runs the FVP. -- A directory called ``docker`` that contains materials related to Docker, which are: - - A script called ``assets.source_me`` that provides the FVP and compiler toolchain references. - - A Docker recipe called ``sme2-environment.docker`` to build the container that - you will use. - - A shell script called ``build-my-container.sh`` that you can use if you want to build the Docker container. This is not essential however, as ready-made images are made available for you. - - A script called ``build-all-containers.sh`` that was used to create the image for you to download to provide multi-architecture support for both x86_64 and AArch64. -- A configuration script for VS Code to be able to use the container from the IDE called ``.devcontainer/devcontainer.json``. 
+```BASH +docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 COMMAND ARGUMENTS +``` -{{% notice Note %}} -From this point in the Learning Path, all instructions assume that your current -directory is ``code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2``.{{% /notice %}} +This invokes Docker, using the +``armswdev/sme2-learning-path:sme2-environment-v2`` container image, and mounts +the current working directory (the +``code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2``) +inside the container to ``/work``, then sets ``/work`` as the working directory +and runs ``COMMAND ARGUMENTS`` in this environment. +For example, to run ``make``, you need to enter: -## Using the environment +```BASH +docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 make +``` -Docker containers provide you with the functionality to execute commands in an isolated environment, where you have all the necessary tools that you require without having to clutter your machine. The containers runs independently, which means that they do not interfere with other containers on the same machine or server. +### Use an interactive Docker shell -You can use Docker in the following ways: -- Directly from the command line. For example, when you are working from a terminal on your local machine. -- Within a containerized environment. Configure VS Code to execute run all the commands inside a Docker container, allowing you to work seamlessly within the Docker environment. +The standard `docker run` commands can be long and repetitive. To streamline your workflow, you can start an interactive Docker session that allows you to run commands directly - without having to prepend docker run each time. 
-### Working from a terminal +To launch an interactive shell inside the container, use the `-it` flag: -When a command is executed in the Docker container environment, you must prepend it with instructions on the command line so that your shell executes it within the container. +```BASH +docker run --rm -it -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 +``` -For example, to execute ``COMMAND ARGUMENTS`` in the SME2 Docker container, the command line looks like this: +You are now in the Docker container, and you can execute all commands directly. For +example, the ``make`` command can now be simply invoked with: -```SH -docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v1 COMMAND ARGUMENTS +```BASH +make ``` -This invokes Docker, using the -``armswdev/sme2-learning-path:sme2-environment-v1``container -image, and mounts the current working directory (the ``code-examples/learning-paths/cross-platform/multiplying-matrices-with-sme2``) -inside the container to ``/work``, then sets ``/work`` as the -working directory and runs ``COMMAND ARGUMENTS`` in this environment. +To exit the container, simply hit CTRL+D. Note that the container is not persistent (it was invoked with ``--rm``), so each invocation will use a container freshly built from the image. All the files reside outside the container, so changes you make to them will be persistent. -For example, to run ``make``, you need to enter: +### Develop with Docker in Visual Studio Code -```SH -docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v1 make -``` +If you are using Visual Studio Code as your IDE, the container setup is already configured with `devcontainer/devcontainer.json`. -### Working from within the Docker container +Make sure you have the [Microsoft Dev Containers](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) extension installed. 
-Make sure you have the [Microsoft Dev -Containers](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) -extension installed. - -Then select the **Reopen in Container** menu entry as Figure 1 shows. +Then select the **Reopen in Container** menu entry as shown below. It automatically finds and uses ``.devcontainer/devcontainer.json``: -![example image alt-text#center](VSCode.png "Figure 1: Setting up the Docker Container.") +![VSCode Docker alt-text#center](VSCode.png "Figure 1: Setting up the Docker container.") -All your commands now run within the container, so there is no need to prepend them with a Docker invocation, as VS Code handles all this seamlessly for you. +All your commands now run within the container, so there is no need to prepend +them with a Docker invocation, as VS Code handles all this seamlessly for you. {{% notice Note %}} -For the rest of this Learning Path, shell commands include the full Docker invocation so that users not using VS Code can copy the complete command line. However, if you are using VS Code, you only need to use the `COMMAND ARGUMENTS` part. +For the rest of this Learning Path, shell commands include the full Docker +invocation so that if you are not using VS Code you can copy the complete command line. +However, if you are using VS Code, you only need to use the `COMMAND ARGUMENTS` +part. {{% /notice %}} + +### Devices with native SME2 support {#devices} + +These Apple devices support SME2 natively. 
+ + + +| Device | Release Date | Chip Options | +|-------------------------------------|--------------|---------------------------| +| iPhone 16 | 2024 | A18 | +| iPad Pro (7th generation) | 2024 | M4 | +| iMac (2024) | 2024 | M4 | +| Mac Mini (2024) | 2024 | M4, M4 Pro, M4 Max | +| MacBook Pro (14-inch, 16-inch, 2024)| 2024 | M4 Pro, M4 Max | +| MacBook Air (2025) | 2025 | M4 | diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/10-going-further.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/10-going-further.md new file mode 100644 index 0000000000..6cc5e2382d --- /dev/null +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/10-going-further.md @@ -0,0 +1,77 @@ +--- +title: Going further +weight: 12 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Beyond this implementation + +There are many different ways that you can extend and optimize the matrix multiplication algorithm beyond the specific SME2 implementation that you've explored in this Learning Path. While the current approach is tuned for performance on a specific hardware target, further improvements can make your code more general, more efficient, and better suited to a wider range of applications. + +Advanced optimization techniques are essential when adapting algorithms to real-world scenarios. These often include processing matrices of different shapes and sizes, handling mixed data types, or maximizing throughput for large batch operations. The ability to generalize and fine-tune your implementation opens the door to more scalable and reusable code that performs well across workloads. + +Whether you're targeting different data types, improving parallelism, or adapting to unusual matrix shapes, these advanced techniques give you more control over both correctness and performance. 
+ +Some ideas of improvements that you might like to test out include: + +* Generalization +* Loop unrolling +* The strategic use of matrix properties + +## Generalize the algorithm for different data types + +So far, you've focused on multiplying floating-point matrices. In practice, matrix operations often involve integer types as well. + +The structure of the algorithm (the core logic - tiling, outer product, and accumulation) remains consistent across data types. It uses preprocessing with tiling and outer product–based multiplication. To adapt it for other data types, you only need to change how values are: + +* Loaded from memory +* Accumulated (often with widening) +* Stored to the output + +Languages that support [generic programming](https://en.wikipedia.org/wiki/Generic_programming), such as C++ with templates, make this easier. + +Templates allow you to: + +* Swap data types flexibly +* Handle accumulation in a wider format when needed +* Reuse algorithm logic across multiple matrix types + +By expressing the algorithm generically, you benefit from the compiler generating multiple optimized variants, allowing you the opportunity to focus on: + +- Creating efficient algorithm design +- Testing and verification +- SME2-specific optimization + +## Unroll loops to compute multiple tiles + +For clarity, the `matmul_intr_impl` function in this Learning Path processes one tile at a time. However, SME2 supports multi-vector operations that enable better performance through loop unrolling. + +For example, the `preprocess_l_intr` function uses: + +```c +svld1_x2(...); // Load two vectors at once +``` +Loading two vectors at a time enables the simultaneous computing of more tiles. Since the matrices are already laid out efficiently in memory, consecutive loading is fast. Implementing this approach can improve the ``macc``-to-load ratio. 
+In order to check your understanding of SME2, you can try to implement this unrolling yourself in the intrinsic version (the assembly version already has this optimization). You can check your work by comparing your results to the expected reference values. + +## Optimize for special matrix shapes + +One method for optimization is to use strategies that are flexible depending on the matrices' dimensions. This is especially easy to set up when working in C or C++, rather than directly in assembly language. + +By playing with the mathematical properties of matrix multiplication and the outer product, it is possible to minimize data movement as well as reduce the overall number of operations to perform. + +For example, it is common that one of the matrices is actually a vector, meaning that it has a single row or column, and then it becomes advantageous to transpose it. Can you see why? + +The answer is that as the elements are stored contiguously in memory, ``Nx1`` and ``1xN`` matrices have the exact same memory layout. The transposition becomes a no-op, and the matrix elements stay in the same place in memory. + +An even more *degenerate* case that is easy to manage is when one of the matrices is essentially a scalar, which means that it is a matrix with one row and one column. + +Although the current code used here handles it correctly from a results point of view, a different algorithm and use of instructions might be more efficient. Can you think of another way? + + +In order to check your understanding of SME2, you can try to implement this unrolling yourself in the intrinsic version (the assembly version already has this optimization). You can check your work by comparing your results to the expected reference values. 
+ + diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/2-check-your-environment.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/2-check-your-environment.md index d188f3aca5..5c8a6e3f19 100644 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/2-check-your-environment.md +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/2-check-your-environment.md @@ -1,60 +1,94 @@ --- -title: Test your environment +title: Test your SME2 development environment weight: 4 ### FIXED, DO NOT MODIFY layout: learningpathall --- -In this section, you will check that your environment is all set up and ready to develop with SME2. This will be your first hands-on experience with the environment. - -## Compile the examples - -First, compile the example code with Clang: - -```BASH { output_lines="2-19" } -docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v1 make -clang --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -O2 -Wall -std=c99 -nostartfiles -lcrt0-semihost -lsemihost -Wl,--defsym=__boot_flash=0x80000000 -Wl,--defsym=__flash=0x80001000 -Wl,--defsym=__ram=0x81000000 -T picolibc.ld -o hello hello.c -clang --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -O2 -Wall -std=c99 -c -o sme2_check.o sme2_check.c -clang --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -O2 -Wall -std=c99 -c -o misc.o misc.c -clang --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -O2 -Wall -std=c99 -nostartfiles -lcrt0-semihost -lsemihost -Wl,--defsym=__boot_flash=0x80000000 -Wl,--defsym=__flash=0x80001000 -Wl,--defsym=__ram=0x81000000 -T picolibc.ld -o sme2_check sme2_check.o misc.o -clang --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -O2 
-Wall -std=c99 -DIMPL=asm -c -o main_asm.o main.c -clang --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -O2 -Wall -std=c99 -c -o matmul_asm.o matmul_asm.c -clang --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -O2 -Wall -std=c99 -c -o matmul_asm_impl.o matmul_asm_impl.S -clang --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -O2 -Wall -std=c99 -c -o preprocess_l_asm.o preprocess_l_asm.S -clang --target=aarch64-none-elf -fno-exceptions -fno-rtti -mno-unaligned-access -O2 -Wall -std=c99 -c -o matmul_vanilla.o matmul_vanilla.c -clang --target=aarch64-none-elf -fno-exceptions -fno-rtti -mno-unaligned-access -O2 -Wall -std=c99 -c -o preprocess_vanilla.o preprocess_vanilla.c -clang --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -O2 -Wall -std=c99 -nostartfiles -lcrt0-semihost -lsemihost -Wl,--defsym=__boot_flash=0x80000000 -Wl,--defsym=__flash=0x80001000 -Wl,--defsym=__ram=0x81000000 -T picolibc.ld -o sme2_matmul_asm main_asm.o matmul_asm.o matmul_asm_impl.o preprocess_l_asm.o matmul_vanilla.o preprocess_vanilla.o misc.o -clang --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -O2 -Wall -std=c99 -DIMPL=intr -c -o main_intr.o main.c -clang --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -O2 -Wall -std=c99 -c -o matmul_intr.o matmul_intr.c -clang --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -O2 -Wall -std=c99 -nostartfiles -lcrt0-semihost -lsemihost -Wl,--defsym=__boot_flash=0x80000000 -Wl,--defsym=__flash=0x80001000 -Wl,--defsym=__ram=0x81000000 -T picolibc.ld -o sme2_matmul_intr main_intr.o matmul_intr.o matmul_vanilla.o preprocess_vanilla.o misc.o +In this section, you'll verify that your environment is ready for SME2 development. 
This is your first hands-on task and confirms that the toolchain, hardware (or emulator), and compiler are set up correctly. + +## Build the code examples + +Use the `make` command to compile all examples and generate assembly listings: + +{{< tabpane code=true >}} + {{< tab header="Native SME2 support" language="bash" output_lines="2-19">}} +make +/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -DBAREMETAL=0 -o hello hello.c +/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -DBAREMETAL=0 -c -o sme2_check.o sme2_check.c +/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -DBAREMETAL=0 -c -o misc.o misc.c +/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -o sme2_check sme2_check.o misc.o +/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -DBAREMETAL=0 -DIMPL=asm -c -o main_asm.o main.c +/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -DBAREMETAL=0 -c -o matmul_asm.o matmul_asm.c +/opt/homebrew/opt/llvm/bin/clang -Wall -march=native+sve+sme2 -DBAREMETAL=0 -c -o matmul_asm_impl.o matmul_asm_impl.S +/opt/homebrew/opt/llvm/bin/clang -Wall -march=native+sve+sme2 -DBAREMETAL=0 -c -o preprocess_l_asm.o preprocess_l_asm.S +/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -c -o matmul_vanilla.o matmul_vanilla.c +/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -c -o preprocess_vanilla.o preprocess_vanilla.c +/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -o sme2_matmul_asm main_asm.o matmul_asm.o matmul_asm_impl.o preprocess_l_asm.o matmul_vanilla.o preprocess_vanilla.o misc.o +/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -DBAREMETAL=0 -DIMPL=intr -c -o main_intr.o main.c +/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -DBAREMETAL=0 -c -o matmul_intr.o matmul_intr.c +/opt/homebrew/opt/llvm/bin/clang -O2 -Wall -std=c99 -march=native+sme2 -o sme2_matmul_intr main_intr.o 
matmul_intr.o matmul_vanilla.o preprocess_vanilla.o misc.o +/opt/homebrew/opt/llvm/bin/llvm-objdump --demangle -d hello > hello.lst +/opt/homebrew/opt/llvm/bin/llvm-objdump --demangle -d sme2_check > sme2_check.lst +/opt/homebrew/opt/llvm/bin/llvm-objdump --demangle -d sme2_matmul_asm > sme2_matmul_asm.lst +/opt/homebrew/opt/llvm/bin/llvm-objdump --demangle -d sme2_matmul_intr > sme2_matmul_intr.lst + {{< /tab >}} + + {{< tab header="Emulated SME2 support" language="bash" output_lines="2-19">}} +docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 make +clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -DBAREMETAL=1 -nostartfiles -lcrt0-semihost -lsemihost -nostartfiles -lcrt0-semihost -lsemihost -Wl,--defsym=__boot_flash=0x80000000 -Wl,--defsym=__flash=0x80001000 -Wl,--defsym=__ram=0x81000000 -T picolibc.ld -o hello hello.c +clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -DBAREMETAL=1 -c -o sme2_check.o sme2_check.c +clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -DBAREMETAL=1 -c -o misc.o misc.c +clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -nostartfiles -lcrt0-semihost -lsemihost -nostartfiles -lcrt0-semihost -lsemihost -Wl,--defsym=__boot_flash=0x80000000 -Wl,--defsym=__flash=0x80001000 -Wl,--defsym=__ram=0x81000000 -T picolibc.ld -o sme2_check sme2_check.o misc.o +clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -DBAREMETAL=1 -DIMPL=asm -c -o main_asm.o main.c +clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -DBAREMETAL=1 -c -o matmul_asm.o matmul_asm.c +clang -Wall --target=aarch64-none-elf 
-march=armv9.4-a+sme2 -DBAREMETAL=1 -c -o matmul_asm_impl.o matmul_asm_impl.S +clang -Wall --target=aarch64-none-elf -march=armv9.4-a+sme2 -DBAREMETAL=1 -c -o preprocess_l_asm.o preprocess_l_asm.S +clang -O2 -Wall -std=c99 --target=aarch64-none-elf -fno-exceptions -fno-rtti -mno-unaligned-access -c -o matmul_vanilla.o matmul_vanilla.c +clang -O2 -Wall -std=c99 --target=aarch64-none-elf -fno-exceptions -fno-rtti -mno-unaligned-access -c -o preprocess_vanilla.o preprocess_vanilla.c +clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -nostartfiles -lcrt0-semihost -lsemihost -nostartfiles -lcrt0-semihost -lsemihost -Wl,--defsym=__boot_flash=0x80000000 -Wl,--defsym=__flash=0x80001000 -Wl,--defsym=__ram=0x81000000 -T picolibc.ld -o sme2_matmul_asm main_asm.o matmul_asm.o matmul_asm_impl.o preprocess_l_asm.o matmul_vanilla.o preprocess_vanilla.o misc.o +clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -DBAREMETAL=1 -DIMPL=intr -c -o main_intr.o main.c +clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -DBAREMETAL=1 -c -o matmul_intr.o matmul_intr.c +clang -O2 -Wall -std=c99 --target=aarch64-none-elf -march=armv9.4-a+sme2 -fno-exceptions -fno-rtti -mno-unaligned-access -nostartfiles -lcrt0-semihost -lsemihost -nostartfiles -lcrt0-semihost -lsemihost -Wl,--defsym=__boot_flash=0x80000000 -Wl,--defsym=__flash=0x80001000 -Wl,--defsym=__ram=0x81000000 -T picolibc.ld -o sme2_matmul_intr main_intr.o matmul_intr.o matmul_vanilla.o preprocess_vanilla.o misc.o llvm-objdump --demangle -d hello > hello.lst llvm-objdump --demangle -d sme2_check > sme2_check.lst llvm-objdump --demangle -d sme2_matmul_asm > sme2_matmul_asm.lst llvm-objdump --demangle -d sme2_matmul_intr > sme2_matmul_intr.lst -``` + {{< /tab >}} +{{< /tabpane >}} - Executed within the docker 
``armswdev/sme2-learning-path:sme2-environment-v1`` environment, the ``make`` command performs the following tasks: +The `make` command performs the following tasks: +- It builds four executables: `hello`, `sme2_check`, `sme2_matmul_asm`, and + `sme2_matmul_intr`. +- It creates the assembly listings for the four executables: `hello.lst`, + `sme2_check.lst`, `sme2_matmul_asm.lst`, and `sme2_matmul_intr.lst`. -- It builds four executables: ``hello``, ``sme2_check``, ``sme2_matmul_asm``, and ``sme2_matmul_intr``. -- It creates the assembly listings for the four executables: ``hello.lst``, ``sme2_check.lst``, ``sme2_matmul_asm.lst``, and ``sme2_matmul_intr.lst``. + These targets compile and link all example programs and generate disassembly listings for inspection. -{{% notice Note %}} -At any point, you can clean the directory of all the files that have been built by invoking ``make clean``: +At any point, you can clean the directory of all the files that have been built +by invoking `make clean`: -```BASH -$ docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v1 make clean -``` -{{% /notice %}} +{{< tabpane code=true >}} + {{< tab header="Native SME2 support" language="bash" output_lines="2">}} + make clean + rm hello sme2_check sme2_matmul_asm sme2_matmul_intr hello.lst sme2_check.lst sme2_matmul_asm.lst sme2_matmul_intr.lst *.o + {{< /tab >}} -## Basic checks + {{< tab header="Emulated SME2 support" language="bash" output_lines="2">}} + docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 make clean + rm hello sme2_check sme2_matmul_asm sme2_matmul_intr hello.lst sme2_check.lst sme2_matmul_asm.lst sme2_matmul_intr.lst *.o + {{< /tab >}} +{{< /tabpane >}} -The very first program that you should run is the famous "Hello, world !" example that -will tell you if your environment is set up correctly. 
+## Run a Hello World program -The source code is contained in ``hello.c`` and looks like this: +The very first program that you should run is the famous "Hello, world!" example +that will tell you if your environment is set up correctly. -```C +The source code is contained in `hello.c` and looks like this: + +```C { line_numbers="true" } #include #include @@ -64,73 +98,75 @@ int main(int argc, char *argv[]) { } ``` -Run the FVP simulation of the ``hello`` program with: +Run the `hello` program with: -```BASH { output_lines="2-4" } -docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v1 ./run-fvp.sh hello -Hello, world ! +{{< tabpane code=true >}} + {{< tab header="Native SME2 support" language="bash" output_lines="2">}} + ./hello + Hello, world ! + {{< /tab >}} -Info: /OSCI/SystemC: Simulation stopped by user. -``` + {{< tab header="Emulated SME2 support" language="bash" output_lines="2-4">}} + docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 ./run-fvp.sh hello + Hello, world ! -The important line here is "``Hello, world !``" as it demonstrates that the generic code -can be compiled and run on the FVP. + Info: /OSCI/SystemC: Simulation stopped by user. + {{< /tab >}} +{{< /tabpane >}} -## SME2 checks +In the emulated case, you may see that the FVP prints out extra lines. The key confirmation is the presence of "Hello, world!" in the output. It demonstrates that the generic code can be compiled and executed. -You will now run the ``sme2_check`` program, which checks that SME2 works as -expected, in both the compiler and in the FVP. +## Check SME2 availability -The source code is found in -``sme2_check.c``: +You will now run the `sme2_check` program, which verifies that SME2 works as expected. This checks both the compiler and the CPU (or the emulated CPU) are properly supporting SME2. -```C -#include -#include +The `sme2_check` program verifies that SME2 is available and working. 
It confirms: + +* The compiler supports SME2 (via __ARM_FEATURE_SME2) +* The system or emulator reports SME2 capability + +* Streaming mode works as expected + +The source code is found in `sme2_check.c`: + +```C { line_numbers="true" } #include "misc.h" +#include +#include +#include + #ifdef __ARM_FEATURE_SME2 #include #else #error __ARM_FEATURE_SME2 is not defined #endif -#define get_cpu_ftr(regId, feat, msb, lsb) \ - ({ \ - unsigned long __val; \ - __asm__("mrs %0, " #regId : "=r"(__val)); \ - printf("%-20s: 0x%016lx\n", #regId, __val); \ - printf(" - %-10s: 0x%08lx\n", #feat, \ - (__val >> lsb) & ((1 << (msb - lsb)) - 1)); \ - }) +__arm_locally_streaming void function_in_streaming_mode() { + printf("In streaming_mode: %d, SVL: %" PRIu64 " bits\n", + __arm_in_streaming_mode(), svcntb() * 8); +} int main(int argc, char *argv[]) { - get_cpu_ftr(ID_AA64PFR0_EL1, SVE, 35, 32); - get_cpu_ftr(ID_AA64PFR1_EL1, SME, 27, 24); - int n = 0; -#ifdef __ARM_FEATURE_SME2 - setup_sme(); - n = svcntb() * 8; +#if BAREMETAL == 1 + setup_sme_baremetal(); #endif - if (n) { - printf("SVE is available with length %d\n", n); - } else { - printf("SVE is unavailable.\n"); + + if (!display_cpu_features()) { + printf("SME2 is not supported on this CPU.\n"); exit(EXIT_FAILURE); } - printf("Checking has_sme: %d\n", __arm_has_sme()); - printf("Checking in_streaming_mode: %d\n", __arm_in_streaming_mode()); + printf("Checking initial in_streaming_mode: %d\n", + __arm_in_streaming_mode()); - printf("Starting streaming mode...\n"); - __asm__("smstart"); + printf("Switching to streaming mode...\n"); - printf("Checking in_streaming_mode: %d\n", __arm_in_streaming_mode()); + function_in_streaming_mode(); - printf("Stopping streaming mode...\n"); - __asm__("smstop"); + printf("Switching back from streaming mode...\n"); printf("Checking in_streaming_mode: %d\n", __arm_in_streaming_mode()); @@ -138,36 +174,80 @@ int main(int argc, char *argv[]) { } ``` -The ``sme2_check`` program displays the SVE field 
of the ``ID_AA64PFR0_EL1`` system register and the SME field of the ``ID_AA64PFR1_EL1`` system register. It will then check if SVE and SME are available, then finally will switch into streaming mode and back from streaming mode. - -The ``__ARM_FEATURE_SME2`` macro is provided by the compiler when it targets an SME-capable target, which is specified with the ``-march=armv9.4-a+sme2`` command line option to ``clang`` in -file ``Makefile``. - -The ``arm_sme.h`` include file is part of the Arm C Library -Extension ([ACLE](https://arm-software.github.io/acle/main/)). - -The ACLE provides types and function declarations to enable C/C++ programmers to make the best possible use of the Arm architecture. You can use the SME-related part of the library, but it does also provide support for Neon or other Arm architectural extensions. - -```BASH -docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v1 ./run-fvp.sh sme2_check -``` - -The output should be similar to: - -```TXT -ID_AA64PFR0_EL1 : 0x1101101131111112 - - SVE : 0x00000001 -ID_AA64PFR1_EL1 : 0x0000101002000001 - - SME : 0x00000002 -SVE is available with length 512 -Checking has_sme: 1 -Checking in_streaming_mode: 0 -Starting streaming mode... -Checking in_streaming_mode: 1 -Stopping streaming mode... -Checking in_streaming_mode: 0 - -Info: /OSCI/SystemC: Simulation stopped by user. -``` - -You have now checked that the code can be compiled and run with full SME2 support, and are all set to move to the next section. +The ``__ARM_FEATURE_SME2`` macro (line 7) is provided by the compiler when it +targets an SME-capable target, which is specified with the ``+sme2`` +architectural feature in ``-march=armv9.4-a+sme2`` (emulated environment) or +``-march=native+sme2`` command line option to ``clang`` in file ``Makefile``. + +The ``arm_sme.h`` file included at line 8 is part of the Arm C Library Extension +([ACLE](https://arm-software.github.io/acle/main/)). 
The ACLE provides types and +function declarations to enable C/C++ programmers to make the best possible use +of the Arm architecture. You can use the SME-related part of the library, but it +does also provide support for Neon or other Arm architectural extensions. + +In order to run in a baremetal environment (like the one being used in the +emulated SME2 support), where no operating system has done the setup of the +processor for the user-space programs, an additional step is required to turn +SME2 on. This is the purpose of the ``setup_sme_baremetal()`` call at line 21. +In environments where SME2 is natively supported, nothing needs to be done, +which is why the execution of this function is conditioned by the ``BAREMETAL`` +macro. ``BAREMETAL`` is set to 1 in the ``Makefile`` when the FVP is targeted, +and set to 0 otherwise. The body of the ``setup_sme_baremetal`` function is +defined in ``misc.c``. + +The ``sme2_check`` program then displays whether SVE, SME and SME2 are supported +at line 24. The checking of SVE, SME and SME2 is done differently depending on +``BAREMETAL``. This platform-specific behaviour is abstracted by the +``display_cpu_features()`` function: +- In baremetal mode, our program has access to system registers and can inspect them for SME2 support. The program will print the SVE field of the ``ID_AA64PFR0_EL1`` system register and the SME field of the ``ID_AA64PFR1_EL1`` system register. +- In non-baremetal mode, on an Apple platform the program needs to use a higher + level API call. + +The body of the ``display_cpu_features`` function is defined in ``misc.c``. + +If SME2 is not available, ``sme2_check`` will emit a diagnostic message (line +25) and exit (line 26). 
+ +``sme2_check`` will then print the initial streaming mode state at line 29 +(which is expected to be 0), then will switch to streaming mode (line 34) when +invoking function ``function_in_streaming_mode`` to show the Streaming Vector +Length (a.k.a ``SVL``), and then switch back to non streaming mode (when +returning from ``function_in_streaming_mode``). Function +``function_in_streaming_mode`` is defined at line 13. Note that it has been +annotated with the ``__arm_locally_streaming`` attribute, which instructs the +compiler to automatically switch to streaming mode when invoking this function. +Streaming mode will be discussed in more depth in the next section. + +Look for the following confirmation messages in the output: + +{{< tabpane code=true >}} + {{< tab header="Native SME2 support" language="bash" output_lines="2-9">}} + ./sme2_check + HAS_SVE: 0 + HAS_SME: 1 + HAS_SME2: 1 + Checking initial in_streaming_mode: 0 + Switching to streaming mode... + In streaming_mode: 1, SVL: 512 bits + Switching back from streaming mode... + Checking in_streaming_mode: 0 + {{< /tab >}} + + {{< tab header="Emulated SME2 support" language="bash" output_lines="2-12">}} + docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 ./run-fvp.sh sme2_check + ID_AA64PFR0_EL1 : 0x1101101131111112 + - SVE : 0x00000001 + ID_AA64PFR1_EL1 : 0x0000101002000001 + - SME : 0x00000002 + Checking has_sme: 1 + Checking initial in_streaming_mode: 0 + Switching to streaming mode... + In streaming_mode: 1, SVL: 512 bits + Switching back from streaming mode... + Checking in_streaming_mode: 0 + + Info: /OSCI/SystemC: Simulation stopped by user. + {{< /tab >}} +{{< /tabpane >}} + +You've now confirmed that your environment can compile and run SME2 code, and that SME2 features like streaming mode are working correctly. You're ready to continue to the next section and start working with SME2 in practice. 
diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/3-streaming-mode.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/3-streaming-mode.md new file mode 100644 index 0000000000..ac84bd8eef --- /dev/null +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/3-streaming-mode.md @@ -0,0 +1,65 @@ +--- +title: Streaming mode and ZA state in SME +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Understanding streaming mode + +Programs can switch between streaming and non-streaming mode during execution. When one streaming-mode function calls another, parts of the processor state - such as ZA storage - might need to be saved and restored. This behavior is governed by the Arm C Language Extensions (ACLE) and is managed by the compiler. + +To use streaming mode, you simply annotate the relevant functions with the appropriate keywords. The compiler handles the low-level mechanics of streaming mode management, removing the need for error-prone, manual work. + +{{% notice Note %}} +For more information, see the [Introduction to streaming and non-streaming mode](https://arm-software.github.io/acle/main/acle.html#controlling-the-use-of-streaming-mode). The rest of this section references content from the ACLE specification. +{{% /notice %}} + +## Streaming mode behavior and compiler handling + +Streaming mode changes how the processor and compiler manage execution context. Here's how it works: + +* The AArch64 architecture defines a concept called *streaming mode*, controlled +by a processor state bit `PSTATE.SM`. + +* At any given point in time, the processor is either in streaming mode (`PSTATE.SM == 1`) or in non-streaming mode (`PSTATE.SM == 0`). + +* To enter streaming mode, there is the instruction `SMSTART`, and to return to non-streaming mode, the instruction is `SMSTOP`. 
+ +* Streaming mode affects C and C++ code in the following ways: + + - It can change the length of SVE vectors and predicates. The length of an SVE vector in streaming mode is called the *Streaming Vector Length* (SVL), which might differ from the non-streaming vector length. See [Effect of streaming mode on VL](https://arm-software.github.io/acle/main/acle.html#effect-of-streaming-mode-on-vl) for further information. + - Some instructions, and their associated ACLE intrinsics, can only be executed in streaming mode. These are called *streaming intrinsics*. + - Other instructions are restricted to non-streaming mode. These are called *non-streaming intrinsics*. + +The ACLE specification extends the C and C++ abstract machine model to include streaming mode. At any given time, the abstract machine is either in streaming or non-streaming mode. + +This distinction between abstract machine mode and processor mode is mostly a specification detail. At runtime, the processor’s mode may differ from the abstract machine’s mode - as long as the observable program behavior remains consistent (as per the "as-if" rule). + +{{% notice Note %}} +One practical consequence of this is that C and C++ code does not specify the exact placement of `SMSTART` and `SMSTOP` instructions; the source code simply places limits on where such instructions go. For example, when stepping through a program in a debugger, the processor mode might sometimes be different from the one implied by the source code. +{{% /notice %}} + +ACLE provides attributes that specify whether the abstract machine executes statements: + +- In non-streaming mode, in which case they are called *non-streaming statements*. +- In streaming mode, in which case they are called *streaming statements*. +- In either mode, in which case they are called *streaming-compatible statements*. + +## Working with ZA state + +SME also introduces a matrix storage area called ZA, sized `SVL.B` × `SVL.B` bytes. 
It +also provides a processor state bit called `PSTATE.ZA` to control whether ZA +is enabled. + +In C and C++, ZA usage is specified at the function level: a function either uses ZA or it doesn't. That is, a function either has ZA state or it does not. + +Functions that use ZA can either: + +- Share the caller’s ZA state +- Allocate a new ZA state for themselves + +When new state is needed, the compiler is responsible for preserving the caller’s state using a *lazy saving* scheme. For more information, see the [AAPCS64 section of the ACLE spec](https://arm-software.github.io/acle/main/acle.html#AAPCS64). + + \ No newline at end of file diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/3-vanilla-matmul.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/3-vanilla-matmul.md deleted file mode 100644 index e78e399bbc..0000000000 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/3-vanilla-matmul.md +++ /dev/null @@ -1,80 +0,0 @@ ---- -title: Vanilla matrix multiplication -weight: 5 - -### FIXED, DO NOT MODIFY -layout: learningpathall ---- - -## Vanilla matrix multiplication - -In this section, you will learn about an example of standard matrix multiplication in C. - -### Algorithm description - -The vanilla matrix multiplication operation takes two input matrices, A [Ar -rows x Ac columns] and B [Br rows x Bc columns], to produce an output matrix C -[Cr rows x Cc columns]. The operation consists of iterating on each row of A -and each column of B, multiplying each element of the A row with its corresponding -element in the B column then summing all these products, as Figure 2 shows. - -![example image alt-text#center](matmul.png "Figure 2: Standard Matrix Multiplication.") - -This implies that the A, B, and C matrices have some constraints on their -dimensions: - -- A's number of columns must match B's number of rows: Ac == Br. -- C has the dimensions Cr == Ar and Cc == Bc. 
- -You can learn more about matrix multiplication, including its history, -properties and use, by reading this [Wikipedia -article on Matrix Multiplication](https://en.wikipedia.org/wiki/Matrix_multiplication). - -In this Learning Path, you will see the following variable names: - -- ``matLeft`` corresponds to the left-hand side argument of the matrix - multiplication. -- ``matRight``corresponds to the right-hand side of the matrix multiplication. -- ``M`` is ``matLeft`` number of rows. -- ``K`` is ``matLeft`` number of columns (and ``matRight`` number of rows). -- ``N`` is ``matRight`` number of columns. -- ``matResult``corresponds to the result of the matrix multiplication, with - ``M`` rows and ``N`` columns. - -### C implementation - -A literal implementation of the textbook matrix multiplication algorithm, as -described above, can be found in file ``matmul_vanilla.c``: - -```C -void matmul(uint64_t M, uint64_t K, uint64_t N, - const float *restrict matLeft, const float *restrict matRight, - float *restrict matResult) { - for (uint64_t m = 0; m < M; m++) { - for (uint64_t n = 0; n < N; n++) { - - float acc = 0.0; - - for (uint64_t k = 0; k < K; k++) - acc += matLeft[m * K + k] * matRight[k * N + n]; - - matResult[m * N + n] = acc; - } - } -} -``` - -In this Learning Path, the matrices are laid out in memory as contiguous -sequences of elements, in [Row-Major -Order](https://en.wikipedia.org/wiki/Row-_and_column-major_order). The -``matmul`` function performs the algorithm described above. - -The pointers to ``matLeft``, ``matRight`` and ``matResult`` have been annotated as -``restrict``, which informs the compiler that the memory areas designated by -those pointers do not alias. This means that they do not overlap in any way, so that the -compiler does not need to insert extra instructions to deal with these cases. -The pointers to ``matLeft`` and ``matRight`` are marked as ``const`` as neither of these two matrices are modified by ``matmul``. 
- -You now have a reference standard matrix multiplication function. You will use it later -on in this Learning Path to ensure that the assembly version and the intrinsics -version of the multiplication algorithm do not contain errors. \ No newline at end of file diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/4-outer-product.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/4-outer-product.md deleted file mode 100644 index ea55642846..0000000000 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/4-outer-product.md +++ /dev/null @@ -1,108 +0,0 @@ ---- -title: Outer product -weight: 6 - -### FIXED, DO NOT MODIFY -layout: learningpathall ---- -## Matrix multiplication with the outer product - -In this section, you will learn how you can use the outer product with the SME engine to improve matrix multiplication. - -In this standard matrix multiplication example, the core of the computation can be represented as: - -```C - acc += matLeft[m * K + k] * matRight[k * N + n]; -``` - -Which translates to 1 multiply-accumulate, which is also known as ``macc``, for two loads (``matLeft[m * K + k]`` -and ``matRight[k *N + n]``). It therefore has a 1:2 ``macc`` to ``load`` ratio. - -From a memory system perspective, this is not effective, especially since this -computation is done within a triple-nested loop, repeatedly loading data from -memory. - -To exacerbate matters, large matrices might not fit in cache. In order to improve the matrix multiplication efficiency, the goal is to increase the ``macc`` to ``load`` ratio, which means to increase the number of multiply-accumulate operations per load. 
- -Figure 3 below shows how the matrix multiplication of ``matLeft`` (3 rows, 2 -columns) by ``matRight`` (2 rows, 3 columns) can be decomposed as the sum of the -outer products: - -![example image alt-text#center](outer_product.png "Figure 3: Outer Product-based Matrix Multiplication.") - -The SME engine builds on the -[Outer Product](https://en.wikipedia.org/wiki/Outer_product) as matrix -multiplication can be expressed as the -[sum of column-by-row outer products](https://en.wikipedia.org/wiki/Outer_product#Connection_with_the_matrix_product). - - -## About transposition - -From the previous page, you will recall that matrices are laid out in row-major -order. This means that loading row-data from memory is efficient as the memory -system operates efficiently with contiguous data. An example of this is where caches are loaded row by row, and data prefetching is simple - just load the data from ``current address + sizeof(data)``. This is not the case for loading column-data from memory though, as it requires more work from the memory system. - -In order to further improve the effectiveness of the matrix multiplication, it -is therefore desirable to change the layout in memory of the left-hand side -matrix, which is called ``matLeft`` in the code examples in this Learning Path. -The improved layout would ensure that elements from the same column are located -next to each other in memory. This is essentially a matrix transposition, -which changes ``matLeft`` from row-major order to column-major order. - -{{% notice Important %}} -It is important to note here that this reorganizes the layout of the matrix in -memory in order for the algorithm implementation to be more efficient. The -transposition affects only the memory layout. ``matLeft`` is transformed to -column-major order, but from a mathematical perspective, ``matleft`` is -*not* transposed. 
-{{% /notice %}} - -### Transposition in the real world - -In the same way that trees don't reach the sky, the SME engine has physical implementation limits. It operates with tiles in the ZA storage. Tiles are 2D portions of the matrices being processed. SME has dedicated instructions to load data to, and store data from tiles efficiently, as well as instructions to operate with and on tiles, for example the [fmopa](https://developer.arm.com/documentation/ddi0602/latest/SME-Instructions/FMOPA--non-widening---Floating-point-outer-product-and-accumulate-?lang=en) -instruction which takes two vectors as inputs and accumulate all the outer -products to a 2D tile. The tile in ZA storage is what allows SME to increase the -``macc`` to ``load`` ratio, as all the tile elements are loaded to the tile, to -be used with the SME outer product instructions. - -Taking into account that the ZA storage is finite, the desirable transposition -of the ``matLeft`` matrix that was discussed in the previous section needs to -adapted to the tile dimensions, so that a tile is easy to access. The -``matLeft`` preprocessing has thus some aspects of transpositions, but takes -into account the tiling as well and is referred to in the code as -``preprocess``. - -Here is at the algorithmic level what ``preprocess_l`` does in practice: - -```C -void preprocess_l(uint64_t nbr, uint64_t nbc, uint64_t SVL, - const float *restrict a, float *restrict a_mod) { - - // For all tiles of SVL x SVL data - for (uint64_t By = 0; By < nbr; By += SVL) { - for (uint64_t Bx = 0; Bx < nbc; Bx += SVL) { - // For this tile - const uint64_t dest = By * nbc + Bx * SVL; - for (uint64_t j = 0; j < SVL; j++) { - for (uint64_t i = 0; i < SVL && (Bx + i) < nbc; i++) { - if (By + j < nbr) { - a_mod[dest + i * SVL + j] = a[(By + j) * nbc + Bx + i]; - } else { - // These elements are outside of matrix a, so zero them. 
- a_mod[dest + i * SVL + j] = 0.0; - } - } - } - } - } -} -``` - -``preprocess_l`` will be used to check the assembly and intrinsic versions of -the matrix multiplication perform the preprocessing step correctly. This code is -located in file ``preprocess_vanilla.c``. - -{{% notice Note %}} -In real-world applications, it might be possible to arrange for ``matLeft`` to -be stored in column-major order, eliminating the need for transposition, and making the preprocessing step unnecessary. Matrix processing frameworks and libraries often have attributes within the matrix object to track if it is row- or column-major order, and whether it has been transposed to avoid unnecessary computations. -{{% /notice %}} diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/4-vanilla-matmul.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/4-vanilla-matmul.md new file mode 100644 index 0000000000..f8524ebeae --- /dev/null +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/4-vanilla-matmul.md @@ -0,0 +1,76 @@ +--- +title: Vanilla matrix multiplication +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this section, you'll implement a basic matrix multiplication algorithm in C using row-major memory layout. This version acts as a reference implementation that you'll use to validate the correctness of optimized versions later in the Learning Path. + +## Vanilla matrix multiplication algorithm + +The vanilla matrix multiplication operation takes two input matrices: + +* Matrix A [`Ar` rows x `Ac` columns] +* Matrix B [`Br` rows x `Bc` columns] + +It produces an output matrix C [`Cr` rows x `Cc` columns]. + +The algorithm works by iterating over each row of A and each column of B. It multiplies the corresponding elements and sums the products to generate each element of matrix C, as shown in the figure below. 
+ +The diagram below shows how matrix C is computed by iterating over rows of A and columns of B: + +![Standard Matrix Multiplication alt-text#center](matmul.png "Figure 2: Standard matrix multiplication.") + +This implies that the A, B, and C matrices have some constraints on their +dimensions: + +- The number of columns in A must equal the number of rows in B: `Ac == Br`. +- Matrix C must have the dimensions `Cr == Ar` and `Cc == Bc`. + +For more information about matrix multiplication, including its history, +properties and use, see this [Wikipedia article on Matrix Multiplication](https://en.wikipedia.org/wiki/Matrix_multiplication). + +## Variable mappings in this Learning Path + +The following variable names are used throughout the Learning Path to represent matrix dimensions and operands: + +- `matLeft` corresponds to the left-hand side argument of the matrix multiplication. +- `matRight` corresponds to the right-hand side of the matrix multiplication. +- `M` is the number of rows in `matLeft`. +- `K` is the number of columns in `matLeft` (and the number of rows in `matRight`). +- `N` is the number of columns in `matRight`. +- `matResult` corresponds to the result of the matrix multiplication, with `M` rows and `N` columns. + +## C implementation + +Here is the full reference implementation from `matmul_vanilla.c`: + +```C { line_numbers="true" } +void matmul(uint64_t M, uint64_t K, uint64_t N, + const float *restrict matLeft, const float *restrict matRight, + float *restrict matResult) { + for (uint64_t m = 0; m < M; m++) { + for (uint64_t n = 0; n < N; n++) { + + float acc = 0.0; + + for (uint64_t k = 0; k < K; k++) + acc += matLeft[m * K + k] * matRight[k * N + n]; + + matResult[m * N + n] = acc; + } + } +} +``` + +## Memory layout and pointer annotations + +In this Learning Path, the matrices are laid out in memory as contiguous sequences of elements, in [row-major order](https://en.wikipedia.org/wiki/Row-_and_column-major_order). 
The `matmul` function performs the algorithm described above. + +The pointers to `matLeft`, `matRight` and `matResult` have been annotated as `restrict`, which informs the compiler that the memory areas designated by those pointers do not alias. This means that they do not overlap in any way, so that the compiler does not need to insert extra instructions to deal with these cases. The pointers to `matLeft` and `matRight` are marked as `const` as neither of these two matrices are modified by `matmul`. + +This function gives you a working baseline for matrix multiplication. You'll use it later in the Learning Path to verify the correctness of optimized implementations using SME2 intrinsics and assembly. \ No newline at end of file diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/5-outer-product.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/5-outer-product.md new file mode 100644 index 0000000000..1e28558f2d --- /dev/null +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/5-outer-product.md @@ -0,0 +1,96 @@ +--- +title: Outer product +weight: 7 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this section, you'll learn how to improve matrix multiplication performance using the SME engine and outer product operations. + +This approach increases the number of multiply-accumulate (MACC) operations per memory load, reducing bandwidth pressure and improving overall throughput. + +## Increase MACC efficiency using outer products + +In the vanilla implementation, the core multiply-accumulate step looks like this: + +```C + acc += matLeft[m * K + k] * matRight[k * N + n]; +``` + +This translates to one multiply-accumulate operation, known as `macc`, for two loads (`matLeft[m * K + k]` and `matRight[k * N + n]`). 
It therefore has a 1:2 `macc` to `load` ratio - one multiply-accumulate operation (MACC) for every two memory loads per iteration - which is inefficient.
This is essentially a matrix transposition, which changes `matLeft` from +row-major order to column-major order. + +{{% notice Important %}} +This transformation affects only the memory layout. From a mathematical perspective, `matLeft` is not transposed. It is reorganized for better data locality. +{{% /notice %}} + +### Transposition in practice + +The SME engine operates on tiles - 2D blocks of data stored in the ZA storage. SME provides dedicated instructions to load, store, and compute on tiles efficiently. + +For example, the [FMOPA](https://developer.arm.com/documentation/ddi0602/latest/SME-Instructions/FMOPA--non-widening---Floating-point-outer-product-and-accumulate-?lang=en) instruction takes two vectors as input and accumulates their outer product into a tile. The tile in ZA storage allows SME to increase the `macc` to`load` ratio by loading all the tile elements to be used with the SME outer +product instructions. + +But since ZA storage is finite, you need to you need to preprocess `matLeft` to match the tile dimensions - this includes transposing portions of the matrix and padding where needed. + +### Preprocessing with preprocess_l + +The following function shows how `preprocess_l` transforms the matrix at the algorithmic level: + +```C { line_numbers = "true" } +void preprocess_l(uint64_t nbr, uint64_t nbc, uint64_t SVL, + const float *restrict a, float *restrict a_mod) { + + // For all tiles of SVL x SVL data + for (uint64_t By = 0; By < nbr; By += SVL) { + for (uint64_t Bx = 0; Bx < nbc; Bx += SVL) { + // For this tile + const uint64_t dest = By * nbc + Bx * SVL; + for (uint64_t j = 0; j < SVL; j++) { + for (uint64_t i = 0; i < SVL && (Bx + i) < nbc; i++) { + if (By + j < nbr) { + a_mod[dest + i * SVL + j] = a[(By + j) * nbc + Bx + i]; + } else { + // These elements are outside of matrix a, so zero them. 
+ a_mod[dest + i * SVL + j] = 0.0; + } + } + } + } + } +} +``` + +This routine is defined in `preprocess_vanilla.c.` It's used to ensure the assembly and intrinsics-based matrix multiplication routines work with the expected input format. + +{{% notice Note %}} +In production environments, it might be possible to arrange for `matLeft` to be +stored in column-major order, eliminating the need for transposition and making +the preprocessing step unnecessary. Matrix processing frameworks and libraries +often have attributes within the matrix object to track if it is in row- or +column-major order, and whether it has been transposed to avoid unnecessary +computations. +{{% /notice %}} \ No newline at end of file diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/5-sme2-matmul-asm.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/5-sme2-matmul-asm.md deleted file mode 100644 index 9504eff06d..0000000000 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/5-sme2-matmul-asm.md +++ /dev/null @@ -1,202 +0,0 @@ ---- -title: SME2 assembly matrix multiplication -weight: 7 - -### FIXED, DO NOT MODIFY -layout: learningpathall ---- -## Matrix multiplication with SME2 in assembly - -In this chapter, you will use an SME2-optimized matrix multiplication written -directly in assembly. - -### Description - -This Learning Path reuses the assembly version provided in the [SME Programmer's -Guide](https://developer.arm.com/documentation/109246/0100/matmul-fp32--Single-precision-matrix-by-matrix-multiplication) -where you will find a high-level and an in-depth description of the two steps -performed. - -The assembly versions have been modified so they coexist nicely with -the intrinsic versions. In this Learning Path, the ``preprocess`` function is -defined in ``preprocess_l_asm.S`` and the outer product-based matrix -multiplication is found in ``matmul_asm_impl.S``. 
- -These two functions have been stitched together in ``matmul_asm.c`` with the same prototype as the reference implementation of matrix multiplication, so that a top-level ``matmul_asm`` can -be called from the ``main`` function: - -```C -void matmul_asm(uint64_t M, uint64_t K, uint64_t N, - const float *restrict matLeft, const float *restrict matRight, - float *restrict matLeft_mod, float *restrict matResult) { - __asm volatile("" - : - : - : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", - "p10", "p11", "p12", "p13", "p14", "p15", "z0", "z1", "z2", - "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", - "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", - "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", - "z28", "z29", "z30", "z31"); - - preprocess_l_asm(M, K, matLeft, matLeft_mod); - matmul_asm_impl(M, K, N, matLeft_mod, matRight, matResult); - - __asm volatile("" - : - : - : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", - "p10", "p11", "p12", "p13", "p14", "p15", "z0", "z1", "z2", - "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", - "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", - "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", - "z28", "z29", "z30", "z31"); -} -``` - -Note here the use of the ``__asm`` statement forcing the compiler to save the SVE/SME registers. 
- -The high-level ``matmul_asm`` function is called from ``main.c``: - -```C -#include "matmul.h" -#include "misc.h" - -#include -#include -#include -#include - -#ifndef __ARM_FEATURE_SME2 -#error __ARM_FEATURE_SME2 is not defined -#endif - -#ifndef IMPL -#error matmul implementation selection macro IMPL is not defined -#endif - -#define STRINGIFY_(I) #I -#define STRINGIFY(I) STRINGIFY_(I) -#define FN(M, I) M##I -#define MATMUL(I, M, K, N, mL, mR, mM, m) FN(matmul_, I)(M, K, N, mL, mR, mM, m) - -// Assumptions: -// nbr in matLeft (M): any -// nbc in matLeft, nbr in matRight (K): any K > 2 (for the asm version) -// nbc in matRight (N): any - -int main(int argc, char **argv) { - - /* Size parameters */ - uint64_t M, N, K; - if (argc >= 4) { - M = strtoul(argv[1], NULL, 0); - K = strtoul(argv[2], NULL, 0); - N = strtoul(argv[3], NULL, 0); - } else { - /* Default: 125x35x70 */ - M = 125; - K = 35; - N = 70; - } - - printf("\nSME2 Matrix Multiply fp32 *%s* example with args %lu %lu %lu\n", - STRINGIFY(IMPL), M, K, N); - - setup_sme(); - - const uint64_t SVL = svcntsw(); - - /* Calculate M of transformed matLeft. */ - const uint64_t M_mod = SVL * (M / SVL + (M % SVL != 0 ? 
1 : 0)); - - float *matRight = (float *)malloc(K * N * sizeof(float)); - - float *matLeft = (float *)malloc(M * K * sizeof(float)); - float *matLeft_mod = (float *)malloc(M_mod * K * sizeof(float)); - float *matLeft_mod_ref = (float *)malloc(M_mod * K * sizeof(float)); - - float *matResult = (float *)malloc(M * N * sizeof(float)); - float *matResult_ref = (float *)malloc(M * N * sizeof(float)); - -#ifdef DEBUG - initialize_matrix(matLeft, M * K, LINEAR_INIT); - initialize_matrix(matRight, K * N, LINEAR_INIT); - initialize_matrix(matLeft_mod, M_mod * K, DEAD_INIT); - initialize_matrix(matResult, M * N, DEAD_INIT); - - print_matrix(M, K, matLeft, "matLeft"); - print_matrix(K, N, matRight, "matRight"); -#else - initialize_matrix(matLeft, M * K, RANDOM_INIT); - initialize_matrix(matRight, K * N, RANDOM_INIT); -#endif - - MATMUL(IMPL, M, K, N, matLeft, matRight, matLeft_mod, matResult); - - // Compute the reference values with the vanilla implementations. - matmul(M, K, N, matLeft, matRight, matResult_ref); - preprocess_l(M, K, SVL, matLeft, matLeft_mod_ref); - - unsigned error = compare_matrices(K, M_mod, matLeft_mod_ref, matLeft_mod, - "Matrix preprocessing"); - if (!error) - error = compare_matrices(M, N, matResult_ref, matResult, - "Matrix multiplication"); - - free(matRight); - - free(matLeft); - free(matLeft_mod); - free(matLeft_mod_ref); - - free(matResult); - free(matResult_ref); - - return error ? EXIT_FAILURE : EXIT_SUCCESS; -} -``` - -The same ``main.c`` file is used for the assembly and intrinsic-based versions -of the matrix multiplication. It first sets the ``M``, ``K`` and ``N`` -parameters, to either the arguments supplied on the command line or uses the default -value. - -Depending on the ``M``, ``K``, ``N`` dimension parameters, ``main`` allocates memory for all the matrices and initializes ``matLeft`` and ``matRight`` with random data. The actual matrix multiplication implementation is provided through the ``IMPL`` macro. 
- -It then runs the matrix multiplication from ``IMPL`` and computes the reference values for the preprocessed matrix as well as the result matrix. It then compares the actual values to the reference values and reports errors, if there are any. Finally, all the memory is deallocated before exiting the program with a success or failure return code. - -### Compile and run it - -First, make sure that the ``sme2_matmul_asm`` executable is up-to-date: - -```BASH -docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v1 make sme2_matmul_asm -``` - -Then execute ``sme2_matmul_asm`` on the FVP: - -```BASH -docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v1 ./run-fvp.sh sme2_matmul_asm -``` - -The output should be something similar to: - -```TXT -SME2 Matrix Multiply fp32 *asm* example with args 125 35 70 -Matrix preprocessing: PASS ! -Matrix multiplication: PASS ! - -Info: /OSCI/SystemC: Simulation stopped by user. -``` - -{{% notice Tip %}} -The example above uses the default values for the ``M`` (125), ``K``(25) and ``N``(70) -parameters. You can override this and provide your own values on the command line: - -```BASH -docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v1 ./run-fvp.sh sme2_matmul_asm 7 8 9 -``` - -Here the values ``M=7``, ``K=8`` and ``N=9`` are used instead. 
-{{% /notice %}} \ No newline at end of file diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/6-sme2-matmul-asm.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/6-sme2-matmul-asm.md new file mode 100644 index 0000000000..e41965f946 --- /dev/null +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/6-sme2-matmul-asm.md @@ -0,0 +1,395 @@ +--- +title: SME2 assembly matrix multiplication +weight: 8 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- +## Overview + +In this section, you'll learn how to run an SME2-optimized matrix multiplication implemented directly in assembly. + +This implementation is based on the algorithm described in [Arm's SME Programmer's +Guide](https://developer.arm.com/documentation/109246/0100/matmul-fp32--Single-precision-matrix-by-matrix-multiplication) and has been adapted to integrate with the existing C and intrinsics-based code in this Learning Path. It demonstrates how to apply low-level optimizations for matrix multiplication using the SME2 instruction set, with a focus on preprocessing and outer-product accumulation. + +You'll explore how the assembly implementation works in practice, how it interfaces with C wrappers, and how to verify or benchmark its performance. Whether you're validating correctness or measuring execution speed, this example provides a clear, modular foundation for working with SME2 features in your own codebase. + +By mastering this assembly implementation, you'll gain deeper insight into SME2 execution patterns and how to integrate low-level optimizations in high-performance workloads. 
+ +## About the SME2 assembly implementation + +This Learning Path reuses the assembly version described in [The SME Programmer's +Guide](https://developer.arm.com/documentation/109246/0100/matmul-fp32--Single-precision-matrix-by-matrix-multiplication) +where you will find both high-level concepts and in-depth descriptions of the two key steps: +preprocessing and matrix multiplication. + +The assembly code has been modified to work seamlessly alongside the intrinsic version. + +The key changes include: +* Delegating streaming mode control to the compiler +* Avoiding register `x18`, which is reserved as a platform register + +Here: +- The `preprocess` function is named `preprocess_l_asm` and is defined in `preprocess_l_asm.S` +- The outer product-based matrix multiplication is named `matmul_asm_impl` and is defined in `matmul_asm_impl.S` + +Both functions are declared in `matmul.h`: + +```C +// Matrix preprocessing, in assembly. +void preprocess_l_asm(uint64_t M, uint64_t K, const float *restrict a, + float *restrict a_mod) __arm_streaming __arm_inout("za"); + +// Matrix multiplication (with the *transposed* RHS), in assembly. +void matmul_asm_impl( + uint64_t M, uint64_t K, uint64_t N, const float *restrict matLeft_mod, + const float *restrict matRight, + float *restrict matResult) __arm_streaming __arm_inout("za"); +``` + +Both functions are annotated with the `__arm_streaming` and `__arm_inout("za")` attributes. These indicate that the function expects streaming mode to be active and does not need to save or restore the ZA storage. 
+ +These two functions are stitched together in `matmul_asm.c` with the same prototype as the reference implementation of matrix multiplication, so that a top-level `matmul_asm` can be called from the `main` function: + +```C +__arm_new("za") __arm_locally_streaming void matmul_asm( + uint64_t M, uint64_t K, uint64_t N, const float *restrict matLeft, + const float *restrict matRight, float *restrict matLeft_mod, + float *restrict matResult) { + + preprocess_l_asm(M, K, matLeft, matLeft_mod); + matmul_asm_impl(M, K, N, matLeft_mod, matRight, matResult); +} +``` + +You can see that `matmul_asm` is annotated with two attributes: `__arm_new("za")` and `__arm_locally_streaming`. These attributes instruct the compiler to enable streaming mode and manage ZA state on entry and return. + +## How it integrates with the main function + +The same `main.c` file supports both the intrinsic and assembly implementations. The implementation to use is selected at compile time via the `IMPL` macro. This design reduces duplication and simplifies maintenance. + +## Execution modes + +- On a baremetal platform, the program runs in *verification mode*, where it compares the results of the assembly-based matrix multiplication with the vanilla reference implementation. When targeting a non-baremetal platform, a *benchmarking mode* is also available. 
+ +```C { line_numbers="true" } +#ifndef __ARM_FEATURE_SME2 +#error __ARM_FEATURE_SME2 is not defined +#endif + +#ifndef IMPL +#error matmul implementation selection macro IMPL is not defined +#endif + +#include "matmul.h" +#include "misc.h" + +#include +#include +#include +#include + +#define STRINGIFY_(I) #I +#define STRINGIFY(I) STRINGIFY_(I) +#define FN(M, I) M##I +#define MATMUL(I, M, K, N, mL, mR, mM, m) FN(matmul_, I)(M, K, N, mL, mR, mM, m) + +void usage(const char *prog_name) { +#if BAREMETAL == 1 + printf("Usage: %s \n", prog_name); + printf(" M: number of rows in matLeft (default: 125)\n"); + printf(" K: number of columns in matLeft and matRight (default: 35)\n"); + printf(" N: number of columns in matRight (default: 70)\n"); + printf("Example: matmul 125 35 70\n"); +#else + printf("Depending on the number of arguments, the program can be invoked " + "in two modes:\n"); + printf(" - verification mode. The program will run the assembly or " + "intrinsics implementatation of the matrix multiplication and " + "compare the results with a reference implementation.\n"); + printf(" - benchmarking mode. The program will run the assembly or " + "intrinsics implementation of the matrix multiplication a number of " + "times and print the time taken to perform the operation.\n"); + + printf("\n"); + printf("Verification mode:\n"); + printf(" %s\n", prog_name); + printf(" %s \n", prog_name); + printf("with:\n"); + printf(" - M: number of rows in matLeft (default: 125)\n"); + printf(" - K: number of columns in matLeft and number of rows in matRight " + "(default: 35). Must be > 2 for assembly version of matmul.\n"); + printf(" - N: number of columns in matRight (default: 70)\n"); + printf("Example: %s 67 18 23\n", prog_name); + + printf("\n"); + printf("Benchmarking mode:\n"); + printf(" %s \n", prog_name); + printf(" %s \n", prog_name); + printf("with:\n"); + printf(" - I: number of iterations to perform. 
Must be > 0.\n"); + printf(" - M: number of rows in matLeft (default: 125)\n"); + printf(" - K: number of columns in matLeft and number of rows in matRight " + "(default: 35). Must be > 2 for assembly version of matmul.\n"); + printf(" - N: number of columns in matRight (default: 70)\n"); + printf("Example: %s 1000 67 18 23\n", prog_name); +#endif +} + +int main(int argc, char **argv) { + + /* Matrices size parameters, defaults to 125x35x70. + Assumptions (for assembly handwritten matmul) are: + - number of rows in matLeft (M): any + - number of columns in matLeft and number of rows in matRight (K): any K > 2 + - number of columns in matRight (N): any + */ + uint64_t I = 0; // Number of iterations to perform for benchmarking. + uint64_t M = 125; // Number of rows in matLeft. + uint64_t N = 35; // Number of columns in matRight. + uint64_t K = 70; // Number of columns (resp. rows) in matLeft (resp. matRight). + + switch (argc) { + case 1: + // Verification mode, with default matrix sizes. + break; +#if BAREMETAL == 0 + case 2: + // Benchmarking mode, with default matrix sizes. + I = strtoull(argv[1], NULL, 0); + if (I == 0) { + printf("Error, in benchmarking mode, I must be > 0.\n"); + return EXIT_FAILURE; + } + break; +#endif + case 4: + // Verification mode, with user-defined matrix sizes. + M = strtoul(argv[1], NULL, 0); + K = strtoul(argv[2], NULL, 0); + N = strtoul(argv[3], NULL, 0); + break; +#if BAREMETAL == 0 + case 5: + // Benchmarking mode, with user-defined matrix sizes. + I = strtoull(argv[1], NULL, 0); + if (I == 0) { + printf("Error, in benchmarking mode, I must be > 0.\n"); + return EXIT_FAILURE; + } + M = strtoul(argv[2], NULL, 0); + K = strtoul(argv[3], NULL, 0); + N = strtoul(argv[4], NULL, 0); + break; +#endif + default: + usage(argv[0]); + return EXIT_FAILURE; + } + + // Check assumptions hold. 
+ if (strcmp(STRINGIFY(IMPL), "asm")==0 && K <= 2) { + printf("Error, for assembly implementation of matmul, K must be > 2.\n"); + return EXIT_FAILURE; + } + + // Describe the operation that will be performed. + printf("SME2 Matrix Multiply fp32 *%s* ", STRINGIFY(IMPL)); + if (I != 0) + printf("[benchmarking mode, %" PRIu64 " iterations] ", I); + else + printf("[verification mode] "); + printf("with M=%" PRIu64 ", K=%" PRIu64 ", N=%" PRIu64 "\n", M, K, N); + +#if BAREMETAL == 1 + setup_sme_baremetal(); +#endif + + const uint64_t SVL = svcntsw(); + + // Calculate M of transformed matLeft. + const uint64_t M_mod = SVL * (M / SVL + (M % SVL != 0 ? 1 : 0)); + + // Allocate memory for all matrices. + float *matRight = (float *)malloc(K * N * sizeof(float)); + + float *matLeft = (float *)malloc(M * K * sizeof(float)); + float *matLeft_mod = (float *)malloc(M_mod * K * sizeof(float)); + float *matLeft_mod_ref = (float *)malloc(M_mod * K * sizeof(float)); + + float *matResult = (float *)malloc(M * N * sizeof(float)); + float *matResult_ref = (float *)malloc(M * N * sizeof(float)); + + // Initialize matrices. Input matrices are initialized with random values in + // non-debug mode. In debug mode, all matrices are initialized with linear + // or known values for easier debugging. +#ifdef DEBUG + initialize_matrix(matLeft, M * K, LINEAR_INIT); + initialize_matrix(matRight, K * N, LINEAR_INIT); + initialize_matrix(matLeft_mod, M_mod * K, DEAD_INIT); + initialize_matrix(matResult, M * N, DEAD_INIT); + + print_matrix(M, K, matLeft, "matLeft"); + print_matrix(K, N, matRight, "matRight"); +#else + initialize_matrix(matLeft, M * K, RANDOM_INIT); + initialize_matrix(matRight, K * N, RANDOM_INIT); +#endif + + unsigned error = 0; + if (I == 0) { + // Verification mode. + MATMUL(IMPL, M, K, N, matLeft, matRight, matLeft_mod, matResult); + + // Compute the reference values with the vanilla implementations. 
+ preprocess_l(M, K, SVL, matLeft, matLeft_mod_ref); + matmul(M, K, N, matLeft, matRight, matResult_ref); + + error = compare_matrices(K, M_mod, matLeft_mod_ref, matLeft_mod, + "Matrix preprocessing"); + if (!error) + error = compare_matrices(M, N, matResult_ref, matResult, + "Matrix multiplication"); + } else { +#if BAREMETAL == 0 + // Benchmarking mode. + uint64_t min_time = UINT64_MAX; + uint64_t max_time = 0; + double sum = 0.0; + + // Warm-up runs to ensure the CPU is ready for benchmarking. + for (uint64_t i = 0; i < 10; i++) + matmul(M, K, N, matLeft, matRight, matResult_ref); + + // Measure the time taken by the matrix multiplication. + for (uint64_t i = 0; i < I; i++) { + const uint64_t start_time = get_time_microseconds(); + matmul(M, K, N, matLeft, matRight, matResult_ref); + const uint64_t elapsed_time = get_time_microseconds() - start_time; + + if (elapsed_time < min_time) + min_time = elapsed_time; + if (elapsed_time > max_time) + max_time = elapsed_time; + sum += elapsed_time; + } + printf("Reference implementation: min time = %" PRIu64 " us, " + "max time = %" PRIu64 " us, avg time = %.2f us\n", + min_time, max_time, sum / I); + + // Benchmarking mode (SME2 implementation). + min_time = UINT64_MAX; + max_time = 0; + sum = 0.0; + + // Warm-up runs to ensure the CPU is ready for benchmarking. + for (uint64_t i = 0; i < 10; i++) + MATMUL(IMPL, M, K, N, matLeft, matRight, matLeft_mod, matResult); + + // Measure the time taken by the SME2 matrix multiplication. 
+ for (uint64_t i = 0; i < I; i++) { + const uint64_t start_time = get_time_microseconds(); + MATMUL(IMPL, M, K, N, matLeft, matRight, matLeft_mod, matResult); + const uint64_t elapsed_time = get_time_microseconds() - start_time; + + if (elapsed_time < min_time) + min_time = elapsed_time; + if (elapsed_time > max_time) + max_time = elapsed_time; + sum += elapsed_time; + } + printf("SME2 implementation *%s*: min time = %" PRIu64 " us, " + "max time = %" PRIu64 " us, avg time = %.2f us\n", + STRINGIFY(IMPL), min_time, max_time, sum / I); +#else + printf("Error, can not run in benchmarking mode in baremetal.\n"); + return EXIT_FAILURE; +#endif + } + + // Free allocated memory. + free(matRight); + + free(matLeft); + free(matLeft_mod); + free(matLeft_mod_ref); + + free(matResult); + free(matResult_ref); + + return error ? EXIT_FAILURE : EXIT_SUCCESS; +} +``` + +The same `main.c` file is used for the assembly and intrinsic-based versions of the matrix multiplication. It first sets the `M`, `K` and `N` parameters, to either the arguments supplied on the command line (lines 93-95) or uses the default value (lines 73-75). In non-baremetal mode, it also accepts (lines 82-89 and lines 98-108), as first parameter, an iteration count `I` +used for benchmarking. + +Depending on the `M`, `K`, `N` dimension parameters, `main` allocates memory for all the matrices and initializes `matLeft` and `matRight` with random data. The actual matrix multiplication implementation is provided through the `IMPL` macro. + +In *verification mode*, it then runs the matrix multiplication from `IMPL` (line 167) and computes the reference values for the preprocessed matrix as well as the result matrix (lines 170 and 171). It then compares the actual values to the reference values and reports errors, if there are any (lines 173-177). Finally, all the memory is deallocated (lines 236-243) before exiting the +program with a success or failure return code at line 245. 
+ +In *benchmarking mode*, it will first run the vanilla reference matrix multiplication (resp. assembly- or intrinsic-based matrix multiplication) 10 times without measuring elapsed time to warm-up the CPU. It will then measure the elapsed execution time of the vanilla reference matrix multiplication (resp.assembly- or intrinsic-based matrix multiplication) `I` times and then compute +and report the minimum, maximum and average execution times. + +{{% notice Note %}} +Benchmarking and profiling are not simple tasks. The purpose of this Learning Path is to provide some basic guidelines on the performance improvement that can be obtained with SME2. +{{% /notice %}} + +### Compile and run it + +First, make sure that the `sme2_matmul_asm` executable is up-to-date: + +{{< tabpane code=true >}} + {{< tab header="Native SME2 support" language="bash" output_lines="2">}} + make sme2_matmul_asm + make: `sme2_matmul_asm' is up to date. + {{< /tab >}} + + {{< tab header="Emulated SME2 support" language="bash" output_lines="2">}} + docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 make sme2_matmul_asm + make: 'sme2_matmul_asm' is up to date. + {{< /tab >}} +{{< /tabpane >}} + +Then execute `sme2_matmul_asm` either natively or on the FVP: + +{{< tabpane code=true >}} + {{< tab header="Native SME2 support" language="bash" output_lines="2-4">}} + ./sme2_matmul_asm + SME2 Matrix Multiply fp32 *asm* [verification mode] with M=125, K=70, N=35 + Matrix preprocessing: PASS ! + Matrix multiplication: PASS ! + {{< /tab >}} + + {{< tab header="Emulated SME2 support" language="bash" output_lines="2-6">}} + docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 ./run-fvp.sh sme2_matmul_asm + SME2 Matrix Multiply fp32 *asm* [verification mode] with M=125, K=70, N=35 + Matrix preprocessing: PASS ! + Matrix multiplication: PASS ! + + Info: /OSCI/SystemC: Simulation stopped by user. 
+ {{< /tab >}} +{{< /tabpane >}} + +`sme2_matmul_asm` prints the version of the matrix multiplication performed +(asm or intr) as well as the `M`, `K` and `N` parameters. It also prints +whether the preprocessing and matrix multiplication passed (`PASS`) or failed +(`FAILED`) the comparison the vanilla reference implementation. + +{{% notice Tip %}} +The example above uses the default values for the `M` (125), `K`(70) and `N`(70) +parameters. You can override this and provide your own values on the command line: + +{{< tabpane code=true >}} + {{< tab header="Native SME2 support" language="bash">}} + ./sme2_matmul_asm 7 8 9 + {{< /tab >}} + + {{< tab header="Emulated SME2 support" language="bash">}} + docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 ./run-fvp.sh sme2_matmul_asm 7 8 9 + {{< /tab >}} +{{< /tabpane >}} + +In this example, `M=7`, `K=8`, and `N=9` are used. +{{% /notice %}} \ No newline at end of file diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/6-sme2-matmul-intr.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/6-sme2-matmul-intr.md deleted file mode 100644 index 413f54bf48..0000000000 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/6-sme2-matmul-intr.md +++ /dev/null @@ -1,344 +0,0 @@ ---- -title: SME2 intrinsics matrix multiplication -weight: 8 - -### FIXED, DO NOT MODIFY -layout: learningpathall ---- -## Matrix multiplication with SME2 intrinsics - -In this section, you will write an SME2 optimized matrix multiplication in C using the intrinsics that the compiler provides. - -*Intrinsics*, also know known as *compiler intrinsics* or *intrinsic functions*, are the functions available to application developers that the compiler has an -intimate knowledge of. This enables the compiler to either translate the function to a specific instruction or to perform specific optimizations, or both. 
- -You can learn more about intrinsics in this [Wikipedia -Article on Intrinsic Function](https://en.wikipedia.org/wiki/Intrinsic_function). - -Using intrinsics allows the programmer to use the specific instructions -required to achieve the required performance while writing in C all the typically-required standard code, such as loops. This produces performance close to what can be reached with hand-written assembly whilst being significantly more maintainable and portable. - -All Arm-specific intrinsics are specified in the -[ACLE](https://github.com/ARM-software/acle), which is the Arm C Language Extension. ACLE -is supported by the main compilers, most notably [GCC](https://gcc.gnu.org/) and -[Clang](https://clang.llvm.org). - -## Streaming mode - -On the previous page, assembly language provided the programmer with full access to processor features. However, this comes at the cost of increased complexity and maintenance, particularly when managing large codebases with deeply nested function calls. Additionally, the assembly version operates at a very low level and does not fully handle the SME state. - -In real-world large-scale software, the program moves back and forth from streaming mode, and some streaming mode routines call other streaming mode routines, which means that some state needs to be saved and restored. This includes the ZA storage. This is defined in the ACLE and -supported by the compiler: the programmer *just* has to annotate the function -with some keywords and set up some registers (see function ``setup_sme`` in -``misc.c`` for an example). See [Introduction to streaming and non-streaming mode](https://arm-software.github.io/acle/main/acle.html#controlling-the-use-of-streaming-mode) -for further information. The rest of this section references information from the ACLE. - -The AArch64 architecture defines a concept called *streaming mode*, controlled -by a processor state bit called ``PSTATE.SM``. 
At any given point in time, the -processor is either in streaming mode (``PSTATE.SM==1``) or in non-streaming mode -(``PSTATE.SM==0``). There is an instruction called ``SMSTART`` to enter streaming mode -and an instruction called ``SMSTOP`` to return to non-streaming mode. - -Streaming mode has three main effects on C and C++ code: - -- It can change the length of SVE vectors and predicates: the length of an SVE - vector in streaming mode is called the “streaming vector length” (SVL), which - might be different from the normal non-streaming vector length. See - [Effect of streaming mode on VL](https://arm-software.github.io/acle/main/acle.html#effect-of-streaming-mode-on-vl) - for more details. -- Some instructions can only be executed in streaming mode, which means that - their associated ACLE intrinsics can only be used in streaming mode. These - intrinsics are called “streaming intrinsics”. -- Some other instructions can only be executed in non-streaming mode, which - means that their associated ACLE intrinsics can only be used in non-streaming - mode. These intrinsics are called “non-streaming intrinsics”. - -The C and C++ standards define the behavior of programs in terms of an *abstract -machine*. As an extension, the ACLE specification applies the distinction -between streaming mode and non-streaming mode to this abstract machine: at any -given point in time, the abstract machine is either in streaming mode or in -non-streaming mode. - -This distinction between processor mode and abstract machine mode is mostly just -a specification detail. However, the usual “as if” rule applies: the -processor's actual mode at runtime can be different from the abstract machine's -mode, provided that this does not alter the behavior of the program. One -practical consequence of this is that C and C++ code does not specify the exact -placement of ``SMSTART`` and ``SMSTOP`` instructions; the source code simply places -limits on where such instructions go. 
For example, when stepping through a -program in a debugger, the processor mode might sometimes be different from the -one implied by the source code. - -ACLE provides attributes that specify whether the abstract machine executes statements: - -- In non-streaming mode, in which case they are called *non-streaming statements*. -- In streaming mode, in which case they are called *streaming statements*. -- In either mode, in which case they are called *streaming-compatible statements*. - -SME provides an area of storage called ZA, of size ``SVL.B`` x ``SVL.B`` bytes. It -also provides a processor state bit called ``PSTATE.ZA`` to control whether ZA -is enabled. - -In C and C++ code, access to ZA is controlled at function granularity: a -function either uses ZA or it does not. Another way to say this is that a -function either “has ZA state” or it does not. - -If a function does have ZA state, the function can either share that ZA state -with the function's caller or create new ZA state “from scratch”. In the latter -case, it is the compiler's responsibility to free up ZA so that the function can -use it; see the description of the lazy saving scheme in -[AAPCS64](https://arm-software.github.io/acle/main/acle.html#AAPCS64) for details -about how the compiler does this. 
- -## Implementation - -Here again, a top level function named ``matmul_intr`` in ``matmul_intr.c`` -will be used to stitch together the preprocessing and the multiplication: - -```C "{ line_numbers = true }" -__arm_new("za") __arm_locally_streaming void matmul_intr( - uint64_t M, uint64_t K, uint64_t N, const float *restrict matLeft, - const float *restrict matRight, float *restrict matLeft_mod, - float *restrict matResult) { - uint64_t SVL = svcntsw(); - preprocess_l_intr(M, K, SVL, matLeft, matLeft_mod); - matmul_intr_impl(M, K, N, SVL, matLeft_mod, matRight, matResult); -} -``` - -Note the ``__arm_new("za")`` and ``__arm_locally_streaming`` at line 1 that will -make the compiler save the ZA storage so we can use it without destroying its -content if it was still in use by one of the callers. - -``SVL``, the dimension of the ZA storage, is requested from the underlying -hardware with the ``svcntsw()`` function call at line 5, and passed down to the -``preprocess_l_intr`` and ``matmul_intr_impl`` functions. ``svcntsw()`` is a -function provided be the ACLE library. - -### Matrix preprocessing - -```C "{ line_numbers = true }" -void preprocess_l_intr( - uint64_t M, uint64_t K, uint64_t SVL, const float *restrict a, - float *restrict a_mod) __arm_streaming __arm_inout("za") { - const uint64_t M_mod = SVL * (M / SVL + (M % SVL != 0 ? 1 : 0)); - - // The outer loop, iterating over rows (M dimension) - for (uint64_t row = 0; row < M; row += SVL) { - - svbool_t pMDim = svwhilelt_b32(row, M); - - // The inner loop, iterating on columns (K dimension). 
- for (uint64_t col = 0; col < K; col += 2 * SVL) { - - svcount_t pKDim = svwhilelt_c32(col, K, 2); - - // Load-as-rows - for (uint64_t trow = 0; trow < SVL; trow += 4) { - svcount_t p0 = svpsel_lane_c32(pKDim, pMDim, trow + 0); - svcount_t p1 = svpsel_lane_c32(pKDim, pMDim, trow + 1); - svcount_t p2 = svpsel_lane_c32(pKDim, pMDim, trow + 2); - svcount_t p3 = svpsel_lane_c32(pKDim, pMDim, trow + 3); - - const uint64_t tile_UL_corner = (row + trow) * K + col; - svfloat32x2_t zp0 = svld1_x2(p0, &a[tile_UL_corner + 0 * K]); - svfloat32x2_t zp1 = svld1_x2(p1, &a[tile_UL_corner + 1 * K]); - svfloat32x2_t zp2 = svld1_x2(p2, &a[tile_UL_corner + 2 * K]); - svfloat32x2_t zp3 = svld1_x2(p3, &a[tile_UL_corner + 3 * K]); - - svfloat32x4_t zq0 = svcreate4(svget2(zp0, 0), svget2(zp1, 0), - svget2(zp2, 0), svget2(zp3, 0)); - svfloat32x4_t zq1 = svcreate4(svget2(zp0, 1), svget2(zp1, 1), - svget2(zp2, 1), svget2(zp3, 1)); - svwrite_hor_za32_f32_vg4( - /* tile: */ 0, /* slice: */ trow, zq0); - svwrite_hor_za32_f32_vg4( - /* tile: */ 1, /* slice: */ trow, zq1); - } - - // Read-as-columns and store - const uint64_t dest_0 = row * K + col * SVL; - const uint64_t dest_1 = dest_0 + SVL * SVL; - for (uint64_t tcol = 0; tcol < SVL; tcol += 4) { - svcount_t p0 = svwhilelt_c32(dest_0 + tcol * SVL, K * M_mod, 4); - svcount_t p1 = svwhilelt_c32(dest_1 + tcol * SVL, K * M_mod, 4); - svfloat32x4_t zq0 = - svread_ver_za32_f32_vg4(/* tile: */ 0, /* slice: */ tcol); - svfloat32x4_t zq1 = - svread_ver_za32_f32_vg4(/* tile: */ 1, /* slice: */ tcol); - svst1(p0, &a_mod[dest_0 + tcol * SVL], zq0); - svst1(p1, &a_mod[dest_1 + tcol * SVL], zq1); - } - } - } -} -``` - -Note that ``preprocess_l_intr`` has been annotated at line 3 with: - -- ``__arm_streaming``, because this function is using streaming instructions, - -- ``__arm_inout("za")``, because ``preprocess_l_intr`` reuses the ZA storage - from its caller. 
- -The matrix preprocessing is performed in a double nested loop, over the ``M`` -(line 7) and ``K`` (line 12) dimensions of the input matrix ``a``. Both loops -have an ``SVL`` step increment, which corresponds to the horizontal and vertical -dimensions of the ZA storage that will be used. The dimensions of ``a`` may not -be perfect multiples of ``SVL`` though... which is why the predicates ``pMDim`` -(line 9) and ``pKDim`` (line 14) are computed in order to know which rows (respectively -columns) are valid. - -The core of ``preprocess_l_intr`` is made of two parts: - -- Lines 17 - 37: load matrix tile as rows. In this part, loop unrolling has been - used at 2 different levels. At the lowest level, 4 rows are loaded at a time - (lines 24-27). But this goes much further because as SME2 has multi-vectors - operations (hence the ``svld1_x2`` intrinsic to load 2 rows in 2 vector - registers), this allows the function to load the consecutive row, which - happens to be the row from the neighboring tile on the right: this means two - tiles are processed at once. At line 29-32, the pairs of vector registers are - rearranged on quads of vector registers so they can be stored horizontally in - the two tiles' ZA storage at lines 33-36 with the ``svwrite_hor_za32_f32_vg4`` - intrinsic. Of course, as the input matrix may not have dimensions that are - perfect multiples of ``SVL``, the ``p0``, ``p1``, ``p2`` and ``p3`` predicates - are computed with the ``svpsel_lane_c32`` intrinsic (lines 18-21) so that - elements outside of the input matrix are set to 0 when they are loaded at - lines 24-27. - -- Lines 39 - 51: read the matrix tile as columns and store them. 
Now that the 2 - tiles have been loaded *horizontally*, they will be read *vertically* with the - ``svread_ver_za32_f32_vg4`` intrinsic to quad-registers of vectors (``zq0`` - and ``zq1``) at lines 45-48 and then stored with the ``svst1`` intrinsic to - the relevant location in the destination matrix ``a_mod`` (lines 49-50). Note - again the usage of predicates ``p0`` and ``p1`` (computed at lines 43-44) to - ``svst1`` to prevent writing out of the matrix bounds. - -Using intrinsics simplifies function development significantly, provided one has a good understanding of the SME2 instruction set. -Predicates, which are fundamental to SVE and SME, enable a natural expression of algorithms while handling corner cases efficiently. -Notably, there is no explicit condition checking within the loops to account for rows or columns extending beyond matrix bounds. - -### Outer-product multiplication - -```C "{ line_numbers = true }" -void matmul_intr_impl( - uint64_t M, uint64_t K, uint64_t N, uint64_t SVL, - const float *restrict matLeft_mod, const float *restrict matRight, - float *restrict matResult) __arm_streaming __arm_inout("za") { - - // Build the result matrix tile by tile. - for (uint64_t row = 0; row < M; row += SVL) { - - svbool_t pMDim = svwhilelt_b32(row, M); - - for (uint64_t col = 0; col < N; col += SVL) { - - svbool_t pNDim = svwhilelt_b32(col, N); - - // Outer product + accumulation - svzero_za(); - const uint64_t matLeft_pos = row * K; - const uint64_t matRight_UL_corner = col; - for (uint64_t k = 0; k < K; k++) { - svfloat32_t zL = - svld1(pMDim, &matLeft_mod[matLeft_pos + k * SVL]); - svfloat32_t zR = - svld1(pNDim, &matRight[matRight_UL_corner + k * N]); - svmopa_za32_m(0, pMDim, pNDim, zL, zR); - } - - // Store ZA to matResult. 
- const uint64_t result_tile_UL_corner = row * N + col; - for (uint64_t trow = 0; trow < SVL && row + trow < M; trow += 4) { - svbool_t p0 = svpsel_lane_b32(pNDim, pMDim, row + trow + 0); - svbool_t p1 = svpsel_lane_b32(pNDim, pMDim, row + trow + 1); - svbool_t p2 = svpsel_lane_b32(pNDim, pMDim, row + trow + 2); - svbool_t p3 = svpsel_lane_b32(pNDim, pMDim, row + trow + 3); - - svst1_hor_za32( - /* tile: */ 0, /* slice: */ trow + 0, p0, - &matResult[result_tile_UL_corner + (trow + 0) * N]); - svst1_hor_za32( - /* tile: */ 0, /* slice: */ trow + 1, p1, - &matResult[result_tile_UL_corner + (trow + 1) * N]); - svst1_hor_za32( - /* tile: */ 0, /* slice: */ trow + 2, p2, - &matResult[result_tile_UL_corner + (trow + 2) * N]); - svst1_hor_za32( - /* tile: */ 0, /* slice: */ trow + 3, p3, - &matResult[result_tile_UL_corner + (trow + 3) * N]); - } - } - } -} -``` - -Note again that ``matmul_intr_impl`` function has been annotated at line 4 with: - -- ``__arm_streaming``, because the function is using streaming instructions, - -- ``__arm_inout("za")``, because the function reuses the ZA storage from its caller. - -The multiplication with the outer product is performed in a double-nested loop, -over the ``M`` (line 7) and ``N`` (line 11) dimensions of the input matrices -``matLeft_mod`` and ``matRight``. Both loops have an ``SVL`` step increment, -which corresponds to the horizontal and vertical dimensions of the ZA storage -that will be used as one tile at a time will be processed. The ``M`` and ``N`` -dimensions of the inputs may not be perfect multiples of ``SVL`` so the -predicates ``pMDim`` (line 9) (respectively ``pNDim`` at line 13) are computed in order -to know which rows (respectively columns) are valid. - -The core of the multiplication is done in 2 parts: - -- Outer-product and accumulation at lines 15-25. As ``matLeft`` has been - laid-out perfectly in memory with ``preprocess_l_intr``, this part becomes - straightforward. 
First, the tile is zeroed with the ``svzero_za`` intrinsics - at line 16 so the outer products can be accumulated in the tile. The outer - products are computed and accumulation over the ``K`` common dimension with - the loop at line 19: the column of ``matleft_mod`` and the row of ``matRight`` - are loaded with the ``svld1`` intrinsics at line 20-23 to vector registers - ``zL`` and ``zR``, which are then used at line 24 with the ``svmopa_za32_m`` - intrinsic to perform the outer product and accumulation (to tile 0). This - is exactly what was shown in Figure 2 earlier in the Learning Path. - Note again the usage of the ``pMDim`` and ``pNDim`` predicates to deal - correctly with the rows and columns respectively which are out of bounds. - -- Storing of the result matrix at lines 27-46. The previous section computed the matrix multiplication result for the current tile, which now needs - to be written back to memory. This is done with the loop at line 29 which will - iterate over all rows of the tile: the ``svst1_hor_za32`` intrinsic at lines - 35-46 stores directly from the tile to memory. Note that the loop has been - unrolled by a factor of 4 (thus the ``trow += 4`` increment, line 29) and the - 4 ``svst1_hor_za32``. Again, the ``pMDim`` and ``pNDim`` predicates deal - gracefully with the parts of the tile which are out-of-bound for the - destination matrix ``matResult``. - -Once again, intrinsics makes it easy to fully leverage SME2, provided you have a solid understanding of its available instructions. -Predicates handle corner cases elegantly, ensuring robust execution. Most importantly, the code adapts to different SVL values across various hardware implementations without requiring recompilation. -This follows the key principle of compile-once, run-everywhere, allowing systems with larger SVL to execute computations more efficiently while using the same binary. 
- -### Compile and run - -The main function is exactly the same that was used for the assembly version, -with the ``IMPL`` macro defined to be ``intr`` in the ``Makefile``. - -First, make sure that the ``sme2_matmul_intr`` executable is up-to-date: - -```BASH -docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v1 make sme2_matmul_intr -``` - -Then execute ``sme2_matmul_intr`` on the FVP: - -```BASH -docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v1 ./run-fvp.sh sme2_matmul_intr -``` - -This should output something similar to: - -```TXT -SME2 Matrix Multiply fp32 *intr* example with args 125 35 70 -Matrix preprocessing: PASS ! -Matrix multiplication: PASS ! - -Info: /OSCI/SystemC: Simulation stopped by user. -``` diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/7-sme2-matmul-intr.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/7-sme2-matmul-intr.md new file mode 100644 index 0000000000..ee245df3ba --- /dev/null +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/7-sme2-matmul-intr.md @@ -0,0 +1,296 @@ +--- +title: Matrix multiplication using SME2 intrinsics in C +weight: 9 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In this section, you will write an SME2-optimized matrix multiplication routine in C using the intrinsics that the compiler provides. + +## What are instrinsics? + +*Intrinsics*, also known as *compiler intrinsics* or *intrinsic functions*, are the functions available to application developers that the compiler has intimate knowledge of. This enables the compiler to either translate the function to a specific instruction or to perform specific optimizations, or both. + +You can learn more about intrinsics in this [Wikipedia +Article on Intrinsic Function](https://en.wikipedia.org/wiki/Intrinsic_function). 
+ +Using intrinsics allows the programmer to use the specific instructions required +to achieve the required performance while writing in C all the typically-required standard code, such as loops. This produces performance close to what can be reached with hand-written assembly whilst being significantly more maintainable and portable. + +All Arm-specific intrinsics are specified in the +[ACLE](https://github.com/ARM-software/acle), which is the Arm C Language Extension. ACLE +is supported by the main compilers, most notably [GCC](https://gcc.gnu.org/) and +[Clang](https://clang.llvm.org). + +## Implementation + +Here again, a top level function named `matmul_intr` in `matmul_intr.c` +will be used to stitch together the preprocessing and the multiplication: + +```C "{ line_numbers = true }" +__arm_new("za") __arm_locally_streaming void matmul_intr( + uint64_t M, uint64_t K, uint64_t N, const float *restrict matLeft, + const float *restrict matRight, float *restrict matLeft_mod, + float *restrict matResult) { + uint64_t SVL = svcntsw(); + preprocess_l_intr(M, K, SVL, matLeft, matLeft_mod); + matmul_intr_impl(M, K, N, SVL, matLeft_mod, matRight, matResult); +} +``` + +Note the `__arm_new("za")` and `__arm_locally_streaming` at line 1 that will +make the compiler save the ZA storage so we can use it without destroying its +content if it was still in use by one of the callers. + +`SVL`, the dimension of the ZA storage, is requested from the underlying hardware with the `svcntsw()` function call at line 5, and passed down to the `preprocess_l_intr` and `matmul_intr_impl` functions. `svcntsw()` is a function provided by the ACLE library. + +### Matrix preprocessing + +```C "{ line_numbers = true }" +void preprocess_l_intr( + uint64_t M, uint64_t K, uint64_t SVL, const float *restrict a, + float *restrict a_mod) __arm_streaming __arm_inout("za") { + const uint64_t M_mod = SVL * (M / SVL + (M % SVL != 0 ? 
1 : 0)); + + // The outer loop, iterating over rows (M dimension) + for (uint64_t row = 0; row < M; row += SVL) { + + svbool_t pMDim = svwhilelt_b32(row, M); + + // The inner loop, iterating on columns (K dimension). + for (uint64_t col = 0; col < K; col += 2 * SVL) { + + svcount_t pKDim = svwhilelt_c32(col, K, 2); + + // Load-as-rows + for (uint64_t trow = 0; trow < SVL; trow += 4) { + svcount_t p0 = svpsel_lane_c32(pKDim, pMDim, trow + 0); + svcount_t p1 = svpsel_lane_c32(pKDim, pMDim, trow + 1); + svcount_t p2 = svpsel_lane_c32(pKDim, pMDim, trow + 2); + svcount_t p3 = svpsel_lane_c32(pKDim, pMDim, trow + 3); + + const uint64_t tile_UL_corner = (row + trow) * K + col; + svfloat32x2_t zp0 = svld1_x2(p0, &a[tile_UL_corner + 0 * K]); + svfloat32x2_t zp1 = svld1_x2(p1, &a[tile_UL_corner + 1 * K]); + svfloat32x2_t zp2 = svld1_x2(p2, &a[tile_UL_corner + 2 * K]); + svfloat32x2_t zp3 = svld1_x2(p3, &a[tile_UL_corner + 3 * K]); + + svfloat32x4_t zq0 = svcreate4(svget2(zp0, 0), svget2(zp1, 0), + svget2(zp2, 0), svget2(zp3, 0)); + svfloat32x4_t zq1 = svcreate4(svget2(zp0, 1), svget2(zp1, 1), + svget2(zp2, 1), svget2(zp3, 1)); + svwrite_hor_za32_f32_vg4( + /* tile: */ 0, /* slice: */ trow, zq0); + svwrite_hor_za32_f32_vg4( + /* tile: */ 1, /* slice: */ trow, zq1); + } + + // Read-as-columns and store + const uint64_t dest_0 = row * K + col * SVL; + const uint64_t dest_1 = dest_0 + SVL * SVL; + for (uint64_t tcol = 0; tcol < SVL; tcol += 4) { + svcount_t p0 = svwhilelt_c32(dest_0 + tcol * SVL, K * M_mod, 4); + svcount_t p1 = svwhilelt_c32(dest_1 + tcol * SVL, K * M_mod, 4); + svfloat32x4_t zq0 = + svread_ver_za32_f32_vg4(/* tile: */ 0, /* slice: */ tcol); + svfloat32x4_t zq1 = + svread_ver_za32_f32_vg4(/* tile: */ 1, /* slice: */ tcol); + svst1(p0, &a_mod[dest_0 + tcol * SVL], zq0); + svst1(p1, &a_mod[dest_1 + tcol * SVL], zq1); + } + } + } +} +``` + +Note that `preprocess_l_intr` has been annotated at line 3 with: + +- `__arm_streaming`, because this function is using 
streaming instructions, + +- `__arm_inout("za")`, because `preprocess_l_intr` reuses the ZA storage + from its caller. + +The matrix preprocessing is performed in a double nested loop, over the `M` +(line 7) and `K` (line 12) dimensions of the input matrix `a`. Both loops +have an `SVL` step increment, which corresponds to the horizontal and vertical +dimensions of the ZA storage that will be used. The dimensions of `a` may not +be perfect multiples of `SVL` though... which is why the predicates `pMDim` +(line 9) and `pKDim` (line 14) are computed in order to know which rows +(respectively columns) are valid. + +The core of `preprocess_l_intr` is made of two parts: + +- Lines 17 - 37: load matrix tile as rows. In this part, loop unrolling has been + used at 2 different levels. At the lowest level, 4 rows are loaded at a time + (lines 24-27). But this goes much further because as SME2 has multi-vectors + operations (hence the `svld1_x2` intrinsic to load 2 rows in 2 vector + registers), this allows the function to load the consecutive row, which + happens to be the row from the neighboring tile on the right: this means two + tiles are processed at once. At line 29-32, the pairs of vector registers are + rearranged on quads of vector registers so they can be stored horizontally in + the two tiles' ZA storage at lines 33-36 with the `svwrite_hor_za32_f32_vg4` + intrinsic. Of course, as the input matrix may not have dimensions that are + perfect multiples of `SVL`, the `p0`, `p1`, `p2` and `p3` predicates + are computed with the `svpsel_lane_c32` intrinsic (lines 18-21) so that + elements outside of the input matrix are set to 0 when they are loaded at + lines 24-27. + +- Lines 39 - 51: read the matrix tile as columns and store them. 
Now that the 2 + tiles have been loaded *horizontally*, they will be read *vertically* with the + `svread_ver_za32_f32_vg4` intrinsic to quad-registers of vectors (`zq0` + and `zq1`) at lines 45-48 and then stored with the `svst1` intrinsic to + the relevant location in the destination matrix `a_mod` (lines 49-50). Note + again the usage of predicates `p0` and `p1` (computed at lines 43-44) to + `svst1` to prevent writing out of the matrix bounds. + +Using intrinsics simplifies function development significantly, provided one has +a good understanding of the SME2 instruction set. Predicates, which are +fundamental to SVE and SME, enable a natural expression of algorithms while +handling corner cases efficiently. Notably, there is no explicit condition +checking within the loops to account for rows or columns extending beyond matrix +bounds. + +### Outer-product multiplication + +```C "{ line_numbers = true }" +void matmul_intr_impl( + uint64_t M, uint64_t K, uint64_t N, uint64_t SVL, + const float *restrict matLeft_mod, const float *restrict matRight, + float *restrict matResult) __arm_streaming __arm_inout("za") { + + // Build the result matrix tile by tile. + for (uint64_t row = 0; row < M; row += SVL) { + + svbool_t pMDim = svwhilelt_b32(row, M); + + for (uint64_t col = 0; col < N; col += SVL) { + + svbool_t pNDim = svwhilelt_b32(col, N); + + // Outer product + accumulation + svzero_za(); + const uint64_t matLeft_pos = row * K; + const uint64_t matRight_UL_corner = col; + for (uint64_t k = 0; k < K; k++) { + svfloat32_t zL = + svld1(pMDim, &matLeft_mod[matLeft_pos + k * SVL]); + svfloat32_t zR = + svld1(pNDim, &matRight[matRight_UL_corner + k * N]); + svmopa_za32_m(0, pMDim, pNDim, zL, zR); + } + + // Store ZA to matResult. 
+ const uint64_t result_tile_UL_corner = row * N + col; + for (uint64_t trow = 0; trow < SVL && row + trow < M; trow += 4) { + svbool_t p0 = svpsel_lane_b32(pNDim, pMDim, row + trow + 0); + svbool_t p1 = svpsel_lane_b32(pNDim, pMDim, row + trow + 1); + svbool_t p2 = svpsel_lane_b32(pNDim, pMDim, row + trow + 2); + svbool_t p3 = svpsel_lane_b32(pNDim, pMDim, row + trow + 3); + + svst1_hor_za32( + /* tile: */ 0, /* slice: */ trow + 0, p0, + &matResult[result_tile_UL_corner + (trow + 0) * N]); + svst1_hor_za32( + /* tile: */ 0, /* slice: */ trow + 1, p1, + &matResult[result_tile_UL_corner + (trow + 1) * N]); + svst1_hor_za32( + /* tile: */ 0, /* slice: */ trow + 2, p2, + &matResult[result_tile_UL_corner + (trow + 2) * N]); + svst1_hor_za32( + /* tile: */ 0, /* slice: */ trow + 3, p3, + &matResult[result_tile_UL_corner + (trow + 3) * N]); + } + } + } +} +``` + +Note again that `matmul_intr_impl` function has been annotated at line 4 with: + +- `__arm_streaming`, because the function is using streaming instructions, + +- `__arm_inout("za")`, because the function reuses the ZA storage from its caller. + +The multiplication with the outer product is performed in a double-nested loop, +over the `M` (line 7) and `N` (line 11) dimensions of the input matrices +`matLeft_mod` and `matRight`. Both loops have an `SVL` step increment, +which corresponds to the horizontal and vertical dimensions of the ZA storage +that will be used as one tile at a time will be processed. The `M` and `N` +dimensions of the inputs may not be perfect multiples of `SVL` so the +predicates `pMDim` (line 9) (respectively `pNDim` at line 13) are computed in order +to know which rows (respectively columns) are valid. + +The core of the multiplication is done in 2 parts: + +- Outer-product and accumulation at lines 15-25. As `matLeft` has been + laid-out perfectly in memory with `preprocess_l_intr`, this part becomes + straightforward. 
First, the tile is zeroed with the `svzero_za` intrinsics + at line 16 so the outer products can be accumulated in the tile. The outer + products are computed and accumulation over the `K` common dimension with + the loop at line 19: the column of `matleft_mod` and the row of `matRight` + are loaded with the `svld1` intrinsics at line 20-23 to vector registers + `zL` and `zR`, which are then used at line 24 with the `svmopa_za32_m` + intrinsic to perform the outer product and accumulation (to tile 0). This + is exactly what was shown in Figure 2 earlier in the Learning Path. + Note again the usage of the `pMDim` and `pNDim` predicates to deal + correctly with the rows and columns respectively which are out of bounds. + +- Storing of the result matrix at lines 27-46. The previous section computed the + matrix multiplication result for the current tile, which now needs to be + written back to memory. This is done with the loop at line 29 which will + iterate over all rows of the tile: the `svst1_hor_za32` intrinsic at lines + 35-46 stores directly from the tile to memory. Note that the loop has been + unrolled by a factor of 4 (thus the `trow += 4` increment, line 29) and the + 4 `svst1_hor_za32`. Again, the `pMDim` and `pNDim` predicates deal + gracefully with the parts of the tile which are out-of-bound for the + destination matrix `matResult`. + +Once again, intrinsics makes it easy to fully leverage SME2, provided you have a +solid understanding of its available instructions. The compiler is automatically +handling many low-level aspects (saving / restoring of the diferent contexts), +as well as not using registers that are reserved on specific platforms (like +`x18`). Predicates handle corner cases elegantly, ensuring robust execution. +Most importantly, the code adapts to different SVL values across various +hardware implementations without requiring recompilation. 
This follows the key +principle of compile-once, run-everywhere, allowing systems with larger SVL to +execute computations more efficiently while using the same binary. + +### Compile and run + +The main function is exactly the same that was used for the assembly version, +with the `IMPL` macro defined to be `intr` in the `Makefile`. + +First, make sure that the `sme2_matmul_intr` executable is up-to-date: + +{{< tabpane code=true >}} + {{< tab header="Native SME2 support" language="bash" output_lines="2">}} + make sme2_matmul_intr + make: `sme2_matmul_intr' is up to date. + {{< /tab >}} + + {{< tab header="Emulated SME2 support" language="bash" output_lines="2">}} + docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 make sme2_matmul_intr + make: 'sme2_matmul_intr' is up to date. + {{< /tab >}} +{{< /tabpane >}} + +Then execute `sme2_matmul_intr` either natively or on the FVP: + +{{< tabpane code=true >}} + {{< tab header="Native SME2 support" language="bash" output_lines="2-4">}} + ./sme2_matmul_intr + SME2 Matrix Multiply fp32 *intr* [verification mode] with M=125, K=70, N=35 + Matrix preprocessing: PASS ! + Matrix multiplication: PASS ! + {{< /tab >}} + + {{< tab header="Emulated SME2 support" language="bash" output_lines="2-6">}} + docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 ./run-fvp.sh sme2_matmul_intr + SME2 Matrix Multiply fp32 *intr* [verification mode] with M=125, K=70, N=35 + Matrix preprocessing: PASS ! + Matrix multiplication: PASS ! + + Info: /OSCI/SystemC: Simulation stopped by user. 
+ {{< /tab >}} +{{< /tabpane >}} diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/8-benchmarking.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/8-benchmarking.md new file mode 100644 index 0000000000..da241b1ad7 --- /dev/null +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/8-benchmarking.md @@ -0,0 +1,82 @@ +--- +title: Benchmarking +weight: 10 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In this section, you'll benchmark matrix multiplication performance using SME2, if your machine supports native execution of SME2 instructions. + +## About benchmarking and emulation + +Emulation is generally not the best way to assess the performance of a piece of +code. Emulation focuses on correctly simulating instructions and not accurate execution timing. For example, as explained in the [outer product section](../5-outer-product/), improving performance involves increasing the `macc`-to-`load` ratio. + +Emulators, including the FVP, do not model in detail memory bandwidth, cache behavior, or latency. At best, an emulator provides an instruction count for the vanilla reference implementation versus the assembly-/intrinsic-based versions of the matrix multiplication, which is useful for functional validation but not for precise benchmarking. + +## Benchmarking on a platform with native SME2 support + +{{% notice Note %}} +Benchmarking and profiling are complex tasks. This Learning Path provides a *simplified* framework for observing SME2-related performance improvements. +{{% /notice %}} + +If your machine natively supports SME2, then benchmarking is possible. When +`sme2_matmul_asm` and `sme2_matmul_intr` were compiled with `BAREMETAL=0`, the +*benchmarking mode* is available. + +*Benchmarking mode* is enabled by prepending the `M`, `K`, `N` optional parameters with an iteration count (`I`). 
+ +## Run the intrinsic version + +Now measure the execution time of `sme2_matmul_intr` for 1000 multiplications of +matrices with the default sizes: + +```BASH { output_lines="2-4"} +./sme2_matmul_intr 1000 +SME2 Matrix Multiply fp32 *intr* [benchmarking mode, 1000 iterations] with M=125, K=70, N=35 +Reference implementation: min time = 101 us, max time = 438 us, avg time = 139.42 us +SME2 implementation *intr*: min time = 1 us, max time = 8 us, avg time = 1.82 us +``` + +The execution time is reported in microseconds. A wide spread between the minimum and maximum figures can be noted and is expected as the way of doing the benchmarking is simplified for the purpose of simplicity. You will, however, note that the intrinsic version of the matrix multiplication brings on average a 76x execution time reduction. + +{{% notice Tip %}} +You can override the default values for `M` (125), `K` (25), and `N` (70) and +provide your own values on the command line. For example, you can benchmark the +`M=7`, `K=8`, and `N=9` case with: + +```BASH { output_lines="2-4"} +./sme2_matmul_intr 1000 7 8 9 +SME2 Matrix Multiply fp32 *intr* [benchmarking mode, 1000 iterations] with M=7, K=8, N=9 +Reference implementation: min time = 0 us, max time = 14 us, avg time = 0.93 us +SME2 implementation *intr*: min time = 0 us, max time = 1 us, avg time = 0.61 us +``` +{{% /notice %}} + +Now measure the execution time of `sme2_matmul_asm` for 1000 multiplications of +matrices with the default sizes: + +```BASH { output_lines="2-4"} +./sme2_matmul_asm 1000 +SME2 Matrix Multiply fp32 *asm* [benchmarking mode, 1000 iterations] with M=125, K=70, N=35 +Reference implementation: min time = 101 us, max time = 373 us, avg time = 136.49 us +SME2 implementation *asm*: min time = 1 us, max time = 8 us, avg time = 1.44 us +``` + +You can note that, although the vanilla reference matrix multiplication is the +same, there is some variability in the execution time. 
+ +You'll also note that the assembly version of the SME2 matrix multiplication is +slightly faster (1.44 us compared to 1.82 us for the intrinsic-based version). +This *must not* convince you that assembly is better though! The comparison done +here is far from being an apples-to-apples comparison: +- Firstly, the assembly version has some requirements on the `K` parameter that + the intrinsic version does not have. +- Second, the assembly version has an optimization that the intrinsic version, + for the sake of readability in this Learning Path, does not have (see the + [Going further + section](/learning-paths/cross-platform/multiplying-matrices-with-sme2/10-going-further/) + to know more). +- Last, but not least, the intrinsic version is *easily* readable and + maintainable. \ No newline at end of file diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/8-going-further.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/8-going-further.md deleted file mode 100644 index be2ac04bad..0000000000 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/8-going-further.md +++ /dev/null @@ -1,51 +0,0 @@ ---- -title: Going further -weight: 10 - -### FIXED, DO NOT MODIFY -layout: learningpathall ---- - -## Generalize the algorithms - -In this Learning Path, you focused on using SME2 for matrix -multiplication with floating point numbers. However in practice, any library or framework supporting matrix multiplication should -also handle various integer types. - -You can see that the algorithm structure for matrix preprocessing as well -as multiplication with the outer product does not change at all for other data -types - they only need to be adapted. - -This is suitable for languages with [generic -programming](https://en.wikipedia.org/wiki/Generic_programming) like C++ with -templates. 
You can even make the template manage a case where the value -accumulated during the product uses a larger type than the input matrices. SME2 has the instructions to deal efficiently with this common case scenario. - -This enables the library developer to focus on the algorithm, testing, and optimizations, while allowing the compiler to generate multiple variants. - -## Unroll further - -You might have noticed that ``matmul_intr_impl`` computes only one tile at a time, for the sake of simplicity. - -SME2 does support multi-vector instructions, and some were used in ``preprocess_l_intr``, for example, ``svld1_x2``. - -Loading two vectors at a time enables the simultaneous computing of more tiles, and as the input matrices have been laid out in memory in a neat way, the consecutive -loading of the data is efficient. Implementing this approach can make improvements to the ``macc`` to load ``ratio``. - -In order to check your understanding of SME2, you can try to implement this unrolling yourself. You can check your work by comparing your results to the expected -reference values. - -## Apply strategies - -One method for optimization is to use strategies that are flexible depending on the matrices' dimensions. This is especially easy to set up when working in C or C++, -rather than directly in assembly language. - -By playing with the mathematical properties of matrix multiplication and the outer product, it is possible to minimize data movement as well as reduce the overall number of operations to perform. - -For example, it is common that one of the matrices is actually a vector, meaning that it has a single row or column, and then it becomes advantageous to transpose it. Can you see why? - -The answer is that as the elements are stored contiguously in memory, an ``Nx1`` and ``1xN`` matrices have the exact same memory layout. The transposition becomes a no-op, and the matrix elements stay in the same place in memory. 
- -An even more *degenerated* case that is easy to manage is when one of the matrices is essentially a scalar, which means that it is a matrix with one row and one column. - -Although our current code handles it correctly from a results point of view, a different algorithm and use of instructions might be more efficient. Can you think of another way? diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/7-debugging.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/9-debugging.md similarity index 68% rename from content/learning-paths/cross-platform/multiplying-matrices-with-sme2/7-debugging.md rename to content/learning-paths/cross-platform/multiplying-matrices-with-sme2/9-debugging.md index 3d490bd8af..d05e5a7ea0 100644 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/7-debugging.md +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/9-debugging.md @@ -1,19 +1,20 @@ --- title: Debugging -weight: 9 +weight: 11 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Debugging +Debugging is an essential part of development, especially when working close to the hardware. -### Looking at the generated code +In this section, you will learn about the different ways to debug and troubleshoot SME2 code. -In some cases, it is useful to look at the code generated by the compiler. In this Learning Path, the assembly listings have been produced and you can -inspect them. +## Inspect the generated assembly -For example, the inner loop with the outer product and the accumulation of the matrix multiplication with intrinsics from the listing file ``sme2_matmul_intr.lst`` looks like this: +Sometimes it's helpful to review the assembly code generated by the compiler. In this Learning Path, listings have already been generated for you. You can inspect these files to verify that SME2 instructions were emitted correctly. 
+ +For example, here’s a snippet from `sme2_matmul_intr.lst`, showing the inner loop of the matrix multiplication using intrinsics: ```TXT ... @@ -26,21 +27,22 @@ For example, the inner loop with the outer product and the accumulation of the m 8000186c: 54ffff41 b.ne 0x80001854 ... ``` +This sequence shows how `ld1w` loads vector registers, followed by the `fmopa` outer product operation. -### With debuggers +### Debug with gdb or lldb -Both of the main debuggers, ``gdb`` and ``lldb``, have some support for debugging SME2 code. Their usage is not shown in this Learning Path though, the main -reason for this being that this Learning Path focuses on the CPU in *baremetal* mode. +Both of the main debuggers, `gdb` and `lldb`, have some support for debugging SME2 code. Their usage is not shown in this Learning Path though. -This is a simplistic, and minimalistic environment, without an operating system, for example. Debug mode requires a debug monitor to interface between the debugger, the program, and the CPU. +{{% notice Note %}} +If you're using the FVP emulator, debugging is more complex. Because there's no operating system, you'll need a debug monitor to interface between your program, the CPU, and your debugger. +{{% /notice %}} -### With trace +### Analyze instruction trace with Tarmac -The FVP can emit an instruction trace file in text format, known as the Tarmac trace. This provides a convenient way for you to understand what the program is doing. +The FVP can emit an instruction trace file in text format, known as the Tarmac trace. This trace shows instruction-by-instruction execution and register contents, which is helpful for low-level debugging. -In the excerpt shown below, you can see that the SVE register ``z0`` has been loaded with 16 values, as predicate ``p0`` was true, with an ``LD1W`` -instruction, whereas ``z1`` was loaded with only two values, as ``p1``. 
``z0``, and ``z1`` are later used by the ``fmopa`` instruction to compute the -outer product, and the trace displays the content of the ZA storage. +In the excerpt shown below, you can see that the SVE register `z0` has been loaded with 16 values, as predicate `p0` was true, with an `LD1W` instruction, whereas `z1` was loaded with only two values, as `p1`. `z0`, and `z1` are later used by the `fmopa` instruction to compute the outer +product, and the trace displays the content of the ZA storage. ```TXT 923530000 ps IT (92353) 80001b08 a540a1a0 O EL3h_s : LD1W {z0.S},p0/Z,[x13] @@ -91,20 +93,17 @@ outer product, and the trace displays the content of the ZA storage. 923580000 ps R ZA0H_S_15 00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000_4479e70a_44f4223e ``` -You can get a Tarmac trace when invoking ``run-fvp.sh`` by adding the ``--trace`` option as the *first* argument, for example: +You can get a Tarmac trace when invoking `run-fvp.sh` by adding the `--trace` option as the *first* argument, for example: ```BASH -docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v1 ./run-fvp.sh --trace sme2_matmul_asm +docker run --rm -v "$PWD:/work" -w /work armswdev/sme2-learning-path:sme2-environment-v2 ./run-fvp.sh --trace sme2_matmul_asm ``` -Tracing is not enabled by default. It slows down the simulation significantly and the trace file can become very large for programs with large matrices. - -{{% notice Debugging tip %}} -It can be helpful when debugging to understand where an element in the -Tile is coming from. The current code base allows you to do that in ``debug`` -mode, when ``-DDEBUG`` is passed to the compiler in the ``Makefile``. If you -look into ``main.c``, you will notice that the matrix initialization is no -longer random, but instead initializes each element with its linear -index. 
This makes it *easier* to find where the matrix elements are loaded in -the tile in tarmac trace, for example. -{{% /notice %}} \ No newline at end of file +{{% notice Tip %}} +Tracing is disabled by default because it significantly slows down simulation and generates large files for big matrices. +{{% /notice %}} + +## Use debug mode for matrix inspection + +It can be helpful when debugging to understand where an element in the tile is coming from. The current code base allows you to do that in `debug` mode, when `-DDEBUG` is passed to the compiler in the `Makefile`. If you look into `main.c`, you will notice that the matrix initialization is no +longer random, but instead initializes each element with its linear index. This makes it *easier* to find where the matrix elements are loaded in the tile in tarmac trace, for example. diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/_index.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/_index.md index c869b5af7f..9edcbecd67 100644 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/_index.md +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/_index.md @@ -1,25 +1,25 @@ --- title: Accelerate Matrix Multiplication Performance with SME2 -minutes_to_complete: 30 +minutes_to_complete: 60 -who_is_this_for: This Learning Path is an advanced topic for developers who want to learn about accelerating the performance of matrix multiplication using Arm's Scalable Matrix Extension Version 2 (SME2). +who_is_this_for: This Learning Path is an advanced topic for developers who want to accelerate the performance of matrix multiplication using Arm's Scalable Matrix Extension Version 2 (SME2). learning_objectives: - - Implement a reference matrix multiplication without using SME2. - - Use SME2 assembly instructions to improve the matrix multiplication performance. 
- - Use SME2 intrinsics to improve the matrix multiplication performance using the C programming language. - - Compile and run code with SME2 instructions. + - Implement a baseline matrix multiplication kernel in C without SME2 + - Use SME2 assembly instructions to accelerate matrix multiplication performance + - Use SME2 intrinsics to vectorize and optimize matrix multiplication + - Compile code with SME2 intrinsics and assembly + - Benchmark and validate SME2-accelerated matrix multiplication on Arm hardware or in a Linux-based emulation environment + - Compare performance metrics between baseline and SME2-optimized implementations prerequisites: - - Basic knowledge of Arm's Scalable Matrix Extension (SME). - - Basic knowledge of Arm's Scalable Vector Extension (SVE). - - An intermediate understanding of C programming language and assembly language. - - A computer running Linux, MacOS, or Windows. - - Installations of Git and Docker. - - An emulator to run code with SME2 instructions. - - A compiler with support for SME2 instructions. 
- + - Working knowledge of Arm’s SVE and SME instruction sets + - Intermediate proficiency with the C programming language and the Armv9-A assembly language + - A computer running Linux, macOS, or Windows + - Installations of Git and Docker for project setup and emulation + - A platform that supports SME2 (see the list of [devices with SME2 support](/learning-paths/cross-platform/multiplying-matrices-with-sme2/1-get-started/#devices) or an emulator to run code with SME2 instructions + - Compiler support for SME2 instructions (for example, LLVM 17+ with SME2 backend support) author: Arnaud de Grandmaison @@ -33,6 +33,7 @@ tools_software_languages: - C - Clang - Runbook + - LLVM operatingsystems: - Linux @@ -52,18 +53,26 @@ further_reading: title: Port Code to Arm Scalable Vector Extension (SVE) link: https://learn.arm.com/learning-paths/servers-and-cloud-computing/sve type: website - - resource: - title: Arm Scalable Matrix Extension (SME) Introduction (Part 1) - link: https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction - type: blog - resource: title: Introducing the Scalable Matrix Extension for the Armv9-A Architecture link: https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/scalable-matrix-extension-armv9-a-architecture type: website + - resource: + title: Arm Scalable Matrix Extension (SME) Introduction (Part 1) + link: https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction + type: blog - resource: title: Arm Scalable Matrix Extension (SME) Introduction (Part 2) link: https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction-p2 type: blog + - resource: + title: (Part 3) Matrix-matrix multiplication. 
Neon, SVE, and SME compared + link: https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/matrix-matrix-multiplication-neon-sve-and-sme-compared + type: blog + - resource: + title: Build adaptive libraries with multiversioning + link: https://learn.arm.com/learning-paths/cross-platform/function-multiversioning/ + type: website - resource: title: SME Programmer's Guide link: https://developer.arm.com/documentation/109246/latest diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/overview.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/overview.md index db395e1ff9..ed46aca044 100644 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/overview.md +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/overview.md @@ -6,35 +6,45 @@ weight: 2 layout: learningpathall --- -# Overview of Arm's Scalable Matrix Extension Version 2 +## Arm's Scalable Matrix Extension Version 2 (SME2) -### What is SME2? +Arm’s Scalable Matrix Extension Version 2 (SME2) is a hardware feature designed to accelerate dense linear algebra operations, enabling high-throughput execution of matrix-based workloads. -The Scalable Matrix Extension (SME) is an extension to the Armv9-A architecture. The Scalable Matrix Extension Version 2 (SME2) extends the SME architecture by accelerating vector operations to increase the number of applications that can benefit from the computational efficiency of SME, beyond its initial focus on outer products and matrix-matrix multiplication. +Whether you're building for AI inference, HPC, or scientific computing, SME2 provides fine-grained control and high-performance vector processing. -SME2 extends SME by introducing multi-vector data-processing instructions, load to and store from multi-vectors, and a multi-vector predication mechanism. 
-Additional architectural features of SME2 include: +## Extending the SME architecture -* Multi-vector multiply-accumulate instructions, with Z vectors as multiplier and multiplicand inputs and accumulating results into ZA array vectors, including widening multiplies that accumulate into more vectors than they read. +SME is an extension to the Armv9-A architecture and is designed to accelerate matrix-heavy computations, such as outer products and matrix-matrix multiplications. -* Multi-vector load, store, move, permute, and convert instructions, that use multiple SVE Z vectors as source and destination registers to pre-process inputs and post-process outputs of the ZA-targeting SME2 instructions. +SME2 builds on SME by accelerating vector operations to increase the number of applications that can benefit from the computational efficiency of SME, beyond its initial focus on outer products and matrix-matrix multiplication. -* *Predicate-as-counter*, which is an alternative predication mechanism that is added to the original SVE predication mechanism, to control operations performed on multiple vector registers. +## Key architectural features of SME2 -* Compressed neural network capability using dedicated lookup table instructions and outer product instructions that support binary neural networks. +SME2 adds several capabilities to the original SME architecture: -* A 512-bit architectural register ZT0, that supports the lookup table feature. +* **Multi-vector multiply-accumulate instructions**, that use Z vectors as multiplier and multiplicand inputs, and accumulate results into ZA array vectors. This includes widening multiplies that write to more vectors than they read from. -### Suggested reading +* **Multi-vector load, store, move, permute, and convert instructions**, that use multiple SVE Z vectors as source and destination registers to efficiently pre-process inputs and post-process outputs of the ZA-targeting SME2 instructions. 
-If you are not familiar with matrix multiplication, or would benefit from refreshing your knowledge, this [Wikipedia article on Matrix multiplication](https://en.wikipedia.org/wiki/Matrix_multiplication) is a good start. +* A **predicate-as-counter mechanism**, which is a new predication mechanism that is added alongside the original SVE approach to enable fine-grained control over operations across multiple vector registers. -This Learning Path assumes some basic understanding of SVE and SME. If you are not familiar with SVE or SME, these are some useful resources that you can read first: +* **Compressed neural network support**, using dedicated lookup table and outer product instructions that support binary neural network workloads. - - [Introducing the Scalable Matrix Extension for the Armv9-A - Architecture](https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/scalable-matrix-extension-armv9-a-architecture). - - [Arm Scalable Matrix Extension (SME) Introduction (Part - 1)](https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction). - - [Arm Scalable Matrix Extension (SME) Introduction (Part - 2)](https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction-p2). \ No newline at end of file +* A **512-bit architectural register ZT0**, which is a dedicated register that enables fast, table-driven data transformations. 
+ +## Further information + +This Learning Path does assume some basic understanding of SVE, SME, and matrix multiplication, however if you do want to refresh or grow your knowledge, these are some useful resources that you might find helpful: + +On matrix multiplication: + +- The [Wikipedia article](https://en.wikipedia.org/wiki/Matrix_multiplication) + +On SVE and SME: + +- [Introducing the Scalable Matrix Extension for the Armv9-A Architecture - Martin Weidmann, Arm](https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/scalable-matrix-extension-armv9-a-architecture) +- [Arm Scalable Matrix Extension (SME) Introduction (Part 1) - Zenon Xiu](https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction) +- [Arm Scalable Matrix Extension (SME) Introduction (Part 2) - Zenon Xiu](https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction-p2) +- [Matrix-matrix multiplication. 
Neon, SVE, and SME compared (Part 3)](https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/matrix-matrix-multiplication-neon-sve-and-sme-compared) +- [Learn about function multiversioning - Alexandros Lamprineas, Arm](https://learn.arm.com/learning-paths/cross-platform/function-multiversioning/) \ No newline at end of file diff --git a/content/learning-paths/cross-platform/zenoh-multinode-ros2/1_intro-zenoh.md b/content/learning-paths/cross-platform/zenoh-multinode-ros2/1_intro-zenoh.md new file mode 100644 index 0000000000..65d30cc1ea --- /dev/null +++ b/content/learning-paths/cross-platform/zenoh-multinode-ros2/1_intro-zenoh.md @@ -0,0 +1,62 @@ +--- +title: Introduction to Zenoh +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## The Need for Scalable Communication in Robotics and Edge Computing + +Modern robotics and industrial IoT (IIoT) systems are evolving rapidly—from indoor collaborative arms on assembly lines to fleets of outdoor autonomous delivery robots. +These applications must operate in real-time, often across multiple compute nodes, over networks that may span factory LANs, 5G cellular links, or even cloud data centers. + +Such systems require fast, reliable, and scalable data communication between nodes. +This includes not just broadcasting sensor updates or actuator commands (i.e., pub/sub), but also performing state queries, storing values for later use, and even distributed computation across nodes. A modern protocol must be: +* Low-latency: Immediate delivery of time-critical messages. +* High-throughput: Efficient data flow across many devices. +* Decentralized: No reliance on central brokers or fixed infrastructure. +* Flexible: Able to run on lightweight edge devices, across WANs, or inside cloud-native environments. 
+ +Traditional communication stacks such as DDS ([Data Distribution Service](https://en.wikipedia.org/wiki/Data_Distribution_Service)) serve as the backbone for middleware like ROS 2. However, DDS struggles in multi-network or wireless environments where multicast is unavailable, or NAT traversal is needed. +These constraints can severely impact deployment and performance at the edge. + + +## Zenoh: An Open-Source Pub/Sub Protocol for the Industrial Edge + +[Eclipse Zenoh](https://zenoh.io/) is a modern, [open-source](https://github.com/eclipse-zenoh/zenoh) data-centric communication protocol that goes beyond traditional pub/sub. Designed specifically for edge computing, IIoT, robotics, and autonomous systems, Zenoh unifies: + +* Data in motion through a powerful and efficient pub/sub model. +* Data at rest via geo-distributed storage plugins. +* Data in use with direct query support. +* On-demand computation via queryable nodes that can generate data dynamically. + +Unlike most traditional stacks, Zenoh is fully decentralized and designed to operate across cloud-to-edge-to-thing topologies, making it ideal for industrial robotics, autonomous systems, and smart environments. +It supports heterogeneous platforms, is implemented in Rust for performance and safety, and also offers bindings for Python, enabling rapid prototyping. + +Zenoh is particularly effective in wireless, 5G, or cross-network deployments where multicast and DDS fall short. +Its routing engine avoids excessive discovery traffic, conserves bandwidth, and supports seamless bridging between legacy ROS 2/DDS apps and modern, optimized Zenoh networks using zenoh-bridge-dds. + +In this learning path, you’ll use Zenoh to build and validate a multi-node distributed communication system across multiple Arm-based platforms, gaining hands-on experience with data exchange and coordination between edge devices. 
+ +To make the upcoming demo more intuitive and easy to follow, we’ll demonstrate the setup using two physical Cortex-A Linux devices. + +This Learning Path uses Raspberry Pi boards, but you’re free to substitute them with any Cortex-A devices that support network connectivity and have a Linux-based OS installed, depending on your development setup. + +In real-world ROS 2 deployment scenarios, developers typically conduct validation and performance testing across systems with more than two nodes. +To simulate such environments, using [Arm virtual hardware](https://www.arm.com/products/development-tools/simulation/virtual-hardware) is also a common and efficient approach. + +This will help you quickly validate your architecture choices and communication patterns when designing distributed applications. You can use any of the following platforms: + +* Raspberry Pi, +* Linux-based Cortex-A, or +* Arm Virtual Hardware (AVH). + +After this learning path, you will: +* Understand the core architecture and data flow principles behind Eclipse Zenoh, including its support for pub/sub, querying, and queryable edge functions. +* Build and run distributed Zenoh examples across multiple Arm-based nodes—using Raspberry Pi or AVH to simulate scalable deployment environments. +* Rebuild and extend the Zenoh queryable example to simulate edge-side logic. + +By the end of this learning path, you’ll have deployed a fully functional, scalable, and latency-aware Zenoh system. + +You can also check [here](https://zenoh.io/docs/getting-started/first-app) to find some simple examples. 
diff --git a/content/learning-paths/cross-platform/zenoh-multinode-ros2/2_zenoh-install.md b/content/learning-paths/cross-platform/zenoh-multinode-ros2/2_zenoh-install.md new file mode 100644 index 0000000000..5a750202ec --- /dev/null +++ b/content/learning-paths/cross-platform/zenoh-multinode-ros2/2_zenoh-install.md @@ -0,0 +1,107 @@ +--- +title: Setting Up Zenoh on Arm Devices +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Setting Up Zenoh on Arm Devices + +The following instructions are verified on both Raspberry Pi 4/5 and Arm Virtual Hardware, but you can implement them on any Cortex-A Linux device. + +Before building Zenoh, make sure your system has the necessary development tools and runtime libraries. + +### Install the Rust build environment + +First, we need to install the [Rust](https://www.rust-lang.org/) build environment, since the core of Zenoh is totally developed using Rust to keep it safe and efficient. + +Follow this [installation guide](https://learn.arm.com/install-guides/rust/) to install Rust and Cargo on Arm Linux, a build system for Rust. Or, simply use the following commands. + +```bash +curl https://sh.rustup.rs -sSf | sh +``` + +Follow the prompts to complete the installation. If successful, you’ll see: + +```output +Rust is installed now. Great! +``` + +For more details, refer to [Rust’s official install guide.](https://doc.rust-lang.org/cargo/getting-started/installation.html#install-rust-and-cargo) + +### Install ROS 2 + +[Robot Operating System](https://www.ros.org/) is a set of software libraries and tools that help you build robot applications. From drivers to state-of-the-art algorithms, and with powerful developer tools, ROS has what you need for your next robotics project. And it's all open source. + +Since ROS was started in 2007, a lot has changed in the robotics and ROS community. 
The goal of the [ROS 2](https://docs.ros.org/en/rolling/index.html) project is to adapt to these changes, leveraging what is great about ROS 1 and improving what isn’t. + +Here is the quick [installation guide](https://learn.arm.com/install-guides/ros2/) about how to install ROS 2 in Arm platform. + +### Download and build the Zenoh source + +Now, we can clone the Zenoh. + +```bash +cd ~ +git clone https://github.com/eclipse-zenoh/zenoh.git +``` + +After that, simply use cargo to build the source. + +```bash +cd zenoh +cargo build --release --all-targets -j $(nproc) +``` + +This will take several minutes depending on your device. Once the installation is complete, you should see: + +```output +cargo build --release --all-targets -j $(nproc) + Updating crates.io index + Downloaded humantime v2.2.0 + Downloaded spin v0.10.0 + Downloaded crossbeam-channel v0.5.14 + Downloaded uhlc v0.8.1 + Downloaded 4 crates (182.5 KB) in 2.19s +warning: output filename collision. +The lib target `zenoh_plugin_storage_manager` in package `zenoh-plugin-storage-manager v1.4.0 (/home/ubuntu/zenoh/plugins/zenoh-plugin-storage-manager)` has the same output filename as the lib target `zenoh_plugin_storage_manager` in package `zenoh-plugin-storage-manager v1.4.0 (/home/ubuntu/zenoh/plugins/zenoh-plugin-storage-manager)`. +Colliding filename is: /home/ubuntu/zenoh/target/release/deps/libzenoh_plugin_storage_manager.so +The targets should have unique names. +Consider changing their names to be unique or compiling them separately. +This may become a hard error in the future; see . +warning: output filename collision. +The lib target `zenoh_plugin_storage_manager` in package `zenoh-plugin-storage-manager v1.4.0 (/home/ubuntu/zenoh/plugins/zenoh-plugin-storage-manager)` has the same output filename as the lib target `zenoh_plugin_storage_manager` in package `zenoh-plugin-storage-manager v1.4.0 (/home/ubuntu/zenoh/plugins/zenoh-plugin-storage-manager)`. 
+Colliding filename is: /home/ubuntu/zenoh/target/release/deps/libzenoh_plugin_storage_manager.so.dwp +The targets should have unique names. +Consider changing their names to be unique or compiling them separately. +This may become a hard error in the future; see . +warning: output filename collision. +The lib target `zenoh_plugin_storage_manager` in package `zenoh-plugin-storage-manager v1.4.0 (/home/ubuntu/zenoh/plugins/zenoh-plugin-storage-manager)` has the same output filename as the lib target `zenoh_plugin_storage_manager` in package `zenoh-plugin-storage-manager v1.4.0 (/home/ubuntu/zenoh/plugins/zenoh-plugin-storage-manager)`. +Colliding filename is: /home/ubuntu/zenoh/target/release/deps/libzenoh_plugin_storage_manager.rlib +The targets should have unique names. +Consider changing their names to be unique or compiling them separately. +This may become a hard error in the future; see . + Compiling proc-macro2 v1.0.86 + Compiling unicode-ident v1.0.13 + Compiling libc v0.2.158 + Compiling version_check v0.9.5 + Compiling autocfg v1.3.0 +... + Compiling zenoh-link-quic v1.4.0 (/home/ubuntu/zenoh/io/zenoh-links/zenoh-link-quic) + Compiling zenoh_backend_traits v1.4.0 (/home/ubuntu/zenoh/plugins/zenoh-backend-traits) + Compiling zenoh-plugin-storage-manager v1.4.0 (/home/ubuntu/zenoh/plugins/zenoh-plugin-storage-manager) + Compiling zenoh-ext v1.4.0 (/home/ubuntu/zenoh/zenoh-ext) + Compiling zenoh-ext-examples v1.4.0 (/home/ubuntu/zenoh/zenoh-ext/examples) + Compiling zenoh-plugin-example v1.4.0 (/home/ubuntu/zenoh/plugins/zenoh-plugin-example) + Compiling zenoh-backend-example v1.4.0 (/home/ubuntu/zenoh/plugins/zenoh-backend-example) + Finished `release` profile [optimized] target(s) in 6m 28s +``` + +After the build process, the binary executables will be stored under the directory of `~/zenoh/target/release/examples/`. + +{{% notice Note %}} +Installation time may vary depending on your device’s processing power. 
+{{% /notice %}} + +With Zenoh successfully compiled, you’re ready to explore how nodes communicate using the Zenoh runtime. diff --git a/content/learning-paths/cross-platform/zenoh-multinode-ros2/3_zenoh-multinode.md b/content/learning-paths/cross-platform/zenoh-multinode-ros2/3_zenoh-multinode.md new file mode 100644 index 0000000000..8d21f0bfc0 --- /dev/null +++ b/content/learning-paths/cross-platform/zenoh-multinode-ros2/3_zenoh-multinode.md @@ -0,0 +1,114 @@ +--- +title: Setting Up a Multi-Node Environment +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Deploying Zenoh on Multiple Raspberry Pi Devices Using Docker + +After building Zenoh and its core examples, your next step is to deploy them across multiple Arm-based devices. + +In this section, you’ll use Raspberry Pi boards to simulate a scalable, distributed environment—but the same workflow applies to any Arm Linux system, including Arm cloud instances and Arm Virtual Hardware. + +You’ll learn how to use Docker to deploy the environment on physical devices, and how to duplicate virtual instances using snapshot cloning on Arm Virtual Hardware. + +This setup lets you simulate `real-world`, `cross-node communication`, making it ideal for validating Zenoh's performance in robotics and industrial IoT use cases. + +### Install Docker on Raspberry Pi + +To simplify this process and ensure consistency, you’ll use Docker to containerize your Zenoh and ROS 2 environment. +This lets you quickly replicate the same runtime on any device without needing to rebuild from source. + +This enables multi-node testing and real-world distributed communication scenarios. + +First, install the Docker environment on each Raspberry Pi if you don't already have it. + +```bash +curl -sSL https://get.docker.com | sh +sudo usermod -aG docker pi +``` + +Log out and back in, or run `newgrp docker` to activate Docker group permissions.
 + +### Create a ROS 2 + DDS Docker Image + +In a working directory, create a `Dockerfile` with the following content to create the ROS 2 / DDS docker image. + +```bash +FROM ros:galactic +RUN apt-get update +RUN apt-get install -y ros-galactic-demo-nodes-cpp ros-galactic-rmw-cyclonedds-cpp ros-galactic-turtlesim +ENV RMW_IMPLEMENTATION=rmw_cyclonedds_cpp +CMD bash +``` + +Under the directory where the above Dockerfile exists, run the following command to generate the docker image. + +```bash +$ docker build -t zenoh-node . +``` + +After this has been done, the created ROS 2 docker image can be listed with the following command. + +```bash +$ docker images | grep zenoh-node +``` + +```output +zenoh-node latest b7a9c27cf8a8 About a minute ago 962MB +``` + +### Transfer the Docker Image to Another RPi + +You now need to transfer the Docker image to your second device. Choose one of the following methods: + +Option 1: Save and copy via file + +```bash +docker save zenoh-node > zenoh-node.tar +scp zenoh-node.tar pi@<target-ip>:/home/pi/ +``` + +On the target device: +```bash +docker load < zenoh-node.tar +``` + +Option 2: Push to a container registry (e.g., DockerHub or GHCR). + +You can also push the image to Docker Hub or GitHub Container Registry and pull it on the second device. + +### Run the Docker Image + +Once the image is successfully loaded into the second device, you can run the container by: + +```bash +docker run -it --network=host zenoh-node +``` + +All the Zenoh example binaries are now available within this container, allowing you to test pub/sub and query flows across devices. + +### Another Duplicate Setting Option on Arm Virtual Hardware + +If you have a [Corellium](https://www.corellium.com/) account, you can: + +1. Set up and install Zenoh on a single AVH instance. +2. Use the [Clone](https://support.corellium.com/features/snapshots) function to duplicate the environment. +3.
Optionally, you can rename the device to avh* for easy device recognition by changing the setting in the `/etc/hostname` file. + +## Run Zenoh in Multi-Node Environment + +You’re now ready to run and test Zenoh communication flows across distributed edge devices. + +The source of the examples, written in Rust, is provided, and the examples are interoperable across devices. The +Rust binaries are already available under the `$ZENOH_LOC/target/release/examples/` directory. + +The following sections illustrate the procedures to run the Zenoh examples, demonstrating the primary capabilities of Zenoh: +1. Basic Pub/Sub – for real-time message distribution +2. Query and Storage – to persist and retrieve historical data +3. Queryable – to enable on-demand remote computation +4. Dynamic Queryable with Computation diff --git a/content/learning-paths/cross-platform/zenoh-multinode-ros2/4_zenoh-ex1-pubsub.md b/content/learning-paths/cross-platform/zenoh-multinode-ros2/4_zenoh-ex1-pubsub.md new file mode 100644 index 0000000000..fc0115618a --- /dev/null +++ b/content/learning-paths/cross-platform/zenoh-multinode-ros2/4_zenoh-ex1-pubsub.md @@ -0,0 +1,39 @@ +--- +title: Zenoh Example-1 Simple Pub/Sub +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Example 1: Simple Pub/Sub + +This first test demonstrates a real-time publish/subscribe model using two Raspberry Pi devices. + +The following command initiates a subscriber for a key expression `demo/example/**`, i.e. a set of topics starting with the path `demo/example`. + +### Step 1: Run Subscriber + +Log in to the first Raspberry Pi using any of the methods: + +```bash +cd ~/zenoh/target/release/examples +./z_sub +``` + +### Step 2: Run Publisher + +Then, log in to the other Raspberry Pi.
 + +```bash +cd ~/zenoh/target/release/examples +./z_pub +``` + +The result will look like: +![img1 alt-text#center](zenoh_ex1.gif "Figure 1: Simple Pub/Sub") + +In the left-side window, you are logged in to the device Pi4 and running the z_sub program. +It receives values with the key `demo/example/zenoh-rs-pub` continuously published by z_pub running on the other Pi in the right-side window. + +This basic example shows Zenoh's zero-config discovery and low-latency pub/sub across physical nodes. diff --git a/content/learning-paths/cross-platform/zenoh-multinode-ros2/5_zenoh-ex2-storagequery.md b/content/learning-paths/cross-platform/zenoh-multinode-ros2/5_zenoh-ex2-storagequery.md new file mode 100644 index 0000000000..a0cdea6cef --- /dev/null +++ b/content/learning-paths/cross-platform/zenoh-multinode-ros2/5_zenoh-ex2-storagequery.md @@ -0,0 +1,74 @@ +--- +title: Zenoh Example-2 Storage and Query +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Example 2: Storage and Query + +The second example adds Zenoh's data storage and querying capabilities—enabling nodes to retrieve historical values on demand. + +Building on the previous Pub/Sub example, you’ll now explore how Zenoh supports `persistent data storage` and `on-demand querying` -- a powerful feature for robotics and IIoT applications. + +In a typical warehouse or factory scenario, autonomous robots may periodically publish sensor data (e.g., position, temperature, battery level), and a central system—or another robot—may later need to query the latest state of each unit. + +Unlike Pub/Sub, which requires live, real-time message exchange, Zenoh's storage and query model enables asynchronous access to data that was published earlier, even if the original publisher is no longer online. + +In this example, you’ll run the zenohd daemon with in-memory storage and use z_put to publish data and z_get to retrieve it.
+ +This is especially useful for distributed systems where nodes may intermittently connect or request snapshots of state from peers. + + +### Step 1: Start the Zenoh Daemon with In-Memory Storage + +On one Raspberry Pi, launch the Zenoh daemon with a configuration that enables in-memory storage for keys under demo/example/**: + +```bash +cd ~/zenoh/target/release/ +./zenohd --cfg='plugins/storage_manager/storages/demo:{key_expr:"demo/example/**",volume:"memory"}' & +``` + +This starts the Zenoh daemon with in-memory storage support. + +You should see log messages indicating that the storage_manager plugin is loaded. +If port 7447 is already in use, either stop any previous Zenoh processes or configure a custom port using the listen.endpoints.router setting. + +### Step 2: Publish Data + +On 2nd device, use z_put to send a key-value pair that will be handled by the zenohd storage: + +```bash +cd ~/zenoh/target/release/examples +./z_put -k demo/example/test1 -p "Hello from storage!" +``` + +This command stores the string `Hello from storage!` under the key demo/example/test1. + + +### Step 3: Query the Data + +Back on first Raspberry Pi, you can now query the stored data from any Zenoh-connected node: + +```bash +cd ~/zenoh/target/release/examples +./z_get -s demo/example/test1 +``` + +You should see an output similar to: + +```bash +Sending Query 'demo/example/test1'... +>> Received ('demo/example/test1': 'Hello from storage!') +``` + +The result will look like: +![img2 alt-text#center](zenoh_ex2.gif "Figure 2: Storage and Query") + +{{% notice tip %}} +If you have more than two Raspberry Pi devices, you can run the z_get command on a third RPi to validate that storage queries work seamlessly across a multi-node setup. 
+{{% /notice %}} + +This example shows how Zenoh's Storage + Query model supports asynchronous data access and resilient state-sharing—critical capabilities in robotics and industrial IoT systems where network connectivity may be intermittent or system components loosely coupled. + diff --git a/content/learning-paths/cross-platform/zenoh-multinode-ros2/6_zenoh-ex3-queryable.md b/content/learning-paths/cross-platform/zenoh-multinode-ros2/6_zenoh-ex3-queryable.md new file mode 100644 index 0000000000..13f15e0020 --- /dev/null +++ b/content/learning-paths/cross-platform/zenoh-multinode-ros2/6_zenoh-ex3-queryable.md @@ -0,0 +1,73 @@ +--- +title: Zenoh Example-3 Computation on Query using Queryable +weight: 7 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Example 3: Computation on Query using Queryable + +Next, you’ll explore Zenoh's queryable capability, which lets a node dynamically respond to data queries by executing a custom computation or data generation function in this example. + +Unlike zenohd which simply returns stored data, a queryable node can register to handle a specific key expression and generate responses at runtime. This is ideal for distributed computing at the edge, where lightweight devices—such as Raspberry Pi nodes—can respond to requests with calculated values (e.g., sensor fusion, AI inference results, or diagnostics). + +### Use Case: On-Demand Battery Health Estimation + +Imagine a robot fleet management system where the central planner queries each robot for its latest battery health score, which is not published continuously but calculated only when queried. + +This saves bandwidth and enables edge compute optimization using Zenoh's Queryable. + +### Step 1: Launch a Queryable Node + +On one Raspberry Pi device, run the built-in Zenoh example to register a queryable handler. 
+ +```bash +cd ~/zenoh/target/release/examples +./z_queryable +``` + +You'll see the output like: + +``` +pi@raspberrypi:~/zenoh/target/release/examples$ ./z_queryable +Opening session... +Declaring Queryable on 'demo/example/zenoh-rs-queryable'... +Press CTRL-C to quit... +``` + +The node is now ready to accept queries on the key demo/example/zenoh-rs-queryable and respond with a predefined message. + +### Step 2: Trigger a Query from Another Node + +On another Raspberry Pi device, run: + +```bash +cd ~/zenoh/target/release/examples +./z_get -s demo/example/zenoh-rs-queryable +``` + +You should see: + +``` +./z_get -s demo/example/zenoh-rs-queryable +Opening session... +Sending Query 'demo/example/zenoh-rs-queryable'... +>> Received ('demo/example/zenoh-rs-queryable': 'Queryable from Rust!') +``` + +The result will look like: +![img3 alt-text#center](zenoh_ex3.gif "Figure 3: Computation on Query using Queryable") + +The value you receive comes not from storage, but from the computation inside the queryable handler. + +### Real-World Application: Distributed Inference & Computation + +This model enables edge-based intelligence, such as: +- Executing custom logic in response to a query (e.g., “calculate load average”) +- Triggering ML inference (e.g., “classify image X on demand”) +- Decentralized diagnostics (e.g., “report actuator status”) + +Queryable is a key feature for data-in-use scenarios, allowing fine-grained, on-demand compute inside your Zenoh-powered architecture. + +Next, you’ll extend this Queryable pattern to perform parameterized computation — simulating edge diagnostics and adaptive inference. 
\ No newline at end of file diff --git a/content/learning-paths/cross-platform/zenoh-multinode-ros2/7_zenoh-querycomp.md b/content/learning-paths/cross-platform/zenoh-multinode-ros2/7_zenoh-querycomp.md new file mode 100644 index 0000000000..a870fe5b54 --- /dev/null +++ b/content/learning-paths/cross-platform/zenoh-multinode-ros2/7_zenoh-querycomp.md @@ -0,0 +1,166 @@ +--- +title: Zenoh Example-4 Dynamic Queryable with Computation +weight: 8 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Parameterized Battery Health Estimation with Zenoh Queryable + +Finally, you’ll combine pub/sub, storage, and queryable components to simulate a distributed computation flow—demonstrating how Zenoh enables intelligent, coordinated edge systems. + +You’ll learn how to use Zenoh's Queryable API in Rust to build a parameterized query system for estimating battery health at the edge. + +This extends a previous example by supporting runtime query parameters like battery level and temperature. + +## Use Case: Real-time Battery Health via Computation + +In robotic fleet management, a central controller may need to assess each robot’s battery health on demand. + +Instead of streaming data continuously, robots expose a queryable endpoint that returns a real-time health score based on current battery level and temperature. + +This saves bandwidth and enables lightweight edge-side decision-making. + +### Step 1: Create a New Zenoh Rust Project + +On any Raspberry Pi: + +```bash +cd ~/zenoh +cargo new zenoh_battery_estimator +``` + +Update the following `dependencies` setting in `~/zenoh/zenoh_battery_estimator/Cargo.toml`: + +``` +[dependencies] +zenoh = { path = "../zenoh" } +tokio = { version = "1", features = ["full"] } +url = "2" +``` + +### Step 2: Implement the Queryable Node + +Next, implement the queryable node on the same Raspberry Pi.
 + +Replace the contents of ~/zenoh/zenoh_battery_estimator/src/main.rs with: + +```rust +use zenoh::{open, Config}; +use std::collections::HashMap; +use url::form_urlencoded; + +#[tokio::main] +async fn main() -> zenoh::Result<()> { + let session = open(Config::default()).await?; + + let _queryable = session + .declare_queryable("robot/battery/estimate") + .callback(|query| { + tokio::spawn(async move { + let selector = query.selector(); + let key = selector.key_expr(); + + let params = selector.parameters().as_str(); + + let decoded: HashMap<_, _> = + form_urlencoded::parse(params.as_bytes()).into_owned().collect(); + + let battery = decoded + .get("level") + .unwrap_or(&"50".to_string()) + .parse::<u32>() + .unwrap_or(50); + + let temp = decoded + .get("temp") + .unwrap_or(&"25".to_string()) + .parse::<u32>() + .unwrap_or(25); + + let health_score = 100 - (100 - battery) - ((temp.saturating_sub(25)) / 2); + let response = format!("Estimated battery health: {}%", health_score); + + let _ = query.reply(key, response).await; + }); + }) + .await?; + + println!("Queryable running on 'robot/battery/estimate'"); + tokio::signal::ctrl_c().await.unwrap(); + Ok(()) +} +``` + +This edge node responds to real-time queries using Zenoh's Queryable API. It listens for requests on the robot/battery/estimate key and returns a calculated battery health score based on provided input parameters. + +The program starts by establishing a Zenoh session using open(Config::default()). It then registers a queryable resource on the robot/battery/estimate key. Whenever this key is queried, a callback function is invoked asynchronously using tokio::spawn. + +Inside the callback: +- Query parameters are extracted from the URL-style selector string. +- Two main parameters are used: level (battery percentage) and temp (temperature in Celsius). +- A health score is computed from these inputs. +- The result is sent back to the querying client using query.reply().
+ +This design pattern enables efficient, on-demand data exchange with minimal bandwidth usage—ideal for edge computing scenarios where resources and connectivity are constrained. + +The health score is calculated using the following logic: + +```rust +let health_score = 100 - (100 - battery) - ((temp.saturating_sub(25)) / 2); +``` +This formula estimates battery health as a percentage, considering both battery level and temperature: +- battery: Current battery level (default 50%) +- temp: Current temperature (default 25°C) + +The health estimation logic begins with the battery level as the baseline score. +If the temperature rises above 25°C, the score is adjusted downward—specifically, for every 2°C above this threshold, the health is reduced by 1%. +To ensure the calculation remains safe even when the temperature is below 25°C, the code uses saturating_sub(25), which prevents the result from becoming negative and avoids potential underflow errors. + +For example, if battery = 88 and temp = 32, then: +- Temperature offset = (32 - 25) / 2 = 3 +- Health = 88 - 3 = 85% + +### Step 3: Build and Run + +```bash +cd ~/zenoh/zenoh_battery_estimator +cargo build --release +``` + +After the build process, you will see: + +``` +cargo build --release + Compiling zenoh_battery_estimator v1.4.0 (/home/ubuntu/zenoh_v1.4/zenoh_battery_estimator) + Finished `release` profile [optimized] target(s) in 1m 22s +``` + +### Step 4: Query It with Parameters + +Run it on the Raspberry Pi you just built. +```bash +cd ~/zenoh/target/release/ +./zenoh_battery_estimator +``` + +You can reuse the built-in Zenoh z_get CLI in another Raspberry Pi. 
 + +```bash +cd ~/zenoh/target/release/examples +./z_get -s "robot/battery/estimate?level=88&temp=32" +``` + +The result will look like: +![img4 alt-text#center](zenoh_ex4.gif "Figure 4: Dynamic Queryable with Computation") + +The expected output will be: +``` +>> Received ('robot/battery/estimate': 'Estimated battery health: 85%') +``` + +You’ve just built a responsive, parameterized edge compute service using Zenoh's Queryable API in Rust — a lightweight but powerful pattern for real-time intelligence at the edge. + +This approach not only minimizes network overhead but also enables each device to process and respond to context-aware queries on demand. +It’s a strong foundation for building scalable, event-driven IoT systems that can adapt dynamically to operational needs. diff --git a/content/learning-paths/cross-platform/zenoh-multinode-ros2/_index.md b/content/learning-paths/cross-platform/zenoh-multinode-ros2/_index.md new file mode 100644 index 0000000000..48fa9c5db9 --- /dev/null +++ b/content/learning-paths/cross-platform/zenoh-multinode-ros2/_index.md @@ -0,0 +1,62 @@ +--- +title: Scalable Networking for Industrial and Robotics with Zenoh on Raspberry Pi + +draft: true +cascade: + draft: true + +minutes_to_complete: 45 + +who_is_this_for: This learning path is designed for robotics developers, industrial automation engineers, and IoT system architects building distributed, scalable, and low-latency applications. Whether you are using Robot Operating System (ROS), developing autonomous systems, or designing multi-node communication frameworks, this guide will show you how to leverage the Eclipse Zenoh protocol on Arm-based platforms — both in the cloud (AVH or EC2) and on physical devices like Raspberry Pi. + +learning_objectives: + - Understand Zenoh's architecture and its integration of pub/sub, storage, querying, and computation models. + - Build and run Zenoh examples on both Arm servers and Raspberry Pi.
 - Set up and deploy a multi-node Zenoh system using Arm-based hardware or virtual environments. + +prerequisites: + - At least two [Raspberry Pi5 or Pi4](https://www.raspberrypi.com/products/raspberry-pi-5/) or other Cortex-A instances with a Linux-based OS installed. + - Basic understanding of the Linux command line. + - Experience with ROS 2 applications. + - A Corellium account for virtual hardware testing. (Optional) + +author: + - Odin Shen + - William Liang + - ChenYing Kuo + +skilllevels: Introductory +subjects: Performance and Architecture +armips: + - Cortex-A + - Neoverse +tools_software_languages: + - ROS2 + - C + - Raspberry Pi + +operatingsystems: + - Linux +### Cross-platform metadata only +shared_path: true +shared_between: + - iot + - automotive + +further_reading: + - resource: + title: Eclipse Zenoh Website + link: https://zenoh.io/ + type: documentation + - resource: + title: Eclipse Zenoh Github + link: https://github.com/eclipse-zenoh/zenoh + type: documentation + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content.
+--- diff --git a/content/learning-paths/cross-platform/zenoh-multinode-ros2/_next-steps.md b/content/learning-paths/cross-platform/zenoh-multinode-ros2/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/cross-platform/zenoh-multinode-ros2/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/cross-platform/zenoh-multinode-ros2/zenoh_ex1.gif b/content/learning-paths/cross-platform/zenoh-multinode-ros2/zenoh_ex1.gif new file mode 100644 index 0000000000..0972506543 Binary files /dev/null and b/content/learning-paths/cross-platform/zenoh-multinode-ros2/zenoh_ex1.gif differ diff --git a/content/learning-paths/cross-platform/zenoh-multinode-ros2/zenoh_ex2.gif b/content/learning-paths/cross-platform/zenoh-multinode-ros2/zenoh_ex2.gif new file mode 100644 index 0000000000..eaa36bf6e9 Binary files /dev/null and b/content/learning-paths/cross-platform/zenoh-multinode-ros2/zenoh_ex2.gif differ diff --git a/content/learning-paths/cross-platform/zenoh-multinode-ros2/zenoh_ex3.gif b/content/learning-paths/cross-platform/zenoh-multinode-ros2/zenoh_ex3.gif new file mode 100644 index 0000000000..119a79c2d9 Binary files /dev/null and b/content/learning-paths/cross-platform/zenoh-multinode-ros2/zenoh_ex3.gif differ diff --git a/content/learning-paths/cross-platform/zenoh-multinode-ros2/zenoh_ex4.gif b/content/learning-paths/cross-platform/zenoh-multinode-ros2/zenoh_ex4.gif new file mode 100644 index 0000000000..480278f1fb Binary files /dev/null 
and b/content/learning-paths/cross-platform/zenoh-multinode-ros2/zenoh_ex4.gif differ diff --git a/content/learning-paths/embedded-and-microcontrollers/_index.md b/content/learning-paths/embedded-and-microcontrollers/_index.md index 89d08161fc..8ee2672ec5 100644 --- a/content/learning-paths/embedded-and-microcontrollers/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/_index.md @@ -11,8 +11,8 @@ maintopic: true operatingsystems_filter: - Android: 1 - Baremetal: 30 -- Linux: 28 -- macOS: 6 +- Linux: 29 +- macOS: 7 - RTOS: 9 - Windows: 4 subjects_filter: @@ -20,7 +20,7 @@ subjects_filter: - Containers and Virtualization: 6 - Embedded Linux: 4 - Libraries: 3 -- ML: 14 +- ML: 15 - Performance and Architecture: 21 - RTOS Fundamentals: 4 - Security: 2 @@ -32,10 +32,10 @@ tools_software_languages_filter: - Arduino: 2 - Arm Compiler for Embedded: 7 - Arm Compiler for Linux: 1 -- Arm Compute Library: 1 +- Arm Compute Library: 2 - Arm Development Studio: 8 - Arm Fast Models: 4 -- Arm Virtual Hardware: 10 +- Arm Virtual Hardware: 11 - Assembly: 1 - AVH: 1 - C: 3 @@ -53,12 +53,12 @@ tools_software_languages_filter: - DSTREAM: 2 - Edge AI: 1 - Edge Impulse: 1 -- ExecuTorch: 2 -- Fixed Virtual Platform: 9 +- ExecuTorch: 3 +- Fixed Virtual Platform: 10 - FPGA: 1 - Fusion 360: 1 - FVP: 1 -- GCC: 8 +- GCC: 9 - GenAI: 2 - GitHub: 3 - GitLab: 1 @@ -79,8 +79,8 @@ tools_software_languages_filter: - NumPy: 1 - Paddle: 1 - Porcupine: 1 -- Python: 6 -- PyTorch: 2 +- Python: 7 +- PyTorch: 3 - QEMU: 1 - Raspberry Pi: 6 - Remote.It: 1 diff --git a/content/learning-paths/embedded-and-microcontrollers/edge/_index.md b/content/learning-paths/embedded-and-microcontrollers/edge/_index.md index 46cc03fa70..982192d8a1 100644 --- a/content/learning-paths/embedded-and-microcontrollers/edge/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/edge/_index.md @@ -7,19 +7,19 @@ cascade: minutes_to_complete: 90 -who_is_this_for: This learning path is for beginners in 
Edge AI and TinyML, including developers, engineers, hobbyists, AI/ML enthusiasts, and researchers working with embedded AI and IoT. +who_is_this_for: This learning path is for beginners in Edge AI and TinyML, including developers, engineers, hobbyists, AI/ML enthusiasts, and researchers working with embedded AI and IoT. learning_objectives: - - Understand Edge AI and TinyML basics. - - Collect and preprocess audio data using Edge Impulse. - - Train and deploy an audio classification model on Arduino Nano RP2040 - - Interface with LEDs to switch them on and off . + - Understand Edge AI and TinyML basics. + - Collect and preprocess audio data using Edge Impulse. + - Train and deploy an audio classification model on Arduino Nano RP2040. + - Interface with LEDs to switch them on and off. prerequisites: - Explore this [learning path](https://learn.arm.com/learning-paths/embedded-and-microcontrollers/arduino-pico/) if you are an absolute beginner. - - An [Edge Impulse](https://edgeimpulse.com/) Studio account. - - The [Arduino IDE with the RP2040 board support package](https://learn.arm.com/install-guides/arduino-pico/) installed on your computer - - An Arduino Nano RP2040 Connect [board](https://store.arduino.cc/products/arduino-nano-rp2040-connect-with-headers?_gl=1*9t4cti*_up*MQ..*_ga*NTA1NTQwNzgxLjE3NDYwMjIyODk.*_ga_NEXN8H46L5*MTc0NjAyMjI4Ny4xLjEuMTc0NjAyMjMxOC4wLjAuMjA3MjA2NTUzMA..). + - An [Edge Impulse](https://edgeimpulse.com/) Studio account. + - The [Arduino IDE with the RP2040 board support package](https://learn.arm.com/install-guides/arduino-pico/) installed on your computer + - An Arduino Nano RP2040 Connect [board](https://store.arduino.cc/products/arduino-nano-rp2040-connect-with-headers?_gl=1*9t4cti*_up*MQ..*_ga*NTA1NTQwNzgxLjE3NDYwMjIyODk.*_ga_NEXN8H46L5*MTc0NjAyMjI4Ny4xLjEuMTc0NjAyMjMxOC4wLjAuMjA3MjA2NTUzMA..). 
author: Bright Edudzi Gershon Kordorwu ### Tags @@ -27,7 +27,6 @@ skilllevels: Introductory subjects: ML armips: - Cortex-M - tools_software_languages: - Edge Impulse - tinyML @@ -35,24 +34,21 @@ tools_software_languages: - Arduino operatingsystems: - Baremetal - - further_reading: - - resource: - title: TinyML Brings AI to Smallest Arm Devices + title: TinyML Brings AI to Smallest Arm Devices link: https://newsroom.arm.com/blog/tinyml type: blog - resource: - title: What is edge AI? + title: What is edge AI? link: https://docs.edgeimpulse.com/nordic/concepts/edge-ai/what-is-edge-ai type: blog - resource: - title: Edge Impulse for Beginners + title: Edge Impulse for Beginners link: https://docs.edgeimpulse.com/docs/readme/for-beginners - type: doc + type: doc diff --git a/content/learning-paths/embedded-and-microcontrollers/edge/connect-and-set-up-arduino.md b/content/learning-paths/embedded-and-microcontrollers/edge/connect-and-set-up-arduino.md index bab4d7638e..9232e6a543 100644 --- a/content/learning-paths/embedded-and-microcontrollers/edge/connect-and-set-up-arduino.md +++ b/content/learning-paths/embedded-and-microcontrollers/edge/connect-and-set-up-arduino.md @@ -8,17 +8,17 @@ layout: learningpathall ### Arduino Nano RP2040 -To get started with your first **TinyML project**, a great option is the **Arduino Nano RP2040 Connect**. Built by Arduino, it uses the powerful **RP2040 microcontroller** and is fully supported by the Arduino core package. The board comes with built-in Wi-Fi, Bluetooth, and an onboard IMU—features that make it ideal for deploying machine learning models at the edge. +To get started with your first **TinyML project**, the **Arduino Nano RP2040 Connect** is a good option. Built by Arduino, it uses the **RP2040 microcontroller** and is fully supported by the Arduino core package. The board comes with built-in Wi-Fi, Bluetooth, and an onboard IMU—features that is useful for deploying machine learning models at the edge. 
![example image alt-text#center](images/nano.png "Arduino Nano RP2040") -Its compatibility with popular tools like Edge Impulse and the Arduino IDE makes it a beginner-friendly yet powerful choice for TinyML applications. You can learn more about the Arduino Nano RP2040 Connect on the [official Arduino website](https://store.arduino.cc/products/arduino-nano-rp2040-connect-with-headers?_gl=1*1laabar*_up*MQ..*_ga*MTk1Nzk5OTUwMS4xNzQ2NTc2NTI4*_ga_NEXN8H46L5*czE3NDY1NzY1MjUkbzEkZzEkdDE3NDY1NzY5NTkkajAkbDAkaDE1MDk0MDg0ODc.). +Its compatibility with popular tools like Edge Impulse and the Arduino IDE makes it a suitable choice for TinyML applications. You can learn more about the Arduino Nano RP2040 Connect on the [official Arduino website](https://store.arduino.cc/products/arduino-nano-rp2040-connect-with-headers?_gl=1*1laabar*_up*MQ..*_ga*MTk1Nzk5OTUwMS4xNzQ2NTc2NTI4*_ga_NEXN8H46L5*czE3NDY1NzY1MjUkbzEkZzEkdDE3NDY1NzY5NTkkajAkbDAkaDE1MDk0MDg0ODc.). ## Put everything together ### Step 1: Connect the LED to the Arduino Nano RP2040 -To visualize the output of the voice command model, we will use a simple LED circuit. +To visualize the output of the voice command model, you will use a simple LED circuit. ### Components Needed @@ -40,9 +40,7 @@ To visualize the output of the voice command model, we will use a simple LED cir To program and deploy your trained model to the Arduino Nano RP2040, you first need to configure your development environment. 
-Follow the detailed setup instructions provided in the following learning path: - -[Arduino Nano RP2040 Setup Guide](https://learn.arm.com/install-guides/arduino-pico/) +Follow the detailed setup instructions provided in the [Arduino Nano RP2040 Install Guide](https://learn.arm.com/install-guides/arduino-pico/) This guide will walk you through: diff --git a/content/learning-paths/embedded-and-microcontrollers/edge/overview.md b/content/learning-paths/embedded-and-microcontrollers/edge/overview.md index 580524ad61..a637b275d6 100644 --- a/content/learning-paths/embedded-and-microcontrollers/edge/overview.md +++ b/content/learning-paths/embedded-and-microcontrollers/edge/overview.md @@ -1,11 +1,13 @@ --- -title: Overview +title: Overview weight: 2 ### FIXED, DO NOT MODIFY layout: learningpathall --- +This section introduces the related topics that make out the basis for this learning path. Review it before proceeding to the step-by-step tutorial. + # Edge AI Edge AI refers to artificial intelligence models that run directly on edge devices, processing data locally rather than relying on cloud computing. These models are optimized for real-time decision-making on resource-constrained devices, such as microcontrollers, embedded systems, and IoT sensors. @@ -45,9 +47,9 @@ Wearable devices also benefit from Edge AI. Smartwatches monitor health by detec In industrial settings, predictive maintenance applications rely on IoT sensors to monitor vibrations and temperatures, helping prevent machinery failures. Smart agriculture systems use soil condition sensors to optimize irrigation and fertilization, while autonomous vehicles process sensor data for real-time navigation and obstacle detection. -## Importance of Edge AI +## The BLERP mnemonic -To understand the benefits of **Edge AI**, just **BLERP**, BLERP highlights the critical aspects of deploying machine learning models on edge devices, focusing on **Bandwidth, Latency, Economics, Reliability, and Privacy**. 
These components are key to understanding the advantages of processing data on-device rather than relying on the cloud. The table below provides an overview of each component and its importance in Edge AI applications "Situnayake, 2023" +To help remember the benefits of **Edge AI**, **BLERP** highlights the critical aspects of deploying machine learning models on edge devices. First used by Situnayake in 2023, the abbreviation expands to **Bandwidth, Latency, Economics, Reliability, and Privacy**. These components are key to understanding the advantages of processing data on-device rather than relying on the cloud. The table below provides an overview of each component and its importance in Edge AI applications. | Area | Description | |------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------| @@ -65,5 +67,5 @@ Edge AI is transforming multiple industries. In healthcare, AI-powered medical d To build effective TinyML and Edge AI projects, one needs more than just data—**both software and hardware** play a critical role in the development process. While data forms the foundation for training machine learning models, the **software** enables data processing, model development, and deployment, and the **hardware** provides the physical platform for running these models at the edge. -In this learning path, we will build a model that recognize specific voice commands, which will be used to **control LEDs on the Arduino Nano RP2040 Connect**. In the following steps, both software and hardware components will be discussed in detail. +In this learning path, you will build a model that recognize specific voice commands, which will be used to **control LEDs on the Arduino Nano RP2040 Connect**. In the following steps, both software and hardware components will be discussed in detail. 
diff --git a/content/learning-paths/embedded-and-microcontrollers/edge/program-and-deployment.md b/content/learning-paths/embedded-and-microcontrollers/edge/program-and-deployment.md index 4993fcfe29..57a43ed544 100644 --- a/content/learning-paths/embedded-and-microcontrollers/edge/program-and-deployment.md +++ b/content/learning-paths/embedded-and-microcontrollers/edge/program-and-deployment.md @@ -8,7 +8,7 @@ layout: learningpathall # Programming your first tinyML device -This Learning Path provides a complete sketch that you can upload onto your Arduino Nano RP2040. Follow the steps below to get started. +This section helps you create a complete sketch that you can upload onto your Arduino Nano RP2040. ## Step 1: Create a New Sketch @@ -20,7 +20,7 @@ This Learning Path provides a complete sketch that you can upload onto your Ardu ## Step 2: Upload the Arduino Library from Edge Impulse 1. After creating and saving your sketch, go to **Sketch** > **Include Library** > **Add .ZIP Library**. -2. In the file dialog that opens, navigate to the location of the **ZIP file** you exported from Edge Impulse in [Set up your environment](http://localhost:1313/learning-paths/embedded-and-microcontrollers/egde/software_edge_impulse/) +2. In the file dialog that opens, navigate to the location of the **ZIP file** you exported from Edge Impulse in [Set up your environment](/learning-paths/embedded-and-microcontrollers/edge/software-edge-impulse/) 3. Select the **ZIP file** and click **Open**. ## Step 3: Include the Library in Your Sketch @@ -33,9 +33,9 @@ The libray should be of the form `Name_of_your_library_inferencing.h` # Code walk-through -Before running the code, it’s important to understand what each part does. +In the example repository, you will find a code snippet `Code_Sample.ino`, which is used as a sketch for the project. Before running the code, it’s important to understand what each part does. 
-Take a few minutes to read through the comments and logic in the sketch before uploading it to your board. The code can be downloaded [here](jkhkjhjk). +Take a few minutes to read through the comments and logic in the sketch before uploading it to your board. The code is available in the example repository, and below is a walk-through of the steps. ## Include Necessary Libraries and Define Data Structure for Inference @@ -277,12 +277,12 @@ void print_inference_result(ei_impulse_result_t result) { ``` {{% notice Note %}} -The `ei_printf` command is a custom logging function from the Edge Impulse SDK, used for printing debug or inference-related information to the serial monitor, optimized for embedded systems. It works similarly to `printf` but is tailored for the Edge Impulse environment. You can download the complete [Code_Sample.ino](https://github.com/e-dudzi/Learning-Path.git) and try it out yourself. +The `ei_printf` command is a custom logging function from the Edge Impulse SDK, used for printing debug or inference-related information to the serial monitor, optimized for embedded systems. It works similarly to `printf` but is tailored for the Edge Impulse environment. You can find the complete `Code_Sample.ino` in the example repository and try it out yourself. {{% /notice %}} # Run Your Code -Now that you have a good understanding of the code, you should run it on your device. With your **Arduino Nano RP2040** plugged into your computer, and the correct [board and port](http://localhost:1313/learning-paths/embedded-and-microcontrollers/egde/connect-and-set-up-arduino/) selected in the Arduino IDE, follow these steps: +Now that you have a good understanding of the code, you should run it on your device. 
With your **Arduino Nano RP2040** plugged into your computer, and the correct [board and port](/learning-paths/embedded-and-microcontrollers/edge/connect-and-set-up-arduino/) selected in the Arduino IDE, follow these steps: #### If you're using the **Upload Button** diff --git a/content/learning-paths/embedded-and-microcontrollers/edge/software-edge-impulse.md b/content/learning-paths/embedded-and-microcontrollers/edge/software-edge-impulse.md index 5d790d22ec..6401b0022d 100644 --- a/content/learning-paths/embedded-and-microcontrollers/edge/software-edge-impulse.md +++ b/content/learning-paths/embedded-and-microcontrollers/edge/software-edge-impulse.md @@ -7,7 +7,7 @@ layout: learningpathall --- # Using Edge Impulse to Train TinyML Models -Now that the foundational concepts of TinyML and Edge AI are clear, it's time to move from theory to practice. One of the most accessible and easy to use platforms for training TinyML models is **Edge Impulse**. It provides an intuitive, end-to-end pipeline for collecting data, designing features, training models, and deploying them to edge devices. In this section, we will explore how Edge Impulse is used to train models specifically for ultra-low-power microcontrollers, bridging the gap between machine learning and real-world embedded applications. +Now that the foundational concepts of TinyML and Edge AI are clear, it's time to move from theory to practice. **Edge Impulse** is an easy to use platform for training TinyML models. It provides an end-to-end pipeline for collecting data, designing features, training models, and deploying them to edge devices. In this section, you will explore how Edge Impulse is used to train models specifically for ultra-low-power microcontrollers, bridging the gap between machine learning and real-world embedded applications. ## What is Edge Impulse? 
@@ -47,7 +47,7 @@ Once you’ve created your account and logged in, the first step is to **create ### Step 2: Configure the Target Device -After creating your project, the next step is to **configure the target device**. Since we are using the **Arduino Nano RP2040 Connect**, click the highlighted button to begin device configuration, as shown in the snapshot below. This ensures that the data collection, model training, and deployment steps are optimized for your specific hardware. +After creating your project, the next step is to **configure the target device**. Since you are using the **Arduino Nano RP2040 Connect**, click the highlighted button to begin device configuration, as shown in the snapshot below. This ensures that the data collection, model training, and deployment steps are optimized for your specific hardware. The specifications of the Arduino Nano RP2040 Connect board can be found on [Arduino’s official page](https://store.arduino.cc/products/arduino-nano-rp2040-connect). @@ -59,7 +59,13 @@ Follow the exact settings in the attached snapshot to complete the configuration With your device configured, the next step is to **add your dataset** to the project. Click on the **"Add existing data"** button and follow the configuration settings shown in the attached snapshot. This allows you to upload pre-recorded data instead of collecting it live, which can save time during the development phase. -The dataset for this project can be downloaded from the following link: [Download Dataset](https://github.com/e-dudzi/Learning-Path.git). The Dataset has already been split into **training** and **testing**. +An **example repository** has been set up with some assets to be used throughout this Learning Path. You can clone it with the following command: + +```bash +git clone https://github.com/e-dudzi/Learning-Path.git +``` + +The repository contains a `Dataset.zip` file containing the dataset used in the project. Extract it on your local machine. 
For convenience, the dataset has already been split into **training** and **testing**. ![example image alt-text#center](images/6.png "Figure 4. Add Existing Data") @@ -141,7 +147,7 @@ Review these metrics to determine if the model is learning effectively. If neede ![example image alt-text#center](images/15.png "Figure 12. Model Performance") -You can also [download](https://github.com/e-dudzi/Learning-Path.git) a pre-trained model and continue from here. +You can also use the pre-trained model in the `ei-edge-ai-tutorials-arduino-1.0.1.zip` archive, from the example repository. ### Final Step: Deploying the Model @@ -150,13 +156,13 @@ To use the trained model on your Arduino Nano RP2040, follow the steps below to 1. Click on the **Deployment** tab from the menu. 2. In the **search bar**, type **"Arduino"** to filter the export options. 3. Select **Arduino library** from the list. -4. The export process will start automatically, and the model will be downloaded as a `.zip` file. +4. If the export process does not start automatically, click **Build**. The model will be downloaded as a `.zip` file. ![example image alt-text#center](images/16.png "Figure 13. Model Deployment") ## Next Steps -In the following steps, you will move from model training to real-world deployment. Specifically, we will: +In the following steps, you will move from model training to real-world deployment. Specifically, you will: - Connect an **LED** to the **Arduino Nano RP2040** board. - Set up the **Arduino IDE** for development. 
diff --git a/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/1-overview.md b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/1-overview.md new file mode 100644 index 0000000000..7345c0c727 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/1-overview.md @@ -0,0 +1,25 @@ +--- +title: Overview +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Visualizing ML on Embedded Devices + +Selecting the best hardware for machine learning (ML) models depends on effective tools. You can visualize ML performance early in the development cycle by using Arm [Fixed Virtual Platforms](https://developer.arm.com/Tools%20and%20Software/Fixed%20Virtual%20Platforms) (FVPs). + +## TinyML + +This Learning Path uses TinyML. TinyML is machine learning tailored to function on devices with limited resources, constrained memory, low power, and fewer processing capabilities. + +For a learning path focused on creating and deploying your own TinyML models, please see [Introduction to TinyML on Arm using PyTorch and ExecuTorch](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/) + +## Benefits and applications + +New products, like Arm's [Ethos-U85](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u85) NPU are available on FVPs earlier than on physical devices. FVPs also have a graphical user interface (GUI), which is useful for ML performance visualization due to: +- visual confirmation that your ML model is running on the desired device, +- clearly indicated instruction counts, +- confirmation of total execution time and +- visually appealing output for prototypes and demos. 
diff --git a/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/2-env-setup.md b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/2-env-setup.md new file mode 100644 index 0000000000..ed0a605229 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/2-env-setup.md @@ -0,0 +1,78 @@ +--- +# User change +title: "Install ExecuTorch" + +weight: 3 + +# Do not modify these elements +layout: "learningpathall" +--- + +In this section, you will prepare a development environment to compile a machine learning model. + +## Introduction to ExecuTorch + +ExecuTorch is a lightweight runtime designed for efficient execution of PyTorch models on resource-constrained devices. It enables machine learning inference on embedded and edge platforms, making it well-suited for Arm-based hardware. Since Arm processors are widely used in mobile, IoT, and embedded applications, ExecuTorch leverages Arm's efficient CPU architectures to deliver optimized performance while maintaining low power consumption. By integrating with Arm's compute libraries, it ensures smooth execution of AI workloads on Arm-powered devices, from Cortex-M microcontrollers to Cortex-A application processors. + +## Install dependencies + +These instructions have been tested on Ubuntu 22.04, 24.04, and on Windows Subsystem for Linux (WSL). + +Python3 is required and comes installed with Ubuntu, but some additional packages are needed: + +```bash +sudo apt update +sudo apt install python-is-python3 python3-dev python3-venv gcc g++ make -y +``` + +## Create a virtual environment + +Create a Python virtual environment using `python venv`: + +```console +python3 -m venv $HOME/executorch-venv +source $HOME/executorch-venv/bin/activate +``` +The prompt of your terminal now has `(executorch)` as a prefix to indicate the virtual environment is active. 
+ + +## Install Executorch + +From within the Python virtual environment, run the commands below to download the ExecuTorch repository and install the required packages: + +``` bash +cd $HOME +git clone https://github.com/pytorch/executorch.git +cd executorch +``` + +Run the commands below to set up the ExecuTorch internal dependencies: + +```bash +git submodule sync +git submodule update --init +./install_executorch.sh +``` + +{{% notice Note %}} +If you run into an issue of `buck` running in a stale environment, reset it by running the following instructions: + +```bash +ps aux | grep buck +pkill -f buck +``` +{{% /notice %}} + +After running the commands, `executorch` should be listed upon running `pip list`: + +```bash +pip list | grep executorch +``` + +```output +executorch 0.6.0a0+3eea1f1 +``` + +## Next Steps + +Proceed to the next section to learn about and set up the virtualized hardware. diff --git a/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/3-env-setup-fvp.md b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/3-env-setup-fvp.md new file mode 100644 index 0000000000..43662d7525 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/3-env-setup-fvp.md @@ -0,0 +1,52 @@ +--- +# User change +title: "Set up the Corstone-320 FVP on Linux" + +weight: 4 # 1 is first, 2 is second, etc. + +# Do not modify these elements +layout: "learningpathall" +--- + +In this section, you will run scripts to set up the Corstone-320 reference package. + +The Corstone-320 Fixed Virtual Platform (FVP) is a pre-silicon software development environment for Arm-based microcontrollers. It provides a virtual representation of hardware, allowing developers to test and optimize software before actual hardware is available. 
Designed for AI and machine learning workloads, it includes support for Arm's Ethos-U NPU and Cortex-M processors, making it ideal for embedded AI applications. The FVP accelerates development by enabling early software validation and performance tuning in a flexible, simulation-based environment. + +The Corstone reference system is provided free of charge, although you will have to accept the license in the next step. For more information on Corstone-320, check out the [official documentation](https://developer.arm.com/documentation/109761/0000?lang=en). + +## Corstone-320 FVP Setup for ExecuTorch + +{{% notice macOS %}} + +Setting up FVPs on macOS requires some extra steps, outlined in GitHub repo [FVPs-on-Mac](https://github.com/Arm-Examples/FVPs-on-Mac/). macOS users must do this first, before setting up the Corstone-320 FVP. + +{{% /notice %}} + +Navigate to the Arm examples directory in the ExecuTorch repository. Run the following command. + +```bash +cd $HOME/executorch/examples/arm +./setup.sh --i-agree-to-the-contained-eula +``` + +After the script has finished running, it prints a command to run to finalize the installation. This step adds the FVP executables to your system path. + +```bash +source $HOME/executorch/examples/arm/ethos-u-scratch/setup_path.sh +``` + +Test that the setup was successful by running the `run.sh` script for Ethos-U85, which is the target device for Corstone-320: + +{{% notice macOS %}} + +**Start Docker:** on macOS, FVPs run inside a Docker container. + +{{% /notice %}} + +```bash + ./examples/arm/run.sh --target=ethos-u85-256 +``` + +You will see a number of examples run on the FVP. + +This confirms the installation, so you can now proceed to the Learning Path [Build a Simple PyTorch Model](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/4-build-model/). 
\ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/4-how-executorch-works.md b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/4-how-executorch-works.md new file mode 100644 index 0000000000..e10107404d --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/4-how-executorch-works.md @@ -0,0 +1,57 @@ +--- +# User change +title: "How ExecuTorch Works" + +weight: 5 # 1 is first, 2 is second, etc. + +# Do not modify these elements +layout: "learningpathall" +--- + +Source: [How ExecuTorch Works](https://docs.pytorch.org/executorch/stable/intro-how-it-works.html) (official PyTorch Documentation) +1. **Export the model:** + * Generate a Graph + * A graph is series of operators (ReLU, quantize, etc.) eligible for delegation to an accelerator + * Your goal is to identify operators for acceleration on the Ethos-U NPU +2. **Compile to ExecuTorch:** + * This is the ahead-of-time compiler + * This is why ExecuTorch inference is faster than PyTorch inference + * Delegate operators to an accelerator, like the Ethos-U NPU +3. **Run on targeted device:** + * Deploy the ML model to the Fixed Virtual Platform (FVP) or physical device + * Execute operators on the CPU and delegated operators on the Ethos-U NPU + +**Diagram of How ExecuTorch Works** +![How ExecuTorch works](./how-executorch-works-high-level.png) + +## Deploy a TinyML Model + +With your development environment set up, you can deploy a simple PyTorch model. + +This example deploys the [MobileNet V2](https://pytorch.org/hub/pytorch_vision_mobilenet_v2/) computer vision model. The model is a convolutional neural network (CNN) that extracts visual features from an image. It is used for image classification and object detection. 
+ +The actual Python code for the MobileNet V2 model is in your local `executorch` repo: [executorch/examples/models/mobilenet_v2/model.py](https://github.com/pytorch/executorch/blob/main/examples/models/mobilenet_v2/model.py). You can deploy it using [run.sh](https://github.com/pytorch/executorch/blob/main/examples/arm/run.sh), just like you did in the previous step, with some extra parameters: + +{{% notice macOS %}} + +**Start Docker:** on macOS, FVPs run inside a Docker container. + +{{% /notice %}} + +```bash +./examples/arm/run.sh \ +--aot_arm_compiler_flags="--delegate --quantize --intermediates mv2_u85/ --debug --evaluate" \ +--output=mv2_u85 \ +--target=ethos-u85-128 \ +--model_name=mv2 +``` + +**Explanation of run.sh Parameters** +|run.sh Parameter|Meaning / Context| +|--------------|-----------------| +|--aot_arm_compiler_flags|Passes a string of compiler options to the ExecuTorch Ahead-of-Time (AOT) compiler| +|--delegate|Enables backend delegation| +|--quantize|Converts the floating-point model to int8 quantized format using post-training quantization
**Essential for running on NPUs**| +|--intermediates mv2_u85/|Directory where intermediate files (e.g., TOSA, YAMLs, debug graphs) will be saved
Useful output files for **manual debugging**| +|--debug|Verbose debugging logging| +|--evaluate|Validates model output, provides timing estimates| diff --git a/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/5-configure-fvp-gui.md b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/5-configure-fvp-gui.md new file mode 100644 index 0000000000..7da0db60cf --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/5-configure-fvp-gui.md @@ -0,0 +1,94 @@ +--- +# User change +title: "Configure the FVP GUI (optional)" + +weight: 6 # 1 is first, 2 is second, etc. + +# Do not modify these elements +layout: "learningpathall" +--- + +## Find your IP address + +Note down your computer's IP address: +```bash +ip addr show +``` +To help you, here are some common WiFi interface names on Linux: +|Interface Name|Meaning / Context| +|--------------|-----------------| +|wlan0|Legacy name (common on older systems)| +|wlp2s0, wlp3s0|Predictable network naming scheme (modern systems)| +|wlx|Some systems name interfaces after MAC addresses| +|wifi0, ath0|Very rare, specific to certain drivers (e.g., Atheros)| + +{{% notice macOS %}} + +Note down your `en0` IP address (or whichever network adapter is active): + +```bash +ipconfig getifaddr en0 # Returns your Mac's WiFi IP address +``` + +{{% /notice %}} + +## Enable the FVP's GUI + +Edit the following parameters in [run_fvp.sh](https://github.com/pytorch/executorch/blob/d5fe5faadb8a46375d925b18827493cd65ec84ce/backends/arm/scripts/run_fvp.sh#L97-L102), to enable the Mobilenet V2 output on the FVP's GUI: + +```bash +-C mps4_board.subsystem.ethosu.num_macs=${num_macs} \ +-C mps4_board.visualisation.disable-visualisation=1 \ +-C vis_hdlcd.disable_visualisation=1 \ +-C mps4_board.telnetterminal0.start_telnet=0 \ +-C mps4_board.uart0.out_file='-' \ +-C mps4_board.uart0.shutdown_on_eot=1 \ +``` + +- Change 
`mps4_board.visualisation.disable-visualisation` to equal `0` +- Change `vis_hdlcd.disable_visualisation` to equal `0` +- Enter a `--display-ip` parameter and set it to your computer's IP address + +```bash +-C mps4_board.subsystem.ethosu.num_macs=${num_macs} \ +-C mps4_board.visualisation.disable-visualisation=0 \ +-C vis_hdlcd.disable_visualisation=0 \ +-C mps4_board.telnetterminal0.start_telnet=0 \ +-C mps4_board.uart0.out_file='-' \ +-C mps4_board.uart0.shutdown_on_eot=1 \ +--display-ip \ +``` + +## Deploy the model + +{{% notice macOS %}} + +- **Start Docker:** on macOS, FVPs run inside a Docker container. + + **Do not use Colima Docker!** + + - Make sure to use an [official version of Docker](https://www.docker.com/products/docker-desktop/) and not a free version like the [Colima](https://github.com/abiosoft/colima?tab=readme-ov-file) Docker container runtime + - `run.sh` assumes Docker Desktop style networking (`host.docker.internal`) which breaks with Colima + - Colima then breaks the FVP GUI + +- **Start XQuartz:** on macOS, the FVP GUI runs using XQuartz. 
+ + Start the xquartz.app and then configure XQuartz so that the FVP will accept connections from your Mac and localhost: + ```bash + xhost + + xhost + 127.0.0.1 # The Docker container seems to proxy through localhost + ``` +{{% /notice %}} + +Now run the Mobilenet V2 computer vision model, using [executorch/examples/arm/run.sh](https://github.com/pytorch/executorch/blob/main/examples/arm/run.sh): +```bash +./examples/arm/run.sh \ +--aot_arm_compiler_flags="--delegate --quantize --intermediates mv2_u85/ --debug --evaluate" \ +--output=mv2_u85 \ +--target=ethos-u85-128 \ +--model_name=mv2 +``` + +Observe that the FVP loads the model file, compiles the PyTorch model to ExecuTorch `.pte` format and then shows an instruction count in the top right of the GUI: + +![Terminal and FVP output](./Terminal%20and%20FVP%20Output.jpg) diff --git a/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/6-evaluate-output.md b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/6-evaluate-output.md new file mode 100644 index 0000000000..b6567d743f --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/6-evaluate-output.md @@ -0,0 +1,163 @@ +--- +# User change +title: "Evaluate Ethos-U Performance" + +weight: 7 # 1 is first, 2 is second, etc. + +# Do not modify these elements +layout: "learningpathall" +--- + +## Observe Ahead-of-Time Compilation +- The below output snippet from [run.sh](https://github.com/pytorch/executorch/blob/main/examples/arm/run.sh) is how you can confirm ahead-of-time compilation +- Specifically you want to see that the original PyTorch model was converted to an ExecuTorch `.pte` file +- For the MobileNet V2 example, the compiled ExecuTorch file will be output as `mv2_arm_delegate_ethos-u85-128.pte` + +{{% notice Note %}} + +In the below sample outputs, the `executorch` directory path is indicated as `/path/to/executorch`. 
Your actual path will depend on where you cloned your local copy of the [executorch repo](https://github.com/pytorch/executorch/tree/main). + +{{% /notice %}} + +**Ahead-of-Time Compiler Start:** +```bash { output_lines = "1-4" } +-------------------------------------------------------------------------------- +Running e2e flow for model 'mv2' with flags '--delegate --quantize --delegate --quantize --intermediates mv2_u85/ --debug --evaluate' +-------------------------------------------------------------------------------- +CALL python3 -m examples.arm.aot_arm_compiler --model_name=mv2 --target=ethos-u85-128 --delegate --quantize --delegate --quantize --intermediates mv2_u85/ --debug --evaluate --intermediate=/path/to/executorch/mv2_u85 --output=/path/to/executorch/mv2_u85/mv2_arm_delegate_ethos-u85-128.pte --system_config=Ethos_U85_SYS_DRAM_Mid --memory_mode=Sram_Only +``` + +**.pte File Build Completion:** +```bash { output_lines = "1-3" } +PTE file saved as /path/to/executorch/mv2_u85/mv2_arm_delegate_ethos-u85-128.pte +pte_data_size: 3809584 /path/to/executorch/mv2_u85/mv2_arm_delegate_ethos-u85-128.pte +pte_file: /path/to/executorch/mv2_u85/mv2_arm_delegate_ethos-u85-128.pte +``` + +**Ethos-U Delegate Build Start:** +```bash{ output_lines = "1-5" } ++ backends/arm/scripts/build_executor_runner.sh --et_build_root=/path/to/executorch/arm_test --pte=/path/to/executorch/mv2_u85/mv2_arm_delegate_ethos-u85-128.pte --build_type=Release --target=ethos-u85-128 --system_config=Ethos_U85_SYS_DRAM_Mid --memory_mode=Sram_Only --extra_build_flags= --ethosu_tools_dir=/path/to/executorch/examples/arm/ethos-u-scratch +-------------------------------------------------------------------------------- +Build Arm Baremetal executor_runner for ethos-u85-128 with /path/to/executorch/mv2_u85/mv2_arm_delegate_ethos-u85-128.pte using Ethos_U85_SYS_DRAM_Mid Sram_Only to '/path/to/executorch/mv2_u85/mv2_arm_delegate_ethos-u85-128/cmake-out' 
+-------------------------------------------------------------------------------- +``` + +**Ethos-U Delegate Build Completion:** +```bash { output_lines = "1" } +[100%] Built target arm_executor_runner +``` + +## Observe Test Batch Performance +By default, `run.sh` (and the underlying ethos_u_runner) uses: +- A constant input tensor, usually filled with zeros, ones, or random-ish synthetic data +- Input shape matches MobileNet V2: typically `[1, 3, 224, 224]` (batch size 1, 3 RGB channels, 224×224 image) +- Input tensor size: `1 × 3 × 224 × 224 × 1 byte = 150528 bytes ≈ 147 KB` + ```bash { output_lines = "1" } + Input SRAM bandwidth = 15.49 MB/batch + ``` +- `Batch Inference time` gives you a single performance metric for the Ethos-U85 (versus other [Ethos-U NPUs](https://developer.arm.com/Processors#q=Ethos-U&aq=%40navigationhierarchiescategories%3D%3D%22Processor%20products%22%20AND%20%40navigationhierarchiescontenttype%3D%3D%22Product%20Information%22&numberOfResults=48)) + ```bash { output_lines = "1" } + Batch Inference time 4.94 ms, 202.34 inferences/s (batch size 1) + ``` + +**Test Batch Performance:** +```bash { output_lines = "1-34" } +Network summary for out +Accelerator configuration Ethos_U85_128 +System configuration Ethos_U85_SYS_DRAM_Mid +Memory mode Sram_Only +Accelerator clock 1000 MHz +Design peak SRAM bandwidth 29.80 GB/s + +Total SRAM used 5178.77 KiB + +CPU operators = 0 (0.0%) +NPU operators = 64 (100.0%) + +Average SRAM bandwidth 7.21 GB/s +Input SRAM bandwidth 15.49 MB/batch +Weight SRAM bandwidth 11.87 MB/batch +Output SRAM bandwidth 6.66 MB/batch +Total SRAM bandwidth 35.65 MB/batch +Total SRAM bandwidth per input 35.65 MB/inference (batch size 1) + +Neural network macs 300836992 MACs/batch + +Info: The numbers below are internal compiler estimates. +For performance numbers the compiled network should be run on an FVP Model or FPGA. 
+ +Network Tops/s 0.12 Tops/s + +NPU cycles 4832315 cycles/batch +SRAM Access cycles 1168037 cycles/batch +DRAM Access cycles 0 cycles/batch +On-chip Flash Access cycles 0 cycles/batch +Off-chip Flash Access cycles 0 cycles/batch +Total cycles 4942076 cycles/batch + +Batch Inference time 4.94 ms, 202.34 inferences/s (batch size 1) +``` + +## Observe Operator Delegation +This output indicates which operators go to processors: +- **Ethos-U85 NPU:** `occurrences_in_delegated_graphs` +- **Cortex-M85 CPU:** `occurrences_in_non_delegated_graph` + +```bash { output_lines = "1-34" } +Total delegated subgraphs: 1 +Number of delegated nodes: 419 +Number of non-delegated nodes: 3 + +Delegation table: +╒════╤════════════════════════════════════════════════════╤═══════════════════════════════════╤═══════════════════════════════════════╕ +│ │ op_type │ occurrences_in_delegated_graphs │ occurrences_in_non_delegated_graphs │ +╞════╪════════════════════════════════════════════════════╪═══════════════════════════════════╪═══════════════════════════════════════╡ +│ 0 │ aten_add_tensor │ 10 │ 0 │ +├────┼────────────────────────────────────────────────────┼───────────────────────────────────┼───────────────────────────────────────┤ +│ 1 │ aten_clone_default │ 1 │ 0 │ +├────┼────────────────────────────────────────────────────┼───────────────────────────────────┼───────────────────────────────────────┤ +│ 2 │ aten_convolution_default │ 52 │ 0 │ +├────┼────────────────────────────────────────────────────┼───────────────────────────────────┼───────────────────────────────────────┤ +│ 3 │ aten_hardtanh_default │ 35 │ 0 │ +├────┼────────────────────────────────────────────────────┼───────────────────────────────────┼───────────────────────────────────────┤ +│ 4 │ aten_linear_default │ 1 │ 0 │ +├────┼────────────────────────────────────────────────────┼───────────────────────────────────┼───────────────────────────────────────┤ +│ 5 │ aten_mean_dim │ 1 │ 0 │ 
+├────┼────────────────────────────────────────────────────┼───────────────────────────────────┼───────────────────────────────────────┤ +│ 6 │ aten_view_copy_default │ 1 │ 0 │ +├────┼────────────────────────────────────────────────────┼───────────────────────────────────┼───────────────────────────────────────┤ +│ 7 │ cortex_m_dequantize_per_tensor_default │ 0 │ 1 │ +├────┼────────────────────────────────────────────────────┼───────────────────────────────────┼───────────────────────────────────────┤ +│ 8 │ cortex_m_quantize_per_tensor_default │ 0 │ 1 │ +├────┼────────────────────────────────────────────────────┼───────────────────────────────────┼───────────────────────────────────────┤ +│ 9 │ getitem │ 0 │ 1 │ +├────┼────────────────────────────────────────────────────┼───────────────────────────────────┼───────────────────────────────────────┤ +│ 10 │ quantized_decomposed_dequantize_per_tensor_default │ 217 │ 0 │ +├────┼────────────────────────────────────────────────────┼───────────────────────────────────┼───────────────────────────────────────┤ +│ 11 │ quantized_decomposed_quantize_per_tensor_default │ 101 │ 0 │ +├────┼────────────────────────────────────────────────────┼───────────────────────────────────┼───────────────────────────────────────┤ +│ 12 │ Total │ 419 │ 3 │ +╘════╧════════════════════════════════════════════════════╧═══════════════════════════════════╧═══════════════════════════════════════╛ +``` + +## Observe the Ethos-U Performance Monitoring Unit +This output shows Ethos-U performance, from the Performance Monitoring Unit (PMU) +```bash { output_lines = "1-7" } +I [executorch:arm_perf_monitor.cpp:180] Ethos-U PMU report: +I [executorch:arm_perf_monitor.cpp:181] ethosu_pmu_cycle_cntr : 4738932 +I [executorch:arm_perf_monitor.cpp:184] ethosu_pmu_cntr0 : 1447178 +I [executorch:arm_perf_monitor.cpp:184] ethosu_pmu_cntr1 : 420661 +I [executorch:arm_perf_monitor.cpp:184] ethosu_pmu_cntr2 : 0 +I [executorch:arm_perf_monitor.cpp:184] 
ethosu_pmu_cntr3 : 0 +I [executorch:arm_perf_monitor.cpp:184] ethosu_pmu_cntr4 : 130 +``` + +**Table of Ethos-U PMU Counters:** +|PMU Counter|Default Event Tracked|Description|Interpretation| +|-----------|---------------------|-----------|--------------| +|ethosu_pmu_cycle_cntr|Total NPU cycles|Counts the number of core clock cycles where the Ethos-U NPU was executing work.|High value = long runtime; use to compute throughput.| +|ethosu_pmu_cntr0|SRAM read data beats received(ETHOSU_PMU_SRAM_RD_DATA_BEAT_RECEIVED)|How many data beats (e.g., 64-bit words) the NPU read from local SRAM.|Indicates input + weight loading efficiency.| +|ethosu_pmu_cntr1|SRAM write data beats written(ETHOSU_PMU_SRAM_WR_DATA_BEAT_WRITTEN)|Number of data beats the NPU wrote back to SRAM (e.g., outputs or intermediate results).|Reflects output bandwidth usage.| +|ethosu_pmu_cntr2|External DRAM read beats(ETHOSU_PMU_EXT_RD_DATA_BEAT_RECEIVED)|Number of data beats read from off-chip memory (e.g., DRAM). Often 0 if Sram_Only is used.|If non-zero, may indicate cache misses or large model size.| +|ethosu_pmu_cntr3|External DRAM write beats(ETHOSU_PMU_EXT_WR_DATA_BEAT_WRITTEN)|Number of write data beats to external memory.|Helps detect offloading or insufficient SRAM.| +|ethosu_pmu_cntr4|Idle cycles(ETHOSU_PMU_NPU_IDLE)|Number of cycles where the NPU had no work scheduled (i.e., idle).|High idle count = possible pipeline stalls or bad scheduling.| diff --git a/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/Terminal and FVP Output.jpg b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/Terminal and FVP Output.jpg new file mode 100644 index 0000000000..7c6cde8116 Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/Terminal and FVP Output.jpg differ diff --git a/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/_index.md 
b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/_index.md new file mode 100644 index 0000000000..71ce781eab --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/_index.md @@ -0,0 +1,68 @@ +--- +title: Visualizing Ethos-U Performance on Arm FVPs + +draft: true +cascade: + draft: true + +minutes_to_complete: 120 + +who_is_this_for: This is an introductory topic for developers and data scientists new to Tiny Machine Learning (TinyML), who want to visualize ExecuTorch performance on a virtual device. + +learning_objectives: + - Identify suitable Arm-based devices for TinyML applications. + - Optionally, install Fixed Virtual Platforms (FVPs) on macOS. + - Deploy a TinyML ExecuTorch model to a Corstone-320 FVP. + - Observe model execution on the FVP's graphical user interface (GUI). + +prerequisites: + - Basic knowledge of Machine Learning concepts. + - A computer running Linux or macOS. + + +author: Waheed Brown + +### Tags
skilllevels: Introductory +subjects: ML +armips: + - Cortex-A + - Cortex-M + - Ethos-U + +operatingsystems: + - Linux + - macOS + +tools_software_languages: + - Arm Virtual Hardware + - Fixed Virtual Platform + - Python + - PyTorch + - ExecuTorch + - Arm Compute Library + - GCC + +further_reading: + - resource: + title: TinyML Brings AI to Smallest Arm Devices + link: https://newsroom.arm.com/blog/tinyml + type: blog + - resource: + title: Arm Machine Learning Resources + link: https://www.arm.com/developer-hub/embedded-and-microcontrollers/ml-solutions/getting-started + type: documentation + - resource: + title: Arm Developers Guide for Cortex-M Processors and Ethos-U NPU + link: https://developer.arm.com/documentation/109267/0101 + type: documentation + + + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" 
# All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/_next-steps.md b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/how-executorch-works-high-level.png b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/how-executorch-works-high-level.png new file mode 100644 index 0000000000..58b7369d5f Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/how-executorch-works-high-level.png differ diff --git a/content/learning-paths/iot/_index.md b/content/learning-paths/iot/_index.md index 221cec7ff4..90e093d397 100644 --- a/content/learning-paths/iot/_index.md +++ b/content/learning-paths/iot/_index.md @@ -13,10 +13,10 @@ subjects_filter: - Containers and Virtualization: 2 - Embedded Linux: 2 - ML: 2 -- Performance and Architecture: 2 +- Performance and Architecture: 3 operatingsystems_filter: - Baremetal: 4 -- Linux: 8 +- Linux: 9 - macOS: 2 - RTOS: 2 - Windows: 2 @@ -28,6 +28,7 @@ tools_software_languages_filter: - Azure: 1 - Balena Cloud: 1 - Balena OS: 1 +- C: 1 - Coding: 3 - Docker: 2 - Fixed Virtual Platform: 1 @@ -35,7 +36,8 @@ tools_software_languages_filter: - Matter: 1 - MCP: 1 - Python: 2 -- Raspberry Pi: 3 +- Raspberry Pi: 4 - Remote.It: 1 +- ROS2: 1 - VS Code: 1 --- diff --git a/content/learning-paths/laptops-and-desktops/_index.md b/content/learning-paths/laptops-and-desktops/_index.md index b3aa2da78f..47c7b9b83f 100644 --- a/content/learning-paths/laptops-and-desktops/_index.md +++ b/content/learning-paths/laptops-and-desktops/_index.md @@ -53,7 +53,7 @@ tools_software_languages_filter: - Kubernetes: 1 - Linux: 1 - LLM: 1 -- LLVM: 1 +- LLVM: 2 - llvm-mca: 1 - MSBuild: 1 - MTE: 1 diff --git a/content/learning-paths/laptops-and-desktops/intro/find-hardware.md b/content/learning-paths/laptops-and-desktops/intro/find-hardware.md index eb441bba39..1bb3d43b05 100644 --- a/content/learning-paths/laptops-and-desktops/intro/find-hardware.md +++ 
b/content/learning-paths/laptops-and-desktops/intro/find-hardware.md @@ -10,17 +10,16 @@ Desktops and laptops, based on the Arm architecture, are available with differen ### Windows -Windows on Arm laptops are available for software development. Some examples include: +[Windows on Arm](https://learn.microsoft.com/en-us/windows/arm/overview) laptops are available for software development from a variety of vendors. -- [Lenovo ThinkPad X13s](https://www.lenovo.com/us/en/p/laptops/thinkpad/thinkpadx/thinkpad-x13s-(13-inch-snapdragon)/len101t0019) -- [Surface Pro 9 with 5G](https://www.microsoft.com/en-us/d/surface-pro-9/93vkd8np4fvk) -- [Dell Inspiron 14](https://www.dell.com/en-us/shop/dell-laptops/inspiron-14-laptop/spd/inspiron-14-3420-laptop) - -There are many other Windows on Arm laptops available. ### ChromeOS -Chromebooks with Arm processors can also be used for software development. The Lenovo [Duet Gen 9](https://www.lenovo.com/us/en/p/laptops/lenovo/lenovo-edu-chromebooks/lenovo-chromebook-duet-gen-9-11-inch-mediatek/83hh0000us) is a popular detachable Chromebook. +Chromebooks with Arm processors can also be used for software development. + +If you are looking for high performance, the [Lenovo Chromebook Plus 14](https://www.bestbuy.com/site/lenovo-chromebook-plus-14-oled-2k-touchscreen-laptop-mediatek-kompanio-ultra-16gb-memory-256gb-ufs-seashell/6630493.p?skuId=6630493&intl=nosplash) is powered by the MediaTek Kompanio Ultra processor, the fastest Arm chip in a Chromebook. + +The Lenovo [Duet Gen 9](https://www.lenovo.com/us/en/p/laptops/lenovo/lenovo-edu-chromebooks/lenovo-chromebook-duet-gen-9-11-inch-mediatek/83hh0000us) is a popular detachable Chromebook. 
### Linux diff --git a/content/learning-paths/mobile-graphics-and-gaming/_index.md b/content/learning-paths/mobile-graphics-and-gaming/_index.md index 59b948e122..0ea7407037 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/_index.md @@ -51,6 +51,7 @@ tools_software_languages_filter: - Kotlin: 7 - LiteRT: 1 - LLM: 1 +- LLVM: 1 - llvm-mca: 1 - MediaPipe: 2 - Memory Bug Report: 1 diff --git a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/2-overview.md b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/2-overview.md index 1985a68688..4287f39064 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/2-overview.md +++ b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/2-overview.md @@ -46,7 +46,7 @@ The low-light enhancement pipeline is adapted from the LiveHDR+ method originall ![example image alt-text#center](lle_pipeline.png "Low-Light Enhancement Pipeline Diagram") -The Low-Resolution Coefficient Prediction Network (implemented with TFLite) performs computations such as: +The Low-Resolution Coefficient Prediction Network (implemented with LiteRT) performs computations such as: - Strided convolutions. - Local feature extraction using convolutional layers. - Global feature extraction using convolutional and fully connected layers. diff --git a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/3-build.md b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/3-build.md index 89128cfccb..04c311aaa9 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/3-build.md +++ b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/3-build.md @@ -74,9 +74,9 @@ Leave the container by pressing `Ctrl+D`. 
## Notes on the cmake configuration options - `-DENABLE_SME2=$ENABLE_SME2` with `ENABLE_SME2=0`: SME2 (Scalable Matrix Extension 2) is disabled in this build (`0`). -- `-DARMNN_TFLITE_PARSER=0`: Configures the `ai-camera-pipelines` repository to use TFLite with XNNPack instead of ArmNN. +- `-DARMNN_TFLITE_PARSER=0`: Configures the `ai-camera-pipelines` repository to use LiteRT with XNNPack instead of ArmNN. - `-DENABLE_KLEIDICV:BOOL=ON`: Enables KleidiCV for optimized image processing. -- `-DXNNPACK_ENABLE_KLEIDIAI:BOOL=ON`: Enables KleidiAI acceleration for TFLite workloads via XNNPack. +- `-DXNNPACK_ENABLE_KLEIDIAI:BOOL=ON`: Enables KleidiAI acceleration for LiteRT workloads via XNNPack. ## Install the pipelines diff --git a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/5-performances.md b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/5-performances.md index 4c11ee1168..1372ae1b5e 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/5-performances.md +++ b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/5-performances.md @@ -16,11 +16,11 @@ The application you built earlier includes a *benchmark mode* that runs the core These benchmarks demonstrate the performance improvements enabled by KleidiCV and KleidiAI: - KleidiCV enhances OpenCV performance with computation kernels optimized for Arm processors. -- KleidiAI accelerates TFLite + XNNPack inference using AI-optimized micro-kernels tailored for Arm CPUs. +- KleidiAI accelerates LiteRT + XNNPack inference using AI-optimized micro-kernels tailored for Arm CPUs. ## Performances with KleidiCV and KleidiAI -By default, the OpenCV library is built with KleidiCV support, and TFLite+xnnpack is built with KleidiAI support. +By default, the OpenCV library is built with KleidiCV support, and LiteRT+xnnpack is built with KleidiAI support. You can run the benchmarks using the applications you built earlier. 
diff --git a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/_index.md b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/_index.md index f6de0e33c7..1b39ea65e5 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/_index.md @@ -30,21 +30,22 @@ operatingsystems: - macOS further_reading: - - resource: title: Accelerate Generative AI Workloads Using KleidiAI link: https://learn.arm.com/learning-paths/cross-platform/kleidiai-explainer type: website - - resource: title: LLM Inference on Android with KleidiAI, MediaPipe, and XNNPACK link: https://learn.arm.com/learning-paths/mobile-graphics-and-gaming/kleidiai-on-android-with-mediapipe-and-xnnpack/ type: website - - resource: title: Vision LLM Inference on Android with KleidiAI and MNN link: https://learn.arm.com/learning-paths/mobile-graphics-and-gaming/vision-llm-inference-on-android-with-kleidiai-and-mnn/ type: website + - resource: + title: TensorFlow Lite is now LiteRT + link: https://developers.googleblog.com/en/tensorflow-lite-is-now-litert/ + type: blog ### FIXED, DO NOT MODIFY # ================================================================================ diff --git a/content/learning-paths/servers-and-cloud-computing/_index.md b/content/learning-paths/servers-and-cloud-computing/_index.md index b58279463c..37978e8ebc 100644 --- a/content/learning-paths/servers-and-cloud-computing/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/_index.md @@ -8,7 +8,7 @@ key_ip: maintopic: true operatingsystems_filter: - Android: 2 -- Linux: 152 +- Linux: 153 - macOS: 10 - Windows: 14 pinned_modules: @@ -22,7 +22,7 @@ subjects_filter: - Containers and Virtualization: 28 - Databases: 15 - Libraries: 9 -- ML: 27 +- ML: 28 - Performance and Architecture: 60 - Storage: 1 - Web: 10 @@ -34,6 +34,7 @@ tools_software_languages_filter: - 5G: 1 - ACL: 1 - AI: 1 
+- Amazon Web Services: 1 - Android Studio: 1 - Ansible: 2 - Arm Compiler for Linux: 1 @@ -108,10 +109,12 @@ tools_software_languages_filter: - Keras: 1 - Kubernetes: 10 - Lambda: 1 +- Libamath: 1 - libbpf: 1 -- Libmath: 1 - Linaro Forge: 1 +- Linux: 1 - Litmus7: 1 +- Llama.cpp: 1 - LLM: 9 - llvm-mca: 1 - LSE: 1 @@ -136,7 +139,7 @@ tools_software_languages_filter: - perf: 5 - Perf: 1 - PostgreSQL: 4 -- Python: 27 +- Python: 28 - PyTorch: 9 - RAG: 1 - Redis: 3 diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/01_launching_a_graviton4_instance.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/01_launching_a_graviton4_instance.md new file mode 100644 index 0000000000..772e4d96c5 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/01_launching_a_graviton4_instance.md @@ -0,0 +1,171 @@ +--- +title: Launching a Graviton4 instance +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Requirements + + - An AWS account + + - Access to launch an EC2 instance of type `c8g.4xlarge` (or larger) with at least 128 GB of storage + +For more information about creating an EC2 instance using AWS refer to [Getting Started with AWS](/learning-paths/servers-and-cloud-computing/csp/aws/). + +## AWS Console Steps + +Follow these steps to launch your EC2 instance using the AWS Management Console: + +### Step 1: Create an SSH Key Pair + +1. **Navigate to EC2 Console** + + - Go to the [AWS Management Console](https://console.aws.amazon.com) + + - Search for "EC2" and click on "EC2" service + +2. 
**Create Key Pair** + + - In the left navigation pane, click "Key Pairs" under "Network & Security" + + - Click "Create key pair" + + - Enter name: `arcee-graviton4-key` + + - Select "RSA" as the key pair type + + - Select ".pem" as the private key file format + + - Click "Create key pair" + + - The private key file will automatically download to your computer + +3. **Secure the Key File** + + - Move the downloaded `.pem` file to the SSH configuration directory + + ```bash + mkdir -p ~/.ssh + mv arcee-graviton4-key.pem ~/.ssh + ``` + + - Set proper permissions on macOS or Linux: + + ```bash + chmod 400 ~/.ssh/arcee-graviton4-key.pem + ``` + +### Step 2: Launch EC2 Instance + +1. **Start Instance Launch** + + - In the left navigation pane, click "Instances" under "Instances" + + - Click "Launch instances" button + +2. **Configure Instance Details** + + - **Name and tags**: Enter `Arcee-Graviton4-Instance` as the instance name + + - **Application and OS Images**: + - Click "Quick Start" tab + + - Select "Ubuntu" + + - Choose "Ubuntu Server 24.04 LTS (HVM), SSD Volume Type" + + - **Important**: Ensure the architecture shows "64-bit (ARM)" for Graviton compatibility + + - **Instance type**: + - Click on "Select instance type" + + - Select `c8g.4xlarge` or larger + +3. **Configure Key Pair** + + In "Key pair name", select the SSH key pair you created earlier (`arcee-graviton4-key`) + +4. **Configure Network Settings** + + - **Network**: Select a VPC with at least one public subnet. + + - **Subnet**: Select a public subnet in the VPC + + - **Auto-assign Public IP**: Enable + + - **Firewall (security groups)** + + - Click on "Create security group" + + - Click on "Allow SSH traffic from" + + - In the dropdown list, select "My IP". + + +{{% notice Notes %}} +You will only be able to connect to the instance from your current host, which is the safest setting. Selecting "Anywhere" allows anyone on the Internet to attempt to connect; use at your own risk. 
+ +Although this demonstration only requires SSH access, it is possible to use one of your existing security groups as long as it allows SSH traffic. +{{% /notice %}} + +5. **Configure Storage** + + - **Root volume**: + - Size: `128` GB + + - Volume type: `gp3` + +6. **Review and Launch** + + - Review all settings in the "Summary" section + + - Click "Launch instance" + +### Step 3: Monitor Instance Launch + +1. **View Launch Status** + + After a few seconds, you should see a message similar to this one: + + `Successfully initiated launch of instance (i-)` + + If instance launch fails, please review your settings and try again. + +2. **Get Connection Information** + + - Click on the instance id, or look for the instance in the Instances list in the EC2 console. + + - In the "Details" tab of the instance, note the "Public DNS" host name + + - This is the host name you'll use to connect via SSH, aka `PUBLIC_DNS_HOSTNAME` + +### Step 4: Connect to Your Instance + +1. **Open Terminal/Command Prompt** + +2. **Connect via SSH** + ```bash + ssh -i ~/.ssh/arcee-graviton4-key.pem ubuntu@PUBLIC_DNS_HOSTNAME + ``` + +3. **Accept Security Warning** + + - When prompted about authenticity of host, type `yes` + + - You should now be connected to your Ubuntu instance + +### Important Notes + +- **Region Selection**: Ensure you're in your preferred AWS region before launching + +- **AMI Selection**: The Ubuntu 24.04 LTS AMI must be ARM64 compatible for Graviton processors + +- **Security**: Think twice about allowing SSH from anywhere (0.0.0.0/0). It is strongly recommended to restrict access to your IP address. 
+ +- **Storage**: The 128GB EBS volume is sufficient for the Arcee model and dependencies + +- **Backup**: Consider creating AMIs or snapshots for backup purposes + + diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/02_setting_up_the_instance.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/02_setting_up_the_instance.md new file mode 100644 index 0000000000..c85c8f0bc4 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/02_setting_up_the_instance.md @@ -0,0 +1,51 @@ +--- +title: Setting up the instance +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In this step, you'll set up the Graviton4 instance with all the necessary tools and dependencies required to build and run the Arcee Foundation Model. This includes installing the build tools and Python environment. + +## Step 1: Update Package List + +```bash +sudo apt-get update +``` + +This command updates the local package index from the repositories: + +- Downloads the latest package lists from all configured APT repositories +- Ensures you have the most recent information about available packages and their versions +- This is a best practice before installing new packages to avoid potential conflicts +- The package index contains metadata about available packages, their dependencies, and version information + +## Step 2: Install System Dependencies + +```bash +sudo apt-get install cmake gcc g++ git python3 python3-pip python3-virtualenv libcurl4-openssl-dev unzip -y +``` + +This command installs all the essential development tools and dependencies: + +- **cmake**: Cross-platform build system generator used to compile Llama.cpp +- **gcc & g++**: GNU C and C++ compilers for building native code +- **git**: Version control system for cloning repositories +- **python3**: Python interpreter for running Python-based tools and scripts +- **python3-pip**: Python package 
installer for managing Python dependencies +- **python3-virtualenv**: Tool for creating isolated Python environments +- **libcurl4-openssl-dev**: client-side URL transfer library + +The `-y` flag automatically answers "yes" to prompts, making the installation non-interactive. + +## What's Ready Now? + +After completing these steps, your Graviton4 instance has: + +- A complete C/C++ development environment for building Llama.cpp +- Python 3 with pip for managing Python packages +- Git for cloning repositories +- All necessary build tools for compiling optimized ARM64 binaries + +The system is now prepared for the next steps: building Llama.cpp and downloading the Arcee Foundation Model. diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md new file mode 100644 index 0000000000..b4cdcf0f7a --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md @@ -0,0 +1,82 @@ +--- +title: Building Llama.cpp +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In this step, you'll build Llama.cpp from source. Llama.cpp is a high-performance C++ implementation of the LLaMA model that's optimized for inference on various hardware platforms, including Arm-based processors like Graviton4. + +Even though AFM-4.5B has a custom model architecture, we're able to use the vanilla version of Llama.cpp as the Arcee AI team has contributed the appropriate modeling code. + +## Step 1: Clone the Repository + +```bash +git clone https://github.com/ggerganov/llama.cpp +``` + +This command clones the Llama.cpp repository from GitHub to your local machine. The repository contains the source code, build scripts, and documentation needed to compile the inference engine. 
+ +## Step 2: Navigate to the Project Directory + +```bash +cd llama.cpp +``` + +Change into the llama.cpp directory to run the build process. This directory contains the `CMakeLists.txt` file and source code structure. + +## Step 3: Configure the Build with CMake + +```bash +cmake -B . +``` + +This command uses CMake to configure the build system: + +- `-B .` specifies that the build files should be generated in the current directory +- CMake will detect your system's compiler, libraries, and hardware capabilities +- It will generate the appropriate build files (Makefiles on Linux) based on your system configuration + + +The CMake output should include the information below, indicating that the build process will leverage the Neoverse V2 architecture's specialized instruction sets designed for AI/ML workloads. These optimizations are crucial for achieving optimal performance on Graviton4: + +```output +-- ARM feature DOTPROD enabled +-- ARM feature SVE enabled +-- ARM feature MATMUL_INT8 enabled +-- ARM feature FMA enabled +-- ARM feature FP16_VECTOR_ARITHMETIC enabled +-- Adding CPU backend variant ggml-cpu: -mcpu=neoverse-v2+crc+sve2-aes+sve2-sha3+dotprod+i8mm+sve +``` + +- **DOTPROD: Dot Product** - Hardware-accelerated dot product operations for neural network computations +- **SVE: Scalable Vector Extension** - Advanced vector processing capabilities that can handle variable-length vectors up to 2048 bits, providing significant performance improvements for matrix operations +- **MATMUL_INT8: Matrix multiplication units** - Dedicated hardware for efficient matrix operations common in transformer models, accelerating the core computations of large language models +- **FMA: Fused Multiply-Add** - Optimized floating-point operations that combine multiplication and addition in a single instruction +- **FP16 Vector Arithmetic** - Hardware support for 16-bit floating-point vector operations, reducing memory usage while maintaining good numerical precision + +## Step 4: 
Compile the Project + +```bash +cmake --build . --config Release -j16 +``` + +This command compiles the Llama.cpp project: +- `--build .` tells CMake to build the project using the files in the current directory +- `--config Release` specifies a Release build configuration, which enables optimizations and removes debug symbols +- `-j16` runs the build with 16 parallel jobs, which speeds up compilation on multi-core systems like Graviton4 + +The build process will compile the C++ source code into executable binaries optimized for your ARM64 architecture. This should only take a minute. + +## What is built? + +After successful compilation, you'll have several key command-line executables in the `bin` directory: +- `llama-cli` - The main inference executable for running LLaMA models +- `llama-server` - A web server for serving model inference over HTTP +- `llama-quantize` - a tool for model quantization to reduce memory usage +- Various utility programs for model conversion and optimization + +You can find more information in the llama.cpp [GitHub repository](https://github.com/ggml-org/llama.cpp/tree/master/tools). + +These binaries are specifically optimized for ARM64 architecture and will provide excellent performance on your Graviton4 instance. 
diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/04_install_python_dependencies_for_llama_cpp.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/04_install_python_dependencies_for_llama_cpp.md new file mode 100644 index 0000000000..f21d281408 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/04_install_python_dependencies_for_llama_cpp.md @@ -0,0 +1,66 @@ +--- +title: Installing Python dependencies for llama.cpp +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In this step, you'll set up a Python virtual environment and install the required dependencies for working with Llama.cpp. This ensures you have a clean, isolated Python environment with all the necessary packages for model optimization. + +## Step 1: Create a Python Virtual Environment + +```bash +virtualenv env-llama-cpp +``` + +This command creates a new Python virtual environment named `env-llama-cpp`: +- Virtual environments provide isolated Python environments that prevent conflicts between different projects +- The `env-llama-cpp` directory will contain its own Python interpreter and package installation space +- This isolation ensures that the Llama.cpp dependencies won't interfere with other Python projects on your system +- Virtual environments are essential for reproducible development environments + +## Step 2: Activate the Virtual Environment + +```bash +source env-llama-cpp/bin/activate +``` + +This command activates the virtual environment: +- The `source` command executes the activation script, which modifies your current shell environment +- Depending on you sheel, your command prompt may change to show `(env-llama-cpp)` at the beginning, indicating the active environment. This will be reflected in the following commands. 
+- All subsequent `pip` commands will install packages into this isolated environment +- The `PATH` environment variable is updated to prioritize the virtual environment's Python interpreter + +## Step 3: Upgrade pip to the Latest Version + +```bash +pip install --upgrade pip +``` + +This command ensures you have the latest version of pip: +- Upgrading pip helps avoid compatibility issues with newer packages +- The `--upgrade` flag tells pip to install the newest available version +- This is a best practice before installing project dependencies +- Newer pip versions often include security fixes and improved package resolution + +## Step 4: Install Project Dependencies + +```bash +pip install -r requirements.txt +``` + +This command installs all the Python packages specified in the requirements.txt file: +- The `-r` flag tells pip to read the package list from the specified file +- `requirements.txt` contains a list of Python packages and their version specifications +- This ensures everyone working on the project uses the same package versions +- The installation will include packages needed for model loading, inference, and any Python bindings for Llama.cpp + +## What is installed? + +After successful installation, your virtual environment will contain: +- **NumPy**: For numerical computations and array operations +- **Requests**: For HTTP operations and API calls +- **Other dependencies**: Specific packages needed for Llama.cpp Python integration + +The virtual environment is now ready for running Python scripts that interact with the compiled Llama.cpp binaries. Remember to always activate the virtual environment (`source env-llama-cpp/bin/activate`) before running any Python code related to this project. 
\ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/05_downloading_and_optimizing_afm45b.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/05_downloading_and_optimizing_afm45b.md new file mode 100644 index 0000000000..e293e74ff7 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/05_downloading_and_optimizing_afm45b.md @@ -0,0 +1,91 @@ +--- +title: Downloading and optimizing AFM-4.5B +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In this step, you'll download the AFM-4.5B model from Hugging Face, convert it to the GGUF format for use with Llama.cpp, and create quantized versions to optimize memory usage and inference speed. + +The first release of the [Arcee Foundation Model](https://www.arcee.ai/blog/announcing-the-arcee-foundation-model-family) family, [AFM-4.5B](https://www.arcee.ai/blog/deep-dive-afm-4-5b-the-first-arcee-foundational-model) is a 4.5-billion-parameter frontier model that delivers excellent accuracy, strict compliance, and very high cost-efficiency. It was trained on almost 7 trillion tokens of clean, rigorously filtered data, and has been tested across a wide range of languages, including Arabic, English, French, German, Hindi, Italian, Korean, Mandarin, Portuguese, Russian, and Spanish + +Here are the steps to download and optimize the model for AWS Graviton4. Make sure to run them in the virtual environment you created at the previous step. + +## Step 1: Install the Hugging Face libraries + +```bash +pip install huggingface_hub hf_xet +``` + +This command installs the Hugging Face Hub Python library, which provides tools for downloading models and datasets from the Hugging Face platform. The library includes the `huggingface-cli` command-line interface that you can use to download the AFM-4.5B model. 
+ +## Step 2: Download the AFM-4.5B Model + +```bash +huggingface-cli download arcee-ai/afm-4.5B --local-dir models/afm-4-5b +``` + +This command downloads the AFM-4.5B model from the Hugging Face Hub: +- `arcee-ai/afm-4.5B` is the model identifier on Hugging Face Hub +- `--local-dir model/afm-4-5b` specifies the local directory where the model files will be stored +- The download includes the model weights, configuration files, and tokenizer data +- This is a 4.5 billion parameter model, so the download may take several minutes depending on your internet connection + +## Step 3: Convert to GGUF Format + +```bash +python3 convert_hf_to_gguf.py models/afm-4-5b +deactivate +``` + +The first command converts the downloaded Hugging Face model to the GGUF (GGML Universal Format) format: +- `convert_hf_to_gguf.py` is a conversion script that comes with Llama.cpp +- `models/afm-4-5b` is the input directory containing the Hugging Face model files +- The script reads the model architecture, weights, and configuration from the Hugging Face format +- It outputs a single `afm-4-5B-F16.gguf` ~15GB file in the `models/afm-4-5b/` directory +- GGUF is the native format used by Llama.cpp and provides efficient loading and inference + +Next, deactivate the Python virtual environment as future commands won't require it. 
+ +## Step 4: Create Q4_0 Quantized Version + +```bash +bin/llama-quantize models/afm-4-5b/afm-4-5B-F16.gguf models/afm-4-5b/afm-4-5B-Q4_0.gguf Q4_0 +``` + +This command creates a 4-bit quantized version of the model: +- `llama-quantize` is the quantization tool from Llama.cpp +- `afm-4-5B-F16.gguf` is the input GGUF model file in 16-bit precision +- `Q4_0` specifies 4-bit quantization with zero-point quantization +- This reduces the model size by approximately 45% (from ~15GB to ~8GB) +- The quantized model will use less memory and run faster, though with a small reduction in accuracy +- The output file will be named `afm-4-5B-Q4_0.gguf` + +**ARM Optimization**: ARM has contributed highly optimized kernels for Q4_0 quantization that leverage the Neoverse v2 instruction sets. These low-level math routines accelerate typical deep learning operations, providing significant performance improvements on ARM-based processors like Graviton4. + +These instruction sets enable Llama.cpp to perform quantized operations much faster than generic implementations, making ARM processors highly competitive for inference workloads. + +## Step 5: Create Q8_0 Quantized Version + +```bash +bin/llama-quantize models/afm-4-5b/afm-4-5B-F16.gguf models/afm-4-5b/afm-4-5B-Q8_0.gguf Q8_0 +``` + +This command creates an 8-bit quantized version of the model: +- `Q8_0` specifies 8-bit quantization with zero-point quantization +- This reduces the model size by approximately 70% (from ~15GB to ~4.4GB) +- The 8-bit version provides a better balance between memory usage and accuracy compared to 4-bit +- The output file will be named `afm-4-5B-Q8_0.gguf` +- This version is often preferred for production use when memory constraints allow + +**ARM Optimization**: Similar to Q4_0, ARM has contributed optimized kernels for Q8_0 quantization that take advantage of Neoverse v2 instruction sets. 
These optimizations provide excellent performance for 8-bit operations while maintaining higher accuracy compared to 4-bit quantization. + +## What is available now? + +After completing these steps, you'll have three versions of the AFM-4.5B model: +- `afm-4-5B-F16.gguf` - The original full-precision model (~15GB) +- `afm-4-5B-Q4_0.gguf` - 4-bit quantized version (~8GB) for memory-constrained environments +- `afm-4-5B-Q8_0.gguf` - 8-bit quantized version (~4.4GB) for balanced performance and memory usage + +These models are now ready to be used with the Llama.cpp inference engine for text generation and other language model tasks. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/06_running_inference.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/06_running_inference.md new file mode 100644 index 0000000000..b1c9aeb471 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/06_running_inference.md @@ -0,0 +1,158 @@ +--- +title: Running inference with AFM-4.5B +weight: 7 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +Now that you have the AFM-4.5B models in GGUF format, you can run inference using various Llama.cpp tools. In this step, you'll explore different ways to interact with the model for text generation, benchmarking, and evaluation. + +## Using llama-cli for Interactive Text Generation + +The `llama-cli` tool provides an interactive command-line interface for text generation. This is perfect for testing the model's capabilities and having conversations with it. 
+ +### Basic Usage + +```bash +bin/llama-cli -m models/afm-4-5b/afm-4-5B-Q8_0.gguf -n 256 --color +``` + +This command starts an interactive session with the model: + +- `-m models/afm-4-5b/afm-4-5B-Q8_0.gguf` specifies the model file to load +- `-n 512` sets the maximum number of tokens to generate per response +- The tool will prompt you to enter text, and the model will generate a response + +In this example, `llama-cli` uses 16 vCPUs. You can try different values with `-t `. + +### Example Interactive Session + +Once you start the interactive session, you can have conversations like this: + +```console +> Give me a brief explanation of the attention mechnanism in transformer models. +In transformer models, the attention mechanism allows the model to focus on specific parts of the input sequence when computing the output. Here's a simplified explanation: + +1. **Key-Query-Value (K-Q-V) computation**: For each input element, the model computes three vectors: + - **Key (K)**: This represents the input element in a way that's useful for computing attention weights. + - **Query (Q)**: This represents the current input element being processed and is used to compute attention weights. + - **Value (V)**: This represents the input element in its original form, which is used to compute the output based on attention weights. + +2. **Attention scores computation**: The attention mechanism computes the similarity between the Query (Q) and each Key (K) element using dot product and softmax normalization. This produces a set of attention scores, which represent how relevant each Key (K) element is to the Query (Q). + +3. **Weighted sum**: The attention scores are used to compute a weighted sum of the Value (V) elements. The output is a weighted sum of the Values (V) based on the attention scores. + +4. **Output**: The final output is a vector that represents the context of the input sequence, taking into account the attention scores. 
This output is used in the decoder to generate the next word in the output sequence. + +The attention mechanism allows transformer models to selectively focus on specific parts of the input sequence, enabling them to better understand context and relationships between input elements. This is particularly useful for tasks like machine translation, where the model needs to capture long-range dependencies between input words. +``` + +To exit the interactive session, type `Ctrl+C` or `/bye`. + +This will display performance statistics: + +```bash +llama_perf_sampler_print: sampling time = 26.66 ms / 356 runs ( 0.07 ms per token, 13352.84 tokens per second) +llama_perf_context_print: load time = 782.72 ms +llama_perf_context_print: prompt eval time = 392.40 ms / 24 tokens ( 16.35 ms per token, 61.16 tokens per second) +llama_perf_context_print: eval time = 13173.66 ms / 331 runs ( 39.80 ms per token, 25.13 tokens per second) +llama_perf_context_print: total time = 129945.08 ms / 355 tokens +``` + +In this example, our 8-bit model running on 16 threads generated 355 tokens, at over 25 tokens per second (`eval time`). + +### Example Non-Interactive Session + +Now, try the 4-bit model in non-interactive mode: + +```bash +bin/llama-cli -m models/afm-4-5b/afm-4-5B-Q4_0.gguf -n 256 --color -no-cnv -p "Give me a brief explanation of the attention mechnanism in transformer models." +``` +This command starts an non-interactive session with the model: +- `-m models/afm-4-5b/afm-4-5B-Q4_0.gguf` specifies the model file to load +- `-no-cnv` disable the conversation mode +- `-p` sets the prompt sent to the model +- The tool will prompt you to enter text, and the model will generate a response + +Here, you should see the model generating at about 40 tokens per second. This shows how a more aggressive quantization recipe helps deliver faster performance. 
+ +## Using llama-server for API Access + +The `llama-server` tool runs the model as a web server, allowing you to make HTTP requests for text generation. This is useful for integrating the model into applications or for batch processing. + +### Starting the Server + +```bash +bin/llama-server -m models/afm-4-5b/afm-4-5B-Q4_0.gguf \ + --host 0.0.0.0 \ + --port 8080 \ + --ctx-size 4096 +``` + +This starts a server that: +- Loads the specified model +- Listens on all network interfaces (`0.0.0.0`) +- Accepts connections on port 8080 +- Uses a 4096-token context window + +### Making API Requests + +Once the server is running, you can make requests using curl or any HTTP client. As `llama-server` is compatible with the popular OpenAI API, we'll use in the following examples. + +Open a new terminal on the AWS instance and run: + +```bash +curl -X POST http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "afm-4-5b", + "messages": [ + { + "role": "user", + "content": "Explain quantum computing in less than 100 words." + } + ], + "max_tokens": 256, + "temperature": 0.9 + }' +``` + +You get an answer similar to this one: + +```json +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "role": "assistant", + "content": "Quantum computing uses quantum-mechanical phenomena, such as superposition and entanglement, to perform calculations. It allows for multiple possibilities to exist simultaneously, which can speed up certain processes. Unlike classical computers, quantum computers can solve complex problems and simulate systems more efficiently. Quantum bits (qubits) store information, and quantum gates perform operations. Quantum computing has potential applications in fields like cryptography, optimization, and materials science. Its development is an active area of research, with companies like IBM, Google, and Microsoft investing in quantum computing technology." 
+ } + } + ], + "created": 1750929895, + "model": "afm-4-5b", + "system_fingerprint": "b5757-716301d1", + "object": "chat.completion", + "usage": { + "completion_tokens": 111, + "prompt_tokens": 20, + "total_tokens": 131 + }, + "id": "chatcmpl-tb93ww9iYCErwLJmsV0YLrIadVvpBk4m", + "timings": { + "prompt_n": 11, + "prompt_ms": 105.651, + "prompt_per_token_ms": 9.604636363636363, + "prompt_per_second": 104.11638318615064, + "predicted_n": 111, + "predicted_ms": 2725.982, + "predicted_per_token_ms": 24.558396396396397, + "predicted_per_second": 40.719271073690145 + } +} +``` + +You can also interact with the server using Python with the [OpenAI client library](https://github.com/openai/openai-python), enabling streaming responses, and other features. diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/07_evaluating_the_quantized_models.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/07_evaluating_the_quantized_models.md new file mode 100644 index 0000000000..bf390d985e --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/07_evaluating_the_quantized_models.md @@ -0,0 +1,119 @@ +--- +title: Evaluating the quantized models +weight: 8 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Using llama-bench for Performance Benchmarking + +The [`llama-bench`](https://github.com/ggml-org/llama.cpp/tree/master/tools/llama-bench) tool allows you to measure the performance characteristics of your model, including inference speed and memory usage. 
+ +### Basic Benchmarking + +You can benchmark multiple model versions to compare their performance: + +```bash +# Benchmark the full precision model +bin/llama-bench -m models/afm-4-5b/afm-4-5B-F16.gguf + +# Benchmark the 8-bit quantized model +bin/llama-bench -m models/afm-4-5b/afm-4-5B-Q8_0.gguf + +# Benchmark the 4-bit quantized model +bin/llama-bench -m models/afm-4-5b/afm-4-5B-Q4_0.gguf +``` + +Running each model on 16 vCPUs, you should see results like: +- **F16 model**: ~15-16 tokens/second, ~15GB memory usage +- **Q8_0 model**: ~25 tokens/second, ~8GB memory usage +- **Q4_0 model**: ~40 tokens/second, ~4.4GB memory usage + +The exact performance will depend on your specific instance configuration and load. + +### Advanced Benchmarking + +```bash +bin/llama-bench -m models/afm-4-5b/afm-4-5B-Q4_0.gguf \ + -p 128,256,512 \ + -n 128 \ + -t 8,16,24 +``` + +This command: +- Loads the model and runs inference benchmarks +- `-p`: Evaluates a random prompt of 128, and 512 tokens +- `-n`: Generates 128 tokens +- `-t`: Run the model on 4, 8, and 16 threads + +The results should look like this: + +| model | size | params | backend | threads | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | ------: | --------------: | -------------------: | +| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 4 | pp128 | 62.90 ± 0.08 | +| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 4 | pp512 | 57.63 ± 0.06 | +| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 4 | tg128 | 15.18 ± 0.02 | +| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 8 | pp128 | 116.23 ± 0.04 | +| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 8 | pp512 | 106.39 ± 0.03 | +| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 8 | tg128 | 25.29 ± 0.05 | +| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 16 | pp128 | 206.67 ± 0.10 | +| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 16 | pp512 | 190.18 ± 0.03 | +| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 16 | tg128 | 40.99 ± 0.36 | + +It's pretty amazing to see that with only 4 
threads, the 4-bit model can still generate at the very comfortable speed of 15 tokens per second. We could definitely run several copies of the model on the same instance to serve concurrent users or applications. + +You can also try [`llama-batched-bench`](https://github.com/ggml-org/llama.cpp/tree/master/tools/batched-bench) to benchmark performance on batch sizes larger than 1. + + +## Using llama-perplexity for Model Evaluation + +Perplexity is a measure of how well a language model predicts text. It represents the average number of possible next tokens the model considers when predicting each word. A lower perplexity score indicates the model is more confident in its predictions and generally performs better on the given text. For example, a perplexity of 2.0 means the model typically considers 2 possible tokens when making each prediction, while a perplexity of 10.0 means it considers 10 possible tokens on average. + +The `llama-perplexity` tool evaluates the model's quality on text datasets by calculating perplexity scores. Lower perplexity indicates better quality. + +### Downloading a Test Dataset + +First, download the Wikitest-2 test dataset. + +```bash +sh scripts/get-wikitext-2.sh +``` + +### Running Perplexity Evaluation + +Next, measure perplexity on the test dataset. + +```bash +bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-F16.gguf -f wikitext-2-raw/wiki.test.raw +bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-Q8_0.gguf -f wikitext-2-raw/wiki.test.raw +bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-Q4_0.gguf -f wikitext-2-raw/wiki.test.raw +``` + +If you want to speed things up, you can add the `--chunks` option to use a fraction of 564 chunks contained in the test dataset. + +On the full dataset, these three commands will take about 5 hours. You should run them in a shell script to avoid SSH timeouts. 
+ +For example: +```bash +#!/bin/bash +# ppl.sh +bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-F16.gguf -f wikitext-2-raw/wiki.test.raw +bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-Q8_0.gguf -f wikitext-2-raw/wiki.test.raw +bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-Q4_0.gguf -f wikitext-2-raw/wiki.test.raw +``` +```bash + nohup sh ppl.sh >& ppl.sh.log & + tail -f ppl.sh.log + ``` + +Here are the full results. + +| Model | Generation Speed (tokens/s, 16 vCPUs) | Memory Usage | Perplexity (Wikitext-2) | +|:-------:|:----------------------:|:------------:|:----------:| +| F16 | ~15–16 | ~15 GB | TODO | +| Q8_0 | ~25 | ~8 GB | TODO | +| Q4_0 | ~40 | ~4.4 GB | TODO | + +When you have finished your benchmarking and evaluation, make sure to terminate your AWS EC2 instance in the AWS Management Console to avoid incurring unnecessary charges for unused compute resources. + diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/08_conclusion.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/08_conclusion.md new file mode 100644 index 0000000000..a7effd0311 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/08_conclusion.md @@ -0,0 +1,66 @@ +--- +title: Conclusion +weight: 9 + + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Conclusion + +Congratulations! You have successfully completed the journey of deploying the Arcee AFM-4.5B foundation model on AWS Graviton4. + +Here is a summary of what you learned. + +### What you built + +Using this Learning Path, you have: + +1. **Launched a Graviton4-powered EC2 instance** - Set up a c8g.4xlarge instance running Ubuntu 24.04 LTS, leveraging AWS's latest Arm-based processors for optimal performance and cost efficiency. + +2. 
**Configured the development environment** - Installed essential tools and dependencies, including Git, build tools, and Python packages needed for machine learning workloads. + +3. **Built Llama.cpp from source** - Compiled the optimized inference engine specifically for Arm64 architecture, ensuring maximum performance on Graviton4 processors. + +4. **Downloaded and optimized AFM-4.5B** - Retrieved the 4.5-billion parameter Arcee Foundation Model and converted it to the efficient GGUF format, then created quantized versions (8-bit and 4-bit) to balance performance and memory usage. + +5. **Ran inference and evaluation** - Tested the model's capabilities through interactive conversations, API endpoints, and comprehensive benchmarking to measure speed, memory usage, and model quality. + +### Key Performance Insights + +The benchmarking results demonstrate the power of quantization and Arm-based computing: + +- **Memory efficiency**: The 4-bit quantized model uses only ~4.4GB of RAM compared to ~15GB for the full precision model +- **Speed improvements**: Quantization delivers 2-3x faster inference speeds (40+ tokens/second vs 15-16 tokens/second) +- **Cost optimization**: Lower memory requirements enable running on smaller, more cost-effective instances +- **Quality preservation**: The quantized models maintain excellent perplexity scores, showing minimal quality degradation + +### The Graviton4 Advantage + +AWS Graviton4 processors, built on Arm Neoverse-V2 architecture, provide: +- Superior performance per watt compared to x86 alternatives +- Cost savings of 20-40% for compute-intensive workloads +- Optimized memory bandwidth and cache hierarchy for AI/ML workloads +- Native Arm64 support for modern machine learning frameworks + +### Next Steps and Call to Action + +Now that you have a fully functional AFM-4.5B deployment, here are some exciting ways to extend your learning: + +**Production Deployment** +- Set up auto-scaling groups for high availability +- 
Implement load balancing for multiple model instances +- Add monitoring and logging with CloudWatch +- Secure your API endpoints with proper authentication + +**Application Development** +- Build a web application using the llama-server API +- Create a chatbot or virtual assistant +- Develop content generation tools +- Integrate with existing applications via REST APIs + +The combination of Arcee AI's efficient foundation models, Llama.cpp's optimized inference engine, and AWS Graviton4's powerful Arm processors creates a compelling platform for deploying production-ready AI applications. Whether you're building chatbots, content generators, or research tools, this stack provides the performance, cost efficiency, and flexibility needed for modern AI workloads. + +For more information on Arcee AI and how we can help you build high-quality, secure, and cost-efficient AI, solution, please visit [www.arcee.ai](https://www.arcee.ai). + diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md new file mode 100644 index 0000000000..4623d917d2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md @@ -0,0 +1,68 @@ +--- +title: Deploy Arcee AFM-4.5B on AWS Graviton4 + +draft: true +cascade: + draft: true + +minutes_to_complete: 30 + +who_is_this_for: This is an introductory topic for developers and engineers who want to deploy the Arcee AFM-4.5B small language model on an AWS Arm-based instance. AFM-4.5B is a 4.5-billion-parameter frontier model that delivers excellent accuracy, strict compliance, and very high cost-efficiency. It was trained on almost 7 trillion tokens of clean, rigorously filtered data, and has been tested across a wide range of languages, including Arabic, English, French, German, Hindi, Italian, Korean, Mandarin, Portuguese, Russian, and Spanish. 
+ +learning_objectives: + - Launch and set up an Arm-based Graviton4 virtual machine on Amazon Web Services. + - Build Llama.cpp from source. + - Download AFM-4.5B from Hugging Face. + - Quantize AFM-4.5B with Llama.cpp. + - Deploy the model and run inference with Llama.cpp. + - Evaluate the quality of quantized models by measuring perplexity. + +prerequisites: + - An [AWS account](https://aws.amazon.com/) with permission to launch c8g (Graviton4) instances. + - Basic familiarity with SSH. + +author: Julien Simon + +### Tags +# Tagging metadata, see the Learning Path guide for the allowed values +skilllevels: Introductory +subjects: ML +arm_ips: + - Neoverse +tools_software_languages: + - Amazon Web Services + - Linux + - Python + - Llama.cpp +operatingsystems: + - Linux + + +further_reading: + - resource: + title: Arcee AI + link: https://www.arcee.ai + type: Website + - resource: + title: Announcing Arcee Foundation Models + link: https://www.arcee.ai/blog/announcing-the-arcee-foundation-model-family + type: Blog + - resource: + title: AFM-4.5B, the First Arcee Foundation Model + link: https://www.arcee.ai/blog/deep-dive-afm-4-5b-the-first-arcee-foundational-model + type: Blog + - resource: + title: Amazon EC2 Graviton Instances + link: https://aws.amazon.com/ec2/graviton/ + type: Documentation + - resource: + title: Amazon EC2 Documentation + link: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ + type: Documentation + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. 
+--- diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/_index.md b/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/_index.md index b64edf6edb..0474eaaf90 100644 --- a/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/_index.md @@ -1,18 +1,15 @@ --- -title: Understanding Libamath's vector accuracy modes - -draft: true -cascade: - draft: true +title: Select accuracy modes in Libamath (Arm Performance Libraries) minutes_to_complete: 20 author: Joana Cruz -who_is_this_for: This is an introductory topic for software developers who want to learn how to use the different accuracy modes present in Libamath, a component of Arm Performance Libraries. +who_is_this_for: This is an introductory topic for developers who want to use the different accuracy modes for vectorized math functions in Libamath, a component of Arm Performance Libraries. learning_objectives: - - Understand how accuracy is defined in Libamath. 
- - Pick an appropriate accuracy mode for your application. + - Understand how accuracy is defined in Libamath + - Select an appropriate accuracy mode for your application + - Use Libamath with different vector accuracy modes in practice prerequisites: - An Arm computer running Linux with [Arm Performance Libraries](https://learn.arm.com/install-guides/armpl/) version 25.04 or newer installed. @@ -25,7 +22,7 @@ armips: tools_software_languages: - Arm Performance Libraries - GCC -- Libmath +- Libamath operatingsystems: - Linux @@ -34,10 +31,6 @@ further_reading: title: ArmPL Libamath Documentation link: https://developer.arm.com/documentation/101004/2410/General-information/Arm-Performance-Libraries-math-functions type: documentation -# - resource: -# title: PLACEHOLDER BLOG -# link: PLACEHOLDER BLOG LINK -# type: blog - resource: title: ArmPL Installation Guide link: https://learn.arm.com/install-guides/armpl/ diff --git a/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/examples.md b/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/examples.md index b622d0edae..38e82af0da 100644 --- a/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/examples.md +++ b/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/examples.md @@ -12,7 +12,7 @@ Here is an example invoking all accuracy modes of the Neon single precision exp Make sure you have [Arm Performance Libraries](https://learn.arm.com/install-guides/armpl/) installed. -Use a text editor save the code below in a file named `example.c`. +Use a text editor to save the code below in a file named `example.c`. 
```C { line_numbers = "true" } #include @@ -40,7 +40,7 @@ int main(void) { printf("Libamath example:\n"); printf("-----------------------------------------------\n"); printf(" // Display worst-case ULP error in expf for each\n"); - printf(" // accuracy mode, along with approximate (`got`) and exact results (`want`)\n\n"); + printf(" // accuracy mode, along with approximate (\\\"got\\\") and exact results (\\\"want\\\")\n\n"); check_accuracy (armpl_vexpq_f32_u10, 0x1.ab312p+4, "armpl_vexpq_f32_u10(%a) delivers error under 1.0 ULP"); check_accuracy (armpl_vexpq_f32, 0x1.8163ccp+5, "armpl_vexpq_f32(%a) delivers error under 3.5 ULP"); @@ -89,5 +89,5 @@ armpl_vexpq_f32_umax(-0x1.5b7322p+6) delivers result with half correct bits ULP error = 1745.2120 ``` -The inputs used for each variant correspond to the worst case scenario known to date (ULP Error argmax). +The inputs used for each variant correspond to the current worst-case scenario known to date (ULP Error argmax). This means that the ULP error should not be higher than the one demonstrated here, ensuring the results remain below the defined thresholds for each accuracy. 
\ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/floating-point-rep.md b/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/floating-point-rep.md index 1dff7d8364..610aa1be31 100644 --- a/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/floating-point-rep.md +++ b/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/floating-point-rep.md @@ -1,46 +1,66 @@ --- -title: Floating Point Representation +title: Floating-point representation weight: 2 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Floating-Point Representation Basics +## Understanding the floating-point number system and IEEE-754 format -Floating Point numbers are a finite and discrete approximation of the real numbers, allowing us to implement and compute functions in the continuous domain with an adequate (but limited) resolution. +Floating-point numbers are essential for representing real numbers in computing, but they come with limits on precision and range. -A Floating Point number is typically expressed as: +This Learning Path covers the following: + +* How floating-point values are structured +* How bitwise representation works +* The IEEE-754 standard definition, including special values such as NaN and subnormals + +## What is a floating-point number? + +Floating-point numbers are a finite, discrete approximation of real numbers. They allow functions in the continuous domain to be computed with adequate, but limited, resolution. + +A floating-point number is typically expressed as: ```output -+/-d.dddd...d x B^e +± d.dddd...d × B^e ``` where: -* B is the base; -* e is the exponent; -* d.dddd...d is the mantissa (or significand). It is p-bit word, where p represents the precision; -* +/- sign which is usually stored separately. 
+* B is the base +* e is the exponent +* d.dddd...d is the mantissa (or significand) +* *p* is the number of bits used for precision +* the +/- sign is stored separately -If the leading digit is non-zero then it is a normalized representation/normal number. +The precision of a floating-point format refers to the number of binary digits used to represent the mantissa. This is denoted by *p*, and a system with *p* bits of precision can distinguish between \( 2^p \) different fractional values. -{{% notice Example 1 %}} -Fixing `B=2, p=24` +If the leading digit is non-zero, the number is said to be normalized (also called a *normal number*). + +{{% notice Example 1%}} +Fixing `B = 2, p = 24` `0.1 = 1.10011001100110011001101 × 2^4` is a normalized representation of 0.1 -`0.1 = 0.000110011001100110011001 × 2^0` is a non normalized representation of 0.1 +`0.1 = 0.000110011001100110011001 × 2^0` is a non-normalized representation of 0.1 {{% /notice %}} -Usually a Floating Point number has multiple non-normalized representations, but only 1 normalized representation (assuming leading digit is strictly smaller than base), when fixing a base and a precision. +A floating-point number can have multiple non-normalized forms, but only one normalized representation for a given value - assuming a fixed base and precision, and that the leading digit is strictly less than the base. + +## How precision and exponents define floating-point values + +Given: -### Building a Floating-Point Ruler +* a base `B` +* a precision `p` +* a maximum exponent `emax` +* a minimum exponent `emin` -Given a base `B`, a precision `p`, a maximum exponent `emax` and a minimum exponent `emin`, we can create the set of all the normalized values in this system. +You can create the full set of representable normalized values. 
{{% notice Example 2 %}} -`B=2, p=3, emax=2, emin=-1` +`B = 2, p = 3, emax = 2, emin = -1` | Significand | × 2⁻¹ | × 2⁰ | × 2¹ | × 2² | |-------------|-------|------|------|------| @@ -52,15 +72,15 @@ Given a base `B`, a precision `p`, a maximum exponent `emax` and a minimum expon {{% /notice %}} -Note that, for any given integer n, numbers are evenly spaced between 2ⁿ and 2ⁿ⁺¹. But the gap between them (also called [ULP](/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/ulp/), which is explained in the more detail in the next section) grows as the exponent increases. So the spacing between floating point numbers gets larger as numbers get bigger. +For any exponent, *n*, numbers are evenly spaced between 2ⁿ and 2ⁿ⁺¹. However, the gap between them (also called a [ULP](/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/ulp/), which is explained in more detail in the next section) increases with the magnitude of the exponent. -### The Floating-Point bitwise representation +## Bitwise representation of floating-point numbers -Since there are `B^p` possible mantissas, and `emax-emin+1` possible exponents, then `log2(B^p) + log2(emax-emin+1) + 1` (sign) bits are needed to represent a given Floating Point number in a system. +Since there are \( B^p \) possible mantissas and `emax-emin+1` possible exponents, then `log2(B^p) + log2(emax-emin+1) + 1` (sign) bits are needed to represent a given floating-point number in a system. In Example 2, 3+2+1=6 bits are needed. -Based on this, the floating point's bitwise representation is defined to be: +Based on this, the floating-point's bitwise representation is defined as: ``` b0 b1 b2 b3 b4 b5 @@ -77,53 +97,64 @@ b3, b4, b5 -> mantissa (M) However, this is not enough. In this bitwise definition, the possible values of E are 0, 1, 2, 3. But in the system being defined, only the integer values in the range [-1, 2] are of interest. 
-For this reason, E is called the biased exponent, and in order to retrieve the value it is trying to represent (i.e. the unbiased exponent) an offset must be added or subtracted (in this case, subtract 1): + E is stored as a biased exponent to allow representation of both positive and negative powers of two using only unsigned integers. In this example, a bias of 1 shifts the exponent range from [0, 3] to [−1, 2]: ```output -x = (-1)^S x M x 2^(E-1) +x = (-1)^S × M × 2^(E-1) ``` -## IEEE-754 Single Precision +## IEEE-754 single precision format -Single precision (also called float) is a 32-bit format defined by the [IEEE-754 Floating Point Standard](https://ieeexplore.ieee.org/document/8766229) +Single precision (also called float) is a 32-bit format defined by the [IEEE-754 Floating-Point Standard](https://ieeexplore.ieee.org/document/8766229). -In this standard the sign is represented using 1 bit, the exponent uses 8 bits and the mantissa uses 23 bits. +In this format: -The value of a (normalized) Floating Point in IEEE-754 can be represented as: +* The sign is represented using 1 bit +* The exponent uses 8 bits +* The mantissa uses 23 bits + +The value of a normalized floating-point number in IEEE-754 can be represented as: ```output -x=(−1)^S x 1.M x 2^E−127 +x = (−1)^S × (1.M) × 2^(E−127) ``` -The exponent bias of 127 allows storage of exponents from -126 to +127. The leading digit is implicit - that is we have 24 bits of precision. In normalized numbers the leading digit is implicitly 1. +The exponent bias of 127 allows storage of exponents from -126 to +127. The leading digit is implicit in normalized numbers, giving a total of 24 bits of precision. -{{% notice Special Cases in IEEE-754 Single Precision %}} -Since we have 8 bits of storage, meaning E ranges between 0 and 2^8-1=255. However not all these 256 values are going to be used for normal numbers. 
+{{% notice Special cases in IEEE-754 single precision %}} +Since the exponent field uses 8 bits, E ranges between 0 and 2^8-1=255. However not all these 256 values are used for normal numbers. If the exponent E is: * 0, then we are either in the presence of a denormalized number or a 0 (if M is 0 as well); -* 1 to 254 then we are in the normalized range; -* 255 then we are in the presence of Inf (if M==0), or Nan (if M!=0). +* 1 to 254 then this is in the normalized range; +* 255: infinity (if M==0), or NaN (if M!=0). -Subnormal numbers (also called denormal numbers) are special floating-point values defined by the IEEE-754 standard. +##### Subnormal numbers + +Subnormal numbers (also called denormal numbers) allow representation of values closer to zero than is possible with normalized exponents. They are special floating-point values defined by the IEEE-754 standard. They allow the representation of numbers very close to zero, smaller than what is normally possible with the standard exponent range. -Subnormal numbers do not have the a leading 1 in their representation. They also assume exponent is 0. +Subnormal numbers do not have a leading 1 in their representation. They also assume an exponent of –126. -The interpretation of denormal Floating Point in IEEE-754 can be represented as: +The interpretation of subnormal floating-point in IEEE-754 can be represented as: ``` -x=(−1)^S x 0.M x 2^−126 +x = (−1)^S × 0.M × 2^(−126) ``` @@ -135,5 +166,6 @@ x=(−1)^s x 0.M x 2^−126 | 11 (1.75) | 0.375 | 0.875 | 1.75 | 3.5 | 7.0 | --> {{% /notice %}} -If you're interested in diving deeper in this subject, [What Every Computer Scientist Should Know About Floating-Point Arithmetic](https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html) by David Goldberg is a good place to start. 
+## Further information +If you're interested in diving deeper into this subject, [What Every Computer Scientist Should Know About Floating-Point Arithmetic](https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html) by David Goldberg is a great place to start. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/multi-accuracy.md b/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/multi-accuracy.md index 807b338b49..33432d07df 100644 --- a/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/multi-accuracy.md +++ b/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/multi-accuracy.md @@ -7,51 +7,51 @@ layout: learningpathall --- -## The 3 accuracy modes of Libamath +## Accuracy modes -Libamath vector functions can come in various accuracy modes for the same mathematical function. -This means, some of our functions allow users and compilers to choose between: +Libamath provides multiple accuracy modes for the same vectorized mathematical function, allowing developers to choose between speed and precision depending on workload requirements. + +Some functions offer selectable modes with tradeoffs between: - **High accuracy** (≤ 1 ULP) - **Default accuracy** (≤ 3.5 ULP) - **Low accuracy / max performance** (approx. ≤ 4096 ULP) -## How accuracy modes are encoded in Libamath +### How accuracy modes are encoded -You can recognize the accuracy mode of a function by inspecting the **suffix** in its symbol: +You can recognize the accuracy mode of a function by the **suffix** in the function symbol: - **`_u10`** → High accuracy - For instance, `armpl_vcosq_f32_u10` - Ensures results stay within **1 Unit in the Last Place (ULP)**. + Example: `armpl_vcosq_f32_u10` + Ensures results within **1 Unit in the Last Place (ULP)**. 
- *(no suffix)* → Default accuracy - For instance, `armpl_vcosq_f32` - Keeps errors within **3.5 ULP** — a sweet spot for many workloads. + Example: `armpl_vcosq_f32` + Keeps errors within **3.5 ULP** - balancing precision and performance. -- **`_umax`** → Low accuracy - For instance, `armpl_vcosq_f32_umax` +- **`_umax`** → Low accuracy/max performance + Example: `armpl_vcosq_f32_umax` Prioritizes speed, tolerating errors up to **4096 ULP**, or roughly **11 correct bits** in single-precision. -## Applications +## When to use each mode Selecting an appropriate accuracy level helps avoid unnecessary compute cost while preserving output quality where it matters. -### High Accuracy (≤ 1 ULP) +### High accuracy (≤ 1 ULP) -Use when **numerical (almost) correctness** is a priority. These routines involve precise algorithms (such as high-degree polynomials, careful range reduction, or FMA usage) and are ideal for: +Use when bit-level correctness matters. These routines employ advanced algorithms (such as high-degree polynomials, tight range reduction, or FMA usage) and are ideal for: - **Scientific computing** such as simulations or finite element analysis - **Signal processing pipelines** [1,2] particularly recursive filters or transform -- **Validation & reference implementations** - -While slower, these functions provide **near-bitwise reproducibility** — critical in sensitive domains. +- **Validation and reference implementations** +While slower, these functions provide **near-bitwise reproducibility** — critical for validation and scientific fidelity. -### Default Accuracy (≤ 3.5 ULP) +### Default accuracy (≤ 3.5 ULP) The default mode strikes a **practical balance** between performance and numerical fidelity. 
It’s optimized for: @@ -59,12 +59,12 @@ The default mode strikes a **practical balance** between performance and numeric - **Analytics workloads** [3] such as log or sqrt during feature extraction - **Inference pipelines** [4] - especially on edge devices where latency matters + especially on edge devices where latency is critical Also suitable for many **scientific workloads** that can tolerate modest error in exchange for **faster throughput**. -### Low Accuracy / Max Performance (≤ 4096 ULP) +### Low accuracy / max performance (≤ 4096 ULP) This mode trades precision for speed — aggressively. It's designed for: @@ -74,7 +74,7 @@ This mode trades precision for speed — aggressively. It's designed for: where statistical convergence outweighs per-sample accuracy [6] - **Genetic algorithms, audio processing, and embedded DSP** -Avoid in control-flow-critical code or where **errors amplify**. +Avoid in control-flow-critical code or where errors might compound or affect control flow. ## Summary @@ -88,25 +88,25 @@ Avoid in control-flow-critical code or where **errors amplify**. {{% notice Tip %}} -If your workload has mixed precision needs, you can *selectively call different accuracy modes* for different parts of your pipeline. Libamath lets you tailor precision where it matters — and boost performance where it doesn’t. +If your workload has mixed precision needs, you can *selectively call different accuracy modes* for different parts of your pipeline. Choose conservatively where correctness matters, and push for speed elsewhere. {{% /notice %}} -#### References -1. Higham, N. J. (2002). *Accuracy and Stability of Numerical Algorithms* (2nd ed.). SIAM. +## References +1. Higham, N. J. (2002). *Accuracy and Stability of Numerical Algorithms* (2nd ed.), SIAM. -2. Texas Instruments. Overflow Avoidance Techniques in Cascaded IIR Filter Implementations on the TMS320 DSPs. Application Report SPRA509, 1999. +2. Texas Instruments. 
*Overflow Avoidance Techniques in Cascaded IIR Filter Implementations on the TMS320 DSPs*. Application Report SPRA509, 1999. https://www.ti.com/lit/pdf/spra509 -3. Ma, S., & Huai, J. (2019). Approximate Computation for Big Data Analytics. arXiv:1901.00232. +3. Ma, S., & Huai, J. (2019). *Approximate Computation for Big Data Analytics*. arXiv:1901.00232. https://arxiv.org/pdf/1901.00232 -4. Gupta, S., Agrawal, A., Gopalakrishnan, K., & Narayanan, P. (2015). Deep Learning with Limited Numerical Precision. In Proceedings of the 32nd International Conference on Machine Learning (ICML), PMLR 37. +4. Gupta, S., Agrawal, A., Gopalakrishnan, K., & Narayanan, P. (2015). *Deep Learning with Limited Numerical Precision*. In Proceedings of the 32nd International Conference on Machine Learning (ICML), PMLR 37. https://proceedings.mlr.press/v37/gupta15.html 5. Unity Technologies. *Precision Modes*. Unity Shader Graph Documentation. [https://docs.unity3d.com/Packages/com.unity.shadergraph@17.1/manual/Precision-Modes.html](https://docs.unity3d.com/Packages/com.unity.shadergraph@17.1/manual/Precision-Modes.html) -6. Croci, M., Gorman, G. J., & Giles, M. B. (2021). Rounding Error using Low Precision Approximate Random Variables. arXiv:2012.09739. +6. Croci, M., Gorman, G. J., & Giles, M. B. (2021). *Rounding Error using Low Precision Approximate Random Variables*. arXiv:2012.09739. 
https://arxiv.org/abs/2012.09739 diff --git a/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/ulp-error.md b/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/ulp-error.md index 8f253905f9..2230e99273 100644 --- a/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/ulp-error.md +++ b/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/ulp-error.md @@ -1,28 +1,27 @@ --- -title: ULP Error and Accuracy +title: ULP error and accuracy weight: 4 ### FIXED, DO NOT MODIFY layout: learningpathall --- -# ULP Error and Accuracy +## Overview -In the development of Libamath, a metric called ULP error is used to assess the accuracy of functions. -This metric measures the distance between two numbers, a reference (`want`) and an approximation (`got`), relative to how many floating-point “steps” (ULPs) these two numbers are apart. +In the development of Libamath, a metric called ULP error is used to assess the accuracy of floating-point functions. ULP (Unit in the Last Place) measures the distance between two numbers, a reference (`want`) and an approximation (`got`), relative to how many floating-point steps (ULPs) separate them. -It can be calculated by: +The formula is: ``` ulp_err = | want - got | / ULP(want) ``` -Because this is a relative measure in terms of floating-point spacing (ULPs)—that is, this metric is scale-aware—it is ideal for comparing accuracy across magnitudes. Otherwise, error measures would be very biased by the uneven distribution of the floats. +Because ULP error is defined in terms of floating-point spacing, it is inherently scale-aware. In contrast to absolute error, ULP error avoids bias due to the uneven distribution of floating-point numbers across different magnitudes. 
-# ULP Error Implementation +## ULP error implementation -In practice, however, the above expression may take different forms to account for sources of error that may occur during the computation of the error itself. +In practice, the basic expression is modified to account for additional sources of error introduced during the computation itself. In the implementation used here, this quantity is held by a term called `tail`: @@ -30,15 +29,17 @@ In the implementation used here, this quantity is held by a term called `tail`: ulp_err = | (got - want) / ULP(want) - tail | ``` -This term takes into account the error introduced by casting `want` from a higher precision to working precision. This contribution is given in terms of ULP distance: +This term compensates for the rounding error that occurs when the high-precision reference (`want_l`, a `double`) is cast down to a `float`. This contribution is given in terms of ULP distance: ``` tail = | (want_l - want) / ULP(want) | ``` -Here is a simplified version of the ULP Error. Use the same `ulp.h` from the previous section. +## A simplified version -Use a text editor to opy the code below into a new file `ulp_error.h`. + Below is a practical implementation of the ULP error calculation based on the model above. Use the same `ulp.h` header from the previous section. 
+ +Use a text editor to copy the code below into a new file called `ulp_error.h`: ```C // Defines ulpscale(x) @@ -52,11 +53,11 @@ double ulp_error(float got, double want_l) { float want = (float) want_l; // Early exit for exact match - if (want_l == (double)want && got == want) { + if ((want_l == (double) want && got == want)) { return 0.0; } - int ulp_exp = ulpscale(want); + int ulp_exp = ulpscale(want); // Base-2 exponent for scaling ULP(want) // Fractional tail from float rounding double tail = scalbn(want_l - (double)want, -ulp_exp); @@ -68,7 +69,9 @@ double ulp_error(float got, double want_l) { return fabs(scalbn(diff, -ulp_exp) - tail); } ``` -Note that the final scaling is done with respect to the rounded reference. +{{% notice Note %}} +The final scaling is done with respect to the rounded reference. +{{% /notice %}} In this implementation, it is possible to get exactly 0.0 ULP error if and only if: @@ -110,4 +113,4 @@ The output should be: ULP error: 1.0 ``` -If you are interested in diving into the full implementation of the ulp error, you can consult the [tester](https://github.com/ARM-software/optimized-routines/tree/master/math/test) tool in [AOR](https://github.com/ARM-software/optimized-routines/tree/master), with particular focus to the [ulp.h](https://github.com/ARM-software/optimized-routines/blob/master/math/test/ulp.h) file. Note this tool also handles special cases and considers the effect of different rounding modes in the ULP error. \ No newline at end of file +If you are interested in diving into the full implementation of the ULP error, you can consult the [tester](https://github.com/ARM-software/optimized-routines/tree/master/math/test) tool in [AOR](https://github.com/ARM-software/optimized-routines/tree/master), with particular focus to the [ulp.h](https://github.com/ARM-software/optimized-routines/blob/master/math/test/ulp.h) file. 
Note this tool also handles special cases and considers the effect of different rounding modes in the ULP error. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/ulp.md b/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/ulp.md index d37302d6bd..1f0e77fde5 100644 --- a/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/ulp.md +++ b/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/ulp.md @@ -1,24 +1,26 @@ --- -title: Units in the Last Place (ULP) +title: Units in the last place (ULP) weight: 3 ### FIXED, DO NOT MODIFY layout: learningpathall --- -# ULP +## What is ULP? -Units in the Last Place (ULP) is the distance between two adjacent floating-point numbers at a given value, representing the smallest possible change in that number's representation. +Units in the last place (ULP) is the distance between two adjacent floating-point numbers at a given value. It represents the smallest possible change in a number's representation at that magnitude. 
-It is a property of a number and can be calculated with the following expression: +ULP is a function of the number's exponent and can be calculated with the following expression: ```output ULP(x) = nextafter(x, +inf) - x ``` -Building on the example shown in the previous section: +## ULP example: binary model -Fixed `B=2, p=3, e^max=2, e^min=-1` +Building on the example from the previous section: + +Fixed `B = 2, p = 3, e^max = 2, e^min = -1` | Significand | × 2⁻¹ | × 2⁰ | × 2¹ | × 2² | |-------------|-------|------|------|------| @@ -27,7 +29,7 @@ Fixed `B=2, p=3, e^max=2, e^min=-1` | 1.10 (1.5) | 0.75 | 1.5 | 3.0 | 6.0 | | 1.11 (1.75) | 0.875 | 1.75 | 3.5 | 7.0 | -Based on the above definition, the ULP value for the numbers in this set can be computed as follows: +Based on the above definition, you can compute the ULP values for the numbers in this set as follows: ``` ULP(0.625) = nextafter(0.625, +inf) - 0.625 = 0.75-0.625 = 0.125 @@ -36,21 +38,25 @@ ULP(0.625) = nextafter(0.625, +inf) - 0.625 = 0.75-0.625 = 0.125 ULP(4.0) = 1.0 ``` -As the exponent of `x` grows, `ULP(x)` also increases exponentially; that is, the spacing between floating points becomes larger. +As the exponent of `x` increases, `ULP(x)` increases exponentially. That is, the spacing between floating-point values grows with the magnitude of x. Numbers with the same exponent have the same ULP. -For normalized IEEE-754 floats, a similar behavior is observed: the distance between two adjacent representable values — i.e., ULP(x) — is a power of two that depends only on the exponent of x. +## ULP in IEEE-754 + +For normalized IEEE-754 floating-point numbers, a similar behavior is observed: the distance between two adjacent representable values — that is, ULP(x) — is a power of two that depends only on the exponent of x. 
-Hence, another expression used to calculate the ULP of normalized Floating Point numbers is: +### Optimized expression + +A faster, commonly used expression for ULP is: ``` ULP(x) = 2^(e-p+1) ``` -where: -* `e` is the exponent (in the IEEE-754 definition of single precision this is `E-127`) -* `p` is the precision +Where: +* `e` is the unbiased exponent (in the IEEE-754 definition of single precision this is `E-127`) +* `p` is the precision (24 for IEEE-754 single-precision, including the implicit leading bit) When computing the ULP of IEEE-754 floats, this expression becomes: ``` @@ -65,21 +71,19 @@ Note that for denormal numbers, the latter expression does not apply. In single precision as defined in IEEE-754, the smallest positive subnormal is: ``` -min_pos_denormal = 2 ^ -23 x 2 ^ -126 = 2^-149 +min_pos_denormal = 2⁻²³ × 2⁻¹²⁶ = 2⁻¹⁴⁹ ``` The second smallest is: ``` -second_min_pos_denormal = 2 ^ -22 x 2 ^ -126 = 2^-148 = 2*2^-149 +second_min_pos_denormal = 2⁻²² × 2⁻¹²⁶ = 2⁻¹⁴⁸ = 2 × 2⁻¹⁴⁹ ``` -and so on... - -The denormal numbers are evenly spaced by `2^-149`. +Thus, all denormal numbers are evenly spaced by `2^-149`. {{% /notice %}} -## ULP implementation +## ULP implementation in C Below is an example of an implementation of the ULP function of a number. @@ -99,13 +103,12 @@ static inline uint32_t asuint(float x) { // Compute exponent of ULP spacing at x static inline int ulpscale(float x) { - //recover the biased exponent E + // Recover the biased exponent E int e = asuint(x) >> 23 & 0xff; if (e == 0) e++; // handle subnormals - // get exponent of the ULP - // e-p = E - 127 -23 + // Compute the ULP exponent: e - p + 1 = E - 127 - 23 return e - 127 - 23; } @@ -118,10 +121,10 @@ static float ulp(float x) { There are three key functions in this implementation: * the `asuint(x)` function reinterprets the bit pattern of a float as a 32-bit unsigned integer, allowing the extraction of specific bit fields such as the exponent. 
* the `ulpscale(x)` function returns the base-2 exponent of the ULP spacing at a given float value x, which is the result of `log2(ULP(x))`. The `e` variable in this function corresponds to the quantity E previously mentioned (the bitwise value of the exponent). -* the `scalbnf(m, n)` function (a standard function declared in math.h) efficiently evaluates `m x 2^n`. +* the `scalbnf(m, n)` function (a standard function declared in math.h) efficiently evaluates `m × 2^n`. -Below is an example which uses the `ulp()` function. +Here's an example program that calls `ulp()` to compute the spacing near a float value. Use a text editor to save the code below in a file named `ulp.c`. diff --git a/content/learning-paths/servers-and-cloud-computing/onnx/_demo.md b/content/learning-paths/servers-and-cloud-computing/onnx/_demo.md new file mode 100644 index 0000000000..1c9a1872e9 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/onnx/_demo.md @@ -0,0 +1,57 @@ +--- +title: Run a Phi-4-mini chatbot powered by ONNX Runtime +weight: 2 + +overview: | + This Learning Path shows you how to use a 32-core Azure Dpls_v6 instance powered by an Arm Neoverse N2 CPU to build a simple chatbot that you can use to serve a small number of concurrent users. + + This architecture is suitable for deploying the latest Generative AI technologies with RAG capabilities using their existing CPU compute capacity and deployment pipelines. + + The demo uses the ONNX runtime, which Arm has integrated with KleidiAI. Further optimizations are achieved by using the smaller Phi-4-mini model, which has been optimized at INT4 quantization to minimize memory usage. + + Chat with the LLM below to see the performance for yourself, and then follow the Learning Path to build your own Generative AI service on Arm Neoverse. + + +demo_steps: + - Type and send a message to the chatbot. + - Receive the chatbot's reply. 
+ - View performance statistics demonstrating how well Azure Cobalt 100 instances run LLMs. + +diagram: config-diagram-dark.png +diagram_blowup: config-diagram.png + +terms_and_conditions: demo-terms-and-conditions.txt + +prismjs: true # enable prismjs rendering of code snippets + + +rag_data_cutoff_date: 2025/01/17 + +title_chatbot_area: Phi-4-mini Chatbot Demo + +prismjs: true + + + +### Specific details to this demo +# ================================================================================ +tps_max: 30 # sets stat visuals for tps +tps_ranges: + - name: Low + context: Around the average human reading rate of 3-5 words per second. + color: var(--arm-green) + min: 0 + max: 5 + - name: High + context: This is significantly higher than the average human reading rate of 5 words per second, delivering a stable and usable user chatbot experience from the Phi-4-mini LLM using the ONNX runtime. + color: var(--arm-green) + min: 5 + max: 1000 + +### FIXED, DO NOT MODIFY +# ================================================================================ +demo_template_name: phi_onnx_chatbot_demo # allows the 'demo.html' partial to route to the correct Configuration and Demo/Stats sub partials for page render. +weight: 2 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. 
+--- diff --git a/content/learning-paths/servers-and-cloud-computing/onnx/_index.md b/content/learning-paths/servers-and-cloud-computing/onnx/_index.md index 37bdde2483..27dd6e766e 100644 --- a/content/learning-paths/servers-and-cloud-computing/onnx/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/onnx/_index.md @@ -7,7 +7,7 @@ who_is_this_for: This is an advanced topic for developers, ML engineers, and clo learning_objectives: - Quantize and run the Phi-4-mini model with ONNX Runtime on Azure. - - Analyze performance on Arm Neoverse-N2 based Azure Cobalt 100 VMs. + - Analyze performance on Arm Neoverse N2 based Azure Cobalt 100 VMs. prerequisites: - An [Arm-based instance](/learning-paths/servers-and-cloud-computing/csp/) from an appropriate cloud service provider. This Learning Path has been tested on an Azure Cobalt 100 virtual machine. diff --git a/content/learning-paths/servers-and-cloud-computing/onnx/analysis.md b/content/learning-paths/servers-and-cloud-computing/onnx/analysis.md index 68e53a2335..8939fad493 100644 --- a/content/learning-paths/servers-and-cloud-computing/onnx/analysis.md +++ b/content/learning-paths/servers-and-cloud-computing/onnx/analysis.md @@ -1,6 +1,6 @@ --- title: Interact with the Phi-4-mini Chatbot -weight: 4 +weight: 5 layout: learningpathall --- diff --git a/content/learning-paths/servers-and-cloud-computing/onnx/chatbot-icon.png b/content/learning-paths/servers-and-cloud-computing/onnx/chatbot-icon.png new file mode 100644 index 0000000000..6560d76baa Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/onnx/chatbot-icon.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/onnx/chatbot.md b/content/learning-paths/servers-and-cloud-computing/onnx/chatbot.md index 9ac96fa8c4..005c213d7f 100644 --- a/content/learning-paths/servers-and-cloud-computing/onnx/chatbot.md +++ b/content/learning-paths/servers-and-cloud-computing/onnx/chatbot.md @@ -1,6 +1,6 @@ --- title: 
Run the Chatbot Server -weight: 3 +weight: 4 layout: learningpathall --- diff --git a/content/learning-paths/servers-and-cloud-computing/onnx/config-diagram-dark.png b/content/learning-paths/servers-and-cloud-computing/onnx/config-diagram-dark.png new file mode 100644 index 0000000000..0610f5d0b0 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/onnx/config-diagram-dark.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/onnx/config-diagram.png b/content/learning-paths/servers-and-cloud-computing/onnx/config-diagram.png new file mode 100644 index 0000000000..997100ddc6 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/onnx/config-diagram.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/onnx/setup.md b/content/learning-paths/servers-and-cloud-computing/onnx/setup.md index 752251b29a..fe72af9ead 100644 --- a/content/learning-paths/servers-and-cloud-computing/onnx/setup.md +++ b/content/learning-paths/servers-and-cloud-computing/onnx/setup.md @@ -2,7 +2,7 @@ # User change title: "Build ONNX Runtime and set up the Phi-4-mini Model" -weight: 2 +weight: 3 # Do not modify these elements layout: "learningpathall" @@ -93,6 +93,7 @@ Clone and build the `onnxruntime-genai` repository: cd ~ cp ./onnxruntime/build/Linux/Release/install/include/onnxruntime/onnxruntime_float16.h ./onnxruntime/build/Linux/Release/install/include/onnxruntime_float16.h cp ./onnxruntime/build/Linux/Release/install/include/onnxruntime/onnxruntime_c_api.h ./onnxruntime/build/Linux/Release/install/include/onnxruntime_c_api.h +cp ./onnxruntime/build/Linux/Release/install/include/onnxruntime/onnxruntime_ep_c_api.h ./onnxruntime/build/Linux/Release/install/include/onnxruntime_ep_c_api.h git clone https://github.com/microsoft/onnxruntime-genai.git cd onnxruntime-genai python3 build.py --config Release --update --ort_home ../onnxruntime/build/Linux/Release/install diff --git 
a/data/stats_current_test_info.yml b/data/stats_current_test_info.yml index dcdd3fd9a1..27aae38555 100644 --- a/data/stats_current_test_info.yml +++ b/data/stats_current_test_info.yml @@ -1,5 +1,5 @@ summary: - content_total: 380 + content_total: 382 content_with_all_tests_passing: 0 content_with_tests_enabled: 61 sw_categories: diff --git a/data/stats_weekly_data.yml b/data/stats_weekly_data.yml index 196860c0d1..682ee0df3a 100644 --- a/data/stats_weekly_data.yml +++ b/data/stats_weekly_data.yml @@ -6557,3 +6557,114 @@ avg_close_time_hrs: 0 num_issues: 17 percent_closed_vs_total: 0.0 +- a_date: '2025-07-07' + content: + automotive: 2 + cross-platform: 32 + embedded-and-microcontrollers: 42 + install-guides: 102 + iot: 6 + laptops-and-desktops: 38 + mobile-graphics-and-gaming: 34 + servers-and-cloud-computing: 126 + total: 382 + contributions: + external: 97 + internal: 509 + github_engagement: + num_forks: 30 + num_prs: 10 + individual_authors: + adnan-alsinan: 2 + alaaeddine-chakroun: 2 + albin-bernhardsson: 1 + alex-su: 1 + alexandros-lamprineas: 1 + andrew-choi: 2 + andrew-kilroy: 1 + annie-tallund: 4 + arm: 3 + arnaud-de-grandmaison: 4 + aude-vuilliomenet: 1 + avin-zarlez: 1 + barbara-corriero: 1 + basma-el-gaabouri: 1 + ben-clark: 1 + bolt-liu: 2 + brenda-strech: 1 + chaodong-gong: 1 + chen-zhang: 1 + christophe-favergeon: 1 + christopher-seidl: 7 + cyril-rohr: 1 + daniel-gubay: 1 + daniel-nguyen: 2 + david-spickett: 2 + dawid-borycki: 33 + diego-russo: 2 + dominica-abena-o.-amanfo: 1 + elham-harirpoush: 2 + florent-lebeau: 5 + "fr\xE9d\xE9ric--lefred--descamps": 2 + gabriel-peterson: 5 + gayathri-narayana-yegna-narayanan: 1 + georgios-mermigkis: 1 + geremy-cohen: 3 + gian-marco-iodice: 1 + graham-woodward: 1 + han-yin: 1 + iago-calvo-lista: 1 + james-whitaker: 1 + jason-andrews: 103 + joana-cruz: 1 + joe-stech: 6 + johanna-skinnider: 2 + jonathan-davies: 2 + jose-emilio-munoz-lopez: 1 + julie-gaskin: 5 + julio-suarez: 6 + jun-he: 1 + kasper-mecklenburg: 1 + 
kieran-hejmadi: 12 + koki-mitsunami: 2 + konstantinos-margaritis: 8 + kristof-beyls: 1 + leandro-nunes: 1 + liliya-wu: 1 + mark-thurman: 1 + masoud-koleini: 1 + mathias-brossard: 1 + michael-hall: 5 + na-li: 1 + nader-zouaoui: 2 + nikhil-gupta: 1 + nina-drozd: 1 + nobel-chowdary-mandepudi: 6 + odin-shen: 7 + owen-wu: 2 + pareena-verma: 46 + paul-howard: 3 + peter-harris: 1 + pranay-bakre: 5 + preema-merlin-dsouza: 1 + przemyslaw-wirkus: 2 + qixiang-xu: 1 + rin-dobrescu: 1 + roberto-lopez-mendez: 2 + ronan-synnott: 45 + shuheng-deng: 1 + thirdai: 1 + tianyu-li: 2 + tom-pilar: 1 + uma-ramalingam: 1 + varun-chari: 2 + visualsilicon: 1 + willen-yang: 1 + ying-yu: 2 + yiyang-fan: 1 + zach-lasiuk: 2 + zhengjun-xing: 2 + issues: + avg_close_time_hrs: 0 + num_issues: 19 + percent_closed_vs_total: 0.0 diff --git a/themes/arm-design-system-hugo-theme/layouts/partials/demo-components/llm-chatbot/javascript--llm-chatbot.html b/themes/arm-design-system-hugo-theme/layouts/partials/demo-components/llm-chatbot/javascript--llm-chatbot.html index 2c183ce6db..6dda79922c 100644 --- a/themes/arm-design-system-hugo-theme/layouts/partials/demo-components/llm-chatbot/javascript--llm-chatbot.html +++ b/themes/arm-design-system-hugo-theme/layouts/partials/demo-components/llm-chatbot/javascript--llm-chatbot.html @@ -440,6 +440,7 @@ all_messages_div.removeChild(all_messages_div.firstChild); } {{ else if eq .Params.demo_template_name "llm_chatbot_first_demo" }} + {{ else if eq .Params.demo_template_name "phi_onnx_chatbot_demo" }} {{ else }} {{ end }} @@ -629,6 +630,9 @@ {{ else if eq .Params.demo_template_name "llm_chatbot_first_demo" }} {{ $server_location = getenv "HUGO_LLM_API" | base64Encode }} console.log('Using LLM API.'); + {{ else if eq .Params.demo_template_name "phi_onnx_chatbot_demo" }} + {{ $server_location = getenv "HUGO_PHI_ONNX_LLM_API" | base64Encode }} + console.log('Using HUGO_PHI_ONNX_LLM_API.'); {{ else }} console.log('No server location provided.'); {{ end }} @@ -674,17 
+678,17 @@ // Update ping & popup status if (data.cluster_utilization == 'normal') { - showPopupPostConnection('Connected to Arm Neoverse V2 based LLM! Start chatting now.',"success"); + showPopupPostConnection('Connected to Arm Neoverse based LLM! Start chatting now.',"success"); //ping_txt.textContent = `Ping: ${ping}` traffic_txt.textContent = 'Server traffic: Low' } else if (data.cluster_utilization == 'high') { - showPopupPostConnection('Connected to Arm Neoverse V2 based LLM! Traffic is high, delays may occur. Start chatting now.',"success"); + showPopupPostConnection('Connected to Arm Neoverse based LLM! Traffic is high, delays may occur. Start chatting now.',"success"); //ping_txt.textContent = `Ping: ${ping}` traffic_txt.textContent = `Server traffic: High` } else if (data.cluster_utilization == 'at-limit') { - showPopupPostConnection('Connected to Arm Neoverse V2 based LLM! Traffic is high, delays may occur. Start chatting now.',"warning"); + showPopupPostConnection('Connected to Arm Neoverse based LLM! Traffic is high, delays may occur. 
Start chatting now.',"warning"); //ping_txt.textContent = `Ping: ${ping}` traffic_txt.textContent = `Server traffic: High` } diff --git a/themes/arm-design-system-hugo-theme/layouts/partials/learning-paths/demo.html b/themes/arm-design-system-hugo-theme/layouts/partials/learning-paths/demo.html index 7e0fbe1d81..3745550619 100644 --- a/themes/arm-design-system-hugo-theme/layouts/partials/learning-paths/demo.html +++ b/themes/arm-design-system-hugo-theme/layouts/partials/learning-paths/demo.html @@ -24,6 +24,9 @@ {{else if eq .Params.demo_template_name "whisper_audio_demo"}} {{/* {{partial "demo-components/config-params-only.html" .}} */}} +{{else if eq .Params.demo_template_name "phi_onnx_chatbot_demo"}} + {{/* {{partial "demo-components/config-params-only.html" .}} */}} + {{else if eq .Params.demo_template_name "kubectl_demo"}} {{partial "demo-components/config-param-and-file.html" .}} @@ -42,6 +45,10 @@ {{partial "demo-components/llm-voice-transcriber/demo-stats--llm-voice-transcriber.html" .}} {{partial "demo-components/llm-voice-transcriber/javascript--llm-voice-transcriber.html" .}} +{{else if eq .Params.demo_template_name "phi_onnx_chatbot_demo"}} + {{partial "demo-components/llm-chatbot/demo-stats--llm-chatbot.html" .}} + {{partial "demo-components/llm-chatbot/javascript--llm-chatbot.html" .}} + {{else if eq .Params.demo_template_name "kubectl_demo"}} {{partial "demo-components/demo--kubectl.html" .}}