diff --git a/.gitpod.yml b/.gitpod.yml deleted file mode 100644 index a6374e679d..0000000000 --- a/.gitpod.yml +++ /dev/null @@ -1,12 +0,0 @@ -tasks: - - name: Install Hugo - before: brew install hugo - init: echo "Your version of Hugo is `hugo version`" - command: | - hugo - bin/pagefind --site "public" --output-subdir ../static/pagefind - hugo server -D -F --baseURL $(gp url 1313) --liveReloadPort=443 --appendPort=false --bind=0.0.0.0 - -ports: - - port: 1313 - onOpen: open-preview diff --git a/.wordlist.txt b/.wordlist.txt index 79c3f66d0f..9606eb651b 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -4812,4 +4812,141 @@ learnt lof BalenaOS balenaCloud - +MX +ARMFp +AndroidDemo +ApacheBench +ArmHalideAndroidDemo +Autoscheduler +BGR +BVM +BenchmarkBubbleSort +BenchmarkQuickSort +Botspot +BoundaryConditions +BubbleSort +ByteBuffer +DGGML +DNQZJ +DTLB +EPYC +ETag +EVEX +Esc +FuseAll +FuseBlurAndThreshold +GGG +GOPATH +GOROOT +GTK +GetByteArrayElements +Golang +Golang’s +HWC +Halide +Halide’s +ImageParam +Istio +KEDA +Kedify +Kedify’s +LLC +LLE +MPix +NIC’s +Netty +NoRuntime +OpenBMC’s +Parallelization +QCOW +QuickSort +RDom +RGBRGBRGB +RRR +RamFB +Recomputation +ReleaseByteArrayElements +Remmina +Roubalik +SAXPY +ScaledObject +Scaler +SetByteArrayRegion +SoL +Sor +Sysoev +TinyRPS +UFW +VLA +VTOR +VirtualService +WindowsOnArm +XMM +YMM +YUV +ZMM +Zbynek +adaptively +allocs +apiKey +armhalideandroiddemo +autounattend +autowiring +benchmarkHttpResponse +benchmem +blurThresholdImage +bvm +clusterName +coroutine +createBitmapFromGrayBytes +cv +extractGrayScaleBytes +fallbacks +firstlogin +golang +gosort +goweb +halide +httpd +inBytes +inlines +inputBuffer +insturction +jbyteArray +keda +kedify +keypress +kts +llmexport +loadImageFromAssets +microarchitectures +minikube +oOer +orgId +outputArray +outputBuffer +parallelization +parallelize +parallelized +parallelizes +preallocation +precomputing +qcow +recomputation +reconfig +reconversion +refetching +req +scaler +scalers +sprintf +stdev +thresholded +underperformed +underperforms +unvectorized +uop +walkthrough +warmups +xo +yi \ No newline at end of file diff --git a/assets/contributors.csv b/assets/contributors.csv index ef6f06ea90..a149228b12 100644 --- a/assets/contributors.csv +++ b/assets/contributors.csv @@ -102,3 +102,5 @@ Ker Liu,,,,, Rui Chang,,,,, Alejandro Martinez Vicente,Arm,,,, Mohamad Najem,Arm,,,, +Zenon Zhilong Xiu,Arm,,zenon-zhilong-xiu-491bb398,, +Zbynek Roubalik,Kedify,,,, diff --git a/content/install-guides/dcperf.md b/content/install-guides/dcperf.md index 024965bb54..e0d32c80e1 100644 --- a/content/install-guides/dcperf.md +++ b/content/install-guides/dcperf.md @@ -9,7 +9,7 @@ additional_search_terms: - Neoverse test_images: -- ubuntu:22.04 +- ubuntu:latest test_maintenance: false layout: installtoolsall @@ -23,12 +23,12 @@ weight: 1 DCPerf is an open-source benchmarking and microbenchmarking suite originally developed by Meta. It faithfully replicates the characteristics of general-purpose data center workloads, with particular attention to microarchitectural fidelity. DCPerf stands out for accurate simulation of behaviors such as cache misses and branch mispredictions, which are details that many other benchmarking tools overlook. -You can use DCPerf to generate performance data to inform procurement decisions, and for regression testing to detect changes in the environment, such as kernel and compiler changes. 
+You can use DCPerf to generate performance data to inform procurement decisions, and for regression testing to detect changes in the environment, such as kernel and compiler changes. -DCPerf runs on Arm-based servers. The examples below have been tested on an AWS `c7g.metal` instance running Ubuntu 22.04 LTS. +DCPerf runs on Arm-based servers. The examples below have been tested on an AWS `c7g.metal` instance running Ubuntu 22.04 LTS. {{% notice Note %}} -When running on a server provided by a cloud service, you have limited access to some parameters, such as UEFI settings, which can affect performance. +When running on a server provided by a cloud service, you have limited access to some parameters, such as UEFI settings, which can affect performance. {{% /notice %}} ## Install prerequisites @@ -40,7 +40,7 @@ sudo apt update sudo apt install -y python-is-python3 python3-pip python3-venv git ``` -It is recommended that you install Python packages in a Python virtual environment. +It is recommended that you install Python packages in a Python virtual environment. Set up your virtual environment: @@ -48,7 +48,7 @@ Set up your virtual environment: python3 -m venv venv source venv/bin/activate ``` -If requested, restart the recommended services. +If requested, restart the recommended services. Install the required packages: @@ -65,9 +65,9 @@ cd DCPerf ## Running the MediaWiki benchmark -DCPerf offers many benchmarks. See the official documentation for the benchmark of your choice. +DCPerf offers many benchmarks. See the official documentation for the benchmark of your choice. -One example is the MediaWiki benchmark, designed to faithfully reproduce the workload of the Facebook social networking site. +One example is the MediaWiki benchmark, designed to faithfully reproduce the workload of the Facebook social networking site. Install HipHop Virtual Machine (HHVM), a virtual machine used to execute the web application code: @@ -95,14 +95,14 @@ Compiler: 1704922878_080332982 Repo schema: 4239d11395efb06bee3ab2923797fedfee64738e ``` -Confirm security-enhanced Linux (SELinux) is disabled with the following commands: +Confirm security-enhanced Linux (SELinux) is disabled with the following commands: ```bash sudo apt install selinux-utils getenforce ``` -You should see the following response: +You should see the following response: ```output Disabled @@ -181,7 +181,7 @@ The metrics file contains several key performance indicators from the benchmark These metrics help you evaluate the performance and reliability of the system under test. Higher values for successful requests and RPS, and lower response times, generally indicate better performance. The score provides a single value for easy comparison across runs or systems. 
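+
+If you want to compare the score across runs, one option is to keep the report from each run and pull out the score line. The snippet below is only a sketch: it assumes you have saved the benchmark output of two runs to log files and that the summary line contains the word `score` - the file names and the `grep` pattern are placeholders to adapt to your actual DCPerf report format.
+
+```bash
+# Hypothetical log file names; replace them with your own saved outputs
+grep -i "score" run-before.log run-after.log
+```
+
+This prints the matching summary lines from both files side by side, which is usually enough for a quick regression check between two configurations.
+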
-## Next steps +## Next steps These are some activities you might like to try next: diff --git a/content/learning-paths/automotive/_index.md b/content/learning-paths/automotive/_index.md index 97fb52787c..43a8e96e60 100644 --- a/content/learning-paths/automotive/_index.md +++ b/content/learning-paths/automotive/_index.md @@ -12,10 +12,10 @@ title: Automotive weight: 4 subjects_filter: - Containers and Virtualization: 3 -- Performance and Architecture: 5 +- Performance and Architecture: 6 operatingsystems_filter: - Baremetal: 1 -- Linux: 7 +- Linux: 8 - macOS: 1 - RTOS: 1 tools_software_languages_filter: @@ -23,10 +23,11 @@ tools_software_languages_filter: - Arm Zena CSS: 1 - C: 2 - C++: 1 -- Clang: 2 +- Clang: 3 - DDS: 1 - Docker: 2 -- GCC: 2 +- FVP: 1 +- GCC: 3 - Python: 2 - Raspberry Pi: 1 - ROS 2: 3 diff --git a/content/learning-paths/automotive/zenacssdebug/_index.md b/content/learning-paths/automotive/zenacssdebug/_index.md index 9539aaaf9d..9c57db90fa 100644 --- a/content/learning-paths/automotive/zenacssdebug/_index.md +++ b/content/learning-paths/automotive/zenacssdebug/_index.md @@ -1,24 +1,21 @@ --- title: Debug Arm Zena CSS Reference Software Stack with Arm Development Studio -draft: true -cascade: - draft: true minutes_to_complete: 60 -who_is_this_for: This is an introductory topic for software developers who wish to use Arm Development Studio to explore and debug the Arm Zena CSS Reference Software Stack. +who_is_this_for: This introductory topic is for software developers who want to use Arm Development Studio to explore and debug the Arm Zena Compute Subsystem (CSS) Reference Software Stack on a Fixed Virtual Platform (FVP). -learning_objectives: - - Set up debug configuration for the Arm Zena CSS FVP - - Debug Runtime Security Engine (RSE) from boot time - - Debug Safety Island (SI) - - Debug Linux OS on Primary Compute cores +learning_objectives: + - Set up and save a debug configuration for the Arm Zena CSS FVP + - Start Runtime Security Engine (RSE) debug at reset and step through early boot + - Attach to and debug Safety Island (SI) firmware + - Attach to the Linux kernel on the primary compute cores and debug user space processes prerequisites: - - Ubuntu 22.04 host machine - - You will need [Arm Development Studio 2024.1 (or later)](/install-guides/armds) and an appropriate license - - A basic understanding of the Arm Zena CSS software stack and Arm processors + - Ubuntu 22.04 host machine + - Arm Development Studio 2024.1 or later with a valid license - for support see the [Install Guide for ADS](/install-guides/armds) + - Basic understanding of the Arm Zena CSS software stack, Armv8-A/Armv9-A cores, and Linux author: Ronan Synnott @@ -26,24 +23,24 @@ author: Ronan Synnott skilllevels: Introductory subjects: Performance and Architecture armips: - - Cortex-A - - Cortex-R + - Cortex-A + - Cortex-R operatingsystems: - - Linux + - Linux tools_software_languages: - - Arm Development Studio - - Arm Zena CSS - + - Arm Development Studio + - Arm Zena CSS + - FVP further_reading: - - resource: - title: Arm Zena Compute System (CSS) - link: https://developer.arm.com/Compute%20Subsystems/Arm%20Zena%20Compute%20Subsystem - type: website - - resource: - title: Arm Development Studio - link: https://developer.arm.com/Tools%20and%20Software/Arm%20Development%20Studio - type: website + - resource: + title: Arm Zena Compute Subsystem (CSS) + link: https://developer.arm.com/Compute%20Subsystems/Arm%20Zena%20Compute%20Subsystem + type: website + - resource: + title: Arm Development 
Studio + link: https://developer.arm.com/Tools%20and%20Software/Arm%20Development%20Studio + type: website ### FIXED, DO NOT MODIFY diff --git a/content/learning-paths/automotive/zenacssdebug/config.md b/content/learning-paths/automotive/zenacssdebug/config.md index 81e6367f40..b9d6dbf12f 100644 --- a/content/learning-paths/automotive/zenacssdebug/config.md +++ b/content/learning-paths/automotive/zenacssdebug/config.md @@ -1,6 +1,6 @@ --- # User change -title: "Model Configuration" +title: "Configure the model" weight: 4 # 1 is first, 2 is second, etc. @@ -8,56 +8,57 @@ weight: 4 # 1 is first, 2 is second, etc. layout: "learningpathall" --- -# Debug Configuration +## Set up a debug configuration for the Zena CSS FVP -Arm Development Studio requires a `Debug Configuration` of the target that it will connect to. +Now you'll walk through setting up an Arm Development Studio debug configuration for the Zena CSS FVP using the Iris interface. This is a fast, reliable path to a working configuration. -As of Arm Development Studio version 2025.0, there is no such configuration provided 'out-of-the-box' for the Zena CSS FVP. However creating such a configuration is straight forward. +As of Arm Development Studio 2025.0, there is no out-of-the-box configuration for the Zena CSS FVP. Creating one, however, is straightforward. -See the Arm Development Studio [Getting Started Guide](https://developer.arm.com/documentation/101469/latest/Migrating-from-DS-5-to-Arm-Development-Studio/Connect-to-new-or-custom-models) for full instructions, but they are also summarized below. +For full guidance, see the Arm Development Studio [Getting Started Guide](https://developer.arm.com/documentation/101469/latest/Migrating-from-DS-5-to-Arm-Development-Studio/Connect-to-new-or-custom-models). A concise, task-focused version is below. -## Launch FVP +## Launch the FVP (with Iris) -As per previous section, launch FVP with the Iris server enabled: +Launch the FVP with the Iris server enabled: -```command +```bash kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100" ``` -or if connecting to the FVP remotely: +If connecting to the FVP remotely, you can use this command: -```command +```bash kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100 -A" ``` + {{% notice Note %}} -A local connection is assumed for the remainder of this learning path. +This example modeled below uses a local connection for the remaining steps. {{% /notice %}} -## Configuration Database - -Debug Configurations are stored in a configuration database. You must first create a local database in which to store the configuration. - -Navigate to `File` > `New` > `Other`, and then select `Configuration Database` > `Configuration Database` from the drop-down list. +## Create a configuration database in Arm Development Studio -Click `Next`. Give the Database a name, and click `Finish`. +Debug configurations are stored in a configuration database. Create a local database to store your model configuration: -## Debug Configuration +- In Arm Development Studio, go to **File > New > Other**. +- Select **Configuration Database > Configuration Database**. +- Click **Next**, enter a **Name**, then click **Finish**. -Navigate to the same wizard as above, and select `Model Configuration`. +## Create a model configuration for Zena CSS FVP (Iris) -Click `Next`, and you will be prompted to select the above `Configuration Database`. 
Click `Next` again, and you will be prompted to select a Model Interface. +- Open the same wizard (**File > New > Other**), then choose **Configuration Database > Model Configuration**. +- Click **Next**, select the **Configuration Database** you created, then click **Next**. +- For **Model Interface**, choose **Iris**, then click **Next**. +- Choose **Browse for model running on local host**. The debugger detects and interrogates the FVP. +- If connecting remotely, choose **Connect to model running on either local or remote host** and provide the host and port. -Select `Iris` from the pulldown, and click `Next`. +Arm Development Studio generates a `model.mdf` file that enumerates all CPUs in the FVP. -You will then be prompted to locate the model to connect to. +Optionally, update **Manufacturer Name** (for example, `Arm`) and **Platform Name** (for example, `Zena_CSS_FVP`). Then **Save** and **Import** the model into the configuration database. -Select `Browse for model running on local host`. The FVP will be detected and interrogated by the debugger. - -{{% notice Note %}} -Use `Connect to model running on either local or remote host` if connecting remotely. +{{% notice Tip %}} +If the FVP is not detected, verify the Iris server is running on the expected port (`7100` by default) and that your firewall allows local connections. For remote connections, confirm the host is reachable and the port is open. {{% /notice %}} A `model.mdf` file will be created that identifies all CPUs within the FVP. -You can change the `Manufacturer Name` and `Platform Name` to something more meaningful (such as `Arm` and `Zena_CSS_FVP`), then `Save`, and `Import` into the configuration database. +You can change the **Manufacturer Name** and **Platform Name** to something more meaningful (such as `Arm` and `Zena_CSS_FVP`), then **Save**, and **Import** into the configuration database. The debugger is now aware of the FVP and you are ready to debug. diff --git a/content/learning-paths/automotive/zenacssdebug/connect.md b/content/learning-paths/automotive/zenacssdebug/connect.md index 921f80b467..ebedbe8acd 100644 --- a/content/learning-paths/automotive/zenacssdebug/connect.md +++ b/content/learning-paths/automotive/zenacssdebug/connect.md @@ -1,6 +1,6 @@ --- # User change -title: "Debug Connections" +title: "Create debug connections" weight: 5 # 1 is first, 2 is second, etc. @@ -8,68 +8,79 @@ weight: 5 # 1 is first, 2 is second, etc. layout: "learningpathall" --- -## Debug Connections +## Overview You are now ready to create debug connections for each of the sub-systems within Zena CSS. In this section you will create the connections, which will be subsequently enhanced in the following section. You may prefer to fully set up one such connection before moving to others. -Arm Development Studio has full support for Heterogeneous systems such as Zena CSS, and so you can connect to all processors simultaneously. +Arm Development Studio has full support for heterogeneous systems such as Zena CSS, and so you can connect to all processors simultaneously. -### Debug connection project +## Create a project for connection files -First, create a project to store these connections (`.launch` files) in. +First, create a project to store these connections (`.launch` files). -Select `File` > `New...` > `Project` > `General` > `Project`, and give it a meaningful name (`Connections`). +Select **File** > **New...** > **Project** > **General** > **Project**, and give it a meaningful name (for example, `Connections`). 
-### RSE (Cortex-M55) +## Create an RSE (Cortex-M55) model connection -Runtime Security Engine (RSE) is based on [Cortex-M55](https://developer.arm.com/Processors/Cortex-M55) core and is a security subsystem fulfilling the role of Root of Trust. +Runtime Security Engine (RSE) is based on the [Cortex-M55](https://developer.arm.com/Processors/Cortex-M55) core and is a security subsystem fulfilling the role of Root of Trust. -Select `File` > `New` > `Model Connection`. +Select **File** > **New** > **Model Connection**. {{% notice Note %}} -You can also use `File` > `New` > `Other` > `Arm Debugger` > `Model Connection`, or - -`Create a debug connection...` shortcut in the `Debug Control` pane. +You can also use **File** > **New** > **Other** > **Arm Debugger** > **Model Connection**, or the **Create a debug connection**... shortcut in the **Debug Control** pane. {{% /notice %}} -Specify a connection name (`RSE`), and associate with the above `Connections` project. Click `Next`. +Specify a connection name (`RSE`), and associate with the above `Connections` project. Click **Next**. -Locate the FVP based on the name you gave it previously (`Zena_CSS_FVP`). The text filter can help you locate it easily. +Locate the FVP based on the name you gave it previously (`Zena_CSS_FVP`). You can use the text filter to locate it quickly. -You will then be presented with the `Edit configuration` pane. In the `Connection` tab, scroll down to locate `Bare Metal Debug` > `Arm_Cortex-M55`. +You will then be presented with the **Edit configuration** pane. In the **Connection** tab, scroll down to locate **Bare Metal Debug** > **Arm_Cortex-M55**. -As you will be later launching the FVP with the software stack loaded, select `Connect to an already running model`. +As you will be later launching the FVP with the software stack loaded, select **Connect to an already running model**. -Assuming the same host will be running both the FVP and the debugger, specify the `Connection address` as the default `127.0.0.1:7100`. +Assuming the same host will be running both the FVP and the debugger, specify the **Connection address** as the default `127.0.0.1:7100`. {{% notice Note %}} -`127.0.0.1` is the same as `localhost`, that is the same host machine as is running the FVP. +`127.0.0.1` is the same as `localhost`, which targets the host running the FVP. For a remote FVP, specify the remote IP address and start the FVP with `-A`. Port `7100` is the default Iris port and can be adjusted if needed. +{{% /notice %}} + +Arm Development Studio creates `RSE.launch` inside the **Connections** project. + +## Create a Safety Island (Cortex-R82AE) model connection -It is also possible to connect to a remote host by specifying appropriate IP address, and launching FVP with the `-A` option. +The Safety Island is based on the [Cortex-R82AE](https://developer.arm.com/Processors/Cortex-R82AE) core and manages power, clocks, and CMN control. -`7100` is the default port number. You may need to change this if necessary. +Follow the same steps as for RSE, with this change: + +In **Edit configuration**, expand **Bare Metal Debug** and select **Arm_Cortex-R82AE**. + +{{% notice Tip %}} +To save time, copy `RSE.launch` to `SI.launch` and update the CPU selection to **Arm_Cortex-R82AE**. {{% /notice %}} -Click `Apply` to save the connection information, and `Close`. Observe that `RSE.launch` is created inside the `Connections` project. 
+## Create Primary compute (Cortex-A720AE) connections -### Safety Island (Cortex-R82AE) +Primary compute comprises four clusters intended to run a rich OS such as Linux. Each cluster has four [Cortex-A720AE](https://developer.arm.com/Processors/Cortex-A720AE) cores alongside a [DSU-120AE](https://developer.arm.com/Processors/DSU-120AE) DynamIQ Shared Unit. -The Safety Island is a subsystem based on [Cortex-R82AE](https://developer.arm.com/Processors/Cortex-R82AE) core. The software running on the Safety Island is responsible for power, clock and CMN control. +You will create two connections: one for bare-metal initialization and one with Linux kernel awareness for SMP debug. -The procedure to create this connection is very similar to the above, other than to select `Bare Metal Debug` > `Arm_Cortex-R82AE` from the drop-down. +### Primary init (bare metal, CPU0 only) -{{% notice %}} -For convenience you can copy-and-paste `RSE.launch` as `SI.launch` and just modify the CPU. -{{% /notice %}} +Create `Primary_init.launch`: + +- Select **File > New > Model Connection**. +- Select your `Zena_CSS_FVP` model. +- In **Edit configuration**, expand **Bare Metal Debug** and select **ARM_Cortex-A720AE_0** to attach to CPU0 only. This leaves other CPUs running. -### Primary Compute (Cortex-A720AE) +### Primary Linux (SMP, OS awareness) -The Primary Compute consists of four processor clusters to run a rich OS such as Linux. Each processor cluster includes four [Cortex-A720AE](https://developer.arm.com/Processors/Cortex-A720AE) cores and a [DSU-120AE](https://developer.arm.com/Processors/DSU-120AE) DynamIQ Shared Unit. +Create **Primary_Linux.launch** for Linux kernel debug with OS awareness: -The application processors will be debugged in an SMP configuration with Linux Kernel awareness. +- Use **File > New > Model Connection**. +- Select your **Zena_CSS_FVP** model. +- In **Edit configuration**, expand **Linux Kernel Debug** and choose **ARM_Cortex-A720AEx16 SMP Cluster 1**. + This connects to all 16 Cortex-A720AE processors described in the FVP. Only cores 0 to 3 are used by the default Linux configuration. -As shown above, create `Primary_init.launch` connection and scroll to `Bare Metal Debug` > `ARM_Cortex-A720AE_0`. This will connect to just CPU0, leaving the other CPUs free to run. +To learn more about OS awareness in Arm Debugger, see the [OS awareness documentation](https://developer.arm.com/documentation/101470/latest/Debugging-Embedded-Systems/About-OS-awareness). -To debug the Linux kernel you can make use of the [OS awareness](https://developer.arm.com/documentation/101470/latest/Debugging-Embedded-Systems/About-OS-awareness) feature of the Arm Debugger. -Create `Primary_Linux.launch` connection and scroll to `Linux Kernel Debug` > `ARM_Cortex-A720AEx16 SMP Cluster 1`. This will connect to all 16 `Cortex-A720AE` processors present in the FVP, though only cores 0-3 are used. diff --git a/content/learning-paths/automotive/zenacssdebug/launch.md b/content/learning-paths/automotive/zenacssdebug/launch.md index 5aba66e2c8..6422a11389 100644 --- a/content/learning-paths/automotive/zenacssdebug/launch.md +++ b/content/learning-paths/automotive/zenacssdebug/launch.md @@ -1,6 +1,6 @@ --- # User change -title: "Launch FVP" +title: "Launch the FVP" weight: 3 # 1 is first, 2 is second, etc. @@ -8,49 +8,46 @@ weight: 3 # 1 is first, 2 is second, etc. 
layout: "learningpathall" --- -## Launch FVP +## Start the FVP from the build environment -You can now launch the FVP within the virtual environment with the software stack loaded: +You can launch the FVP within the build environment with the software stack loaded: ```command kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose" ``` -Refer to the [documentation](https://arm-auto-solutions.docs.arm.com/en/v2.0/rd-aspen/user_guide/reproduce.html#run-the-fvp) for more details. -While you can continue to use this method to launch the FVP whilst debugging, this command does not enable the Iris debug server inside the model, and so will not be debuggable. -Additional command options are necessary. +See the [Arm Zena CSS User Guide](https://arm-auto-solutions.docs.arm.com/en/v2.0/rd-aspen/user_guide/reproduce.html#run-the-fvp) for further information. -You will use the following. See output of `FVP_RD_Aspen --help` for full list and explanation. Options are case-sensitive. +While you can continue to use this method during debugging, it does not enable the Iris debug server in the model, so the system cannot be debugged from Arm Development Studio. Additional command-line options are required. -| Option | Alias | Notes | -|---------------------- |--------- |---------------------------------------------- | -| `--iris-server` | `-I` | Start Iris Debug Server | -| `--iris-port` | | Specify a port number (default = `7100`) | -| `--run` | `-R` | Run simulation when debug server started | -| `--iris-allow-remote` | `-A` | Allow remote connections (if different hosts) | +You will use the following options (see `FVP_RD_Aspen --help` for the full list). Options are case-sensitive. -### Launch FVP with additional options +| Option | Alias | Notes | +|-------------------------|:-----:|-------------------------------------------------------| +| `--iris-server` | `-I` | Start the Iris debug server | +| `--iris-port ` | | Set the Iris port (default `7100`) | +| `--run` | `-R` | Run the simulation when the debug server starts | +| `--iris-allow-remote` | `-A` | Allow remote connections (only if required) | -To launch the FVP with additional options, modify the above command by adding `--` and then the options. +## Enable the Iris debug server for Arm Development Studio -For example, to launch the model with the debug server and hold at the initial reset condition: +Append `--` to pass model options through `runfvp`. +Start the model with the debug server and hold at reset: ```command kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100" ``` -To launch the model and start running (so that it can start to boot up): - +Start the model with the debug server and begin execution so that boot can progress: ```command kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100 --run" ``` -To launch the model so that remote hosts can access it (not recommended if not needed), using options aliases: - +If required, allow remote debug connections using option aliases: ```command kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- -I -A --iris-port 7100" ``` {{% notice Note %}} -It is recommended to specify the port number used even if it is the default as that must match the debug connection setting (see later). +Even when using the default, specify the Iris port explicitly so it matches your debugger connection settings. 
If you enable remote connections, ensure your firewall allows inbound access to the chosen port. {{% /notice %}} diff --git a/content/learning-paths/automotive/zenacssdebug/primarycompute.md b/content/learning-paths/automotive/zenacssdebug/primarycompute.md index ae48551864..c3c41e4ae3 100644 --- a/content/learning-paths/automotive/zenacssdebug/primarycompute.md +++ b/content/learning-paths/automotive/zenacssdebug/primarycompute.md @@ -1,14 +1,13 @@ --- # User change -title: "Debug Primary Compute and Linux" +title: "Debug primary compute and Linux" weight: 8 # 1 is first, 2 is second, etc. # Do not modify these elements layout: "learningpathall" --- - -## Debug Primary Compute +## Debug primary compute The Primary Compute application processors (`Cortex-A720AE`) are the final processors to be enabled. @@ -16,11 +15,11 @@ As before, you can connect whilst powered down and monitor the point that they a You can debug the initialization code and the final Linux Operating System (OS) threads. -### Connect debugger to target +## Connect debugger to target Use the following debugger commands in the `Primary_init.launch` to load the symbols for the `BL2` initialization code, setting a breakpoint at `bl2_entrypoint`. -Note that an address "offset" is used to specify the exception level that the image is relevant to. If the processor changes exception level, the debug information would need to also be loaded to the corresponding EL address space. +Note that an address offset is used to specify the Exception Level (EL) that the image is relevant to. If the processor changes Exception Level, the debug information would need to also be loaded to the corresponding EL address space. For example the processors start in `EL3` and move to `EL2N` when the Linux kernel is enabled. @@ -29,43 +28,52 @@ stop add-symbol-file /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-a/2.11.0+git/image/firmware/bl2.elf EL3:0x0 tbreak bl2_entrypoint ``` + {{% notice Note %}} -Exact paths may differ for your set up. +Exact paths might differ depending on your build output. {{% /notice %}} -Run the code to the `bl2_entrypoint` and you can debug as expected. +Run to **bl2_entrypoint** and step through as required. -### Debug Linux kernel modules +{{% notice Tip %}} +Symbol loading is Exception Level–aware. If execution changes Exception Level, load symbols into the corresponding EL address space. For example, the processors start in EL3 and transition to EL2N when the Linux kernel is enabled. +{{% /notice %}} -To make use of the OS awareness feature, disconnect `Primary_init` and connect to `Primary_Linux` as created previously. Load the symbols from the `vmlinux` image. +## Debug the Linux kernel with OS awareness (symmetric multiprocessing) -``` text +Switch to the `Primary_Linux.launch` connection you created earlier to enable Arm Development Studio OS awareness for the Linux kernel. Load the kernel symbols and set source mapping if your kernel sources are located outside the default paths: + +```text stop add-symbol-file /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/linux-yocto/6.6.54+git/linux-fvp_rd_aspen-standard-build/vmlinux EL2N:0x0 set substitute-path /usr/src/kernel/ /arm-auto-solutions/build/tmp_baremetal/work-shared/fvp-rd-aspen/kernel-source/ ``` + Run the FVP until the OS prompt appears. {{% notice %}} -If you are only interested in kernel debug, modify the launch command for the FVP to include `--run` to start execution immediately. 
+If you only need kernel debugging, start the model with the debug server **and** begin execution immediately by adding `--run`: -``` command +```command kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100 --run" ``` {{% /notice %}} -You can now enable the `Threads` view in the `Debug Control` pane. +## View Linux threads with OS awareness -Right-click on the connection, and select `Display Threads`. You can also do this by entering `thread` in the `Command` pane. +Enable the **Threads** view to inspect kernel threads instead of raw CPUs: -The view will then change from listing the 16 application processors to the OS threads. +In **Debug Control**, right-click the **Primary_Linux** connection and select **Display Threads** +2. Alternatively, enter `thread` in the **Command** pane. + +The view changes from listing the 16 application processors to the active OS threads. {{% notice Note %}} -A warning of the form: -``` text +You might see a warning like: +```text WARNING(ROS60): Could not enable OS support as the OS does not appear to be initialized. This might be caused by a mismatch between the loaded symbols and the code on the target or because the OS is not up and running. Enabling OS support will be re-attempted when the target next stops. ``` -may be emitted if the OS is not booted when you connect. It can safely be ignored. +This occurs if the OS has not completed boot when you connect; it is safe to ignore and will clear after the next target stop. {{% /notice %}} You have successfully learnt how to use Arm Development Studio to explore and debug the Arm Zena CSS Reference Software Stack. diff --git a/content/learning-paths/automotive/zenacssdebug/rse.md b/content/learning-paths/automotive/zenacssdebug/rse.md index 007a7a17f9..0580a209aa 100644 --- a/content/learning-paths/automotive/zenacssdebug/rse.md +++ b/content/learning-paths/automotive/zenacssdebug/rse.md @@ -8,72 +8,80 @@ weight: 6 # 1 is first, 2 is second, etc. layout: "learningpathall" --- -## Debug RSE from reset +## Overview -Let us start by debugging the initial code that executes on the Cortex-M55 within the RSE block. +You'll now move on to debug the initial code that runs on the Runtime Security Engine (RSE) based on Cortex-M55 in the Zena CSS FVP. You will launch the model with the Iris debug server, connect from Arm Development Studio, load Trusted Firmware-M (TF‑M) symbols, and step from reset. -### Launch FVP +## Launch the FVP and hold at reset -Start a new `tmux` session for the FVP (if necessary): +Start a new `tmux` session for the FVP if needed: ```command tmux new-session -s arm-auto-solutions ``` -and navigate to your code repository. - -To debug from reset, launch the FVP with the Iris server but do not run. This will hold the FVP in the initial reset condition. +Navigate to your code repository, then launch the FVP with Iris **without** running so it stays at reset: ```command kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100" ``` -The FVP will start and generate various informational messages. Once initialized you should see something similar to: +The FVP initializes and prints information messages, for example: ```output ... 
Info: RD_Aspen: RD_Aspen.css.smb.rse_flashloader: FlashLoader: Saved 64MB to file '~/arm-auto-solutions/build/tmp_baremetal/deploy/images/fvp-rd-aspen/rse-flash-image.img' Info: RD_Aspen: RD_Aspen.ros.flash_loader: FlashLoader: Saved 128MB to file '~/arm-auto-solutions/build/tmp_baremetal/deploy/images/fvp-rd-aspen/ap-flash-image.img' ``` - Note that execution has not started. -### Connect the debugger +{{% notice Tip %}} +If you need remote debugging, start the FVP with `-A` and ensure the chosen Iris port (default `7100`) is reachable through your firewall. +{{% /notice %}} -Using the `RSE` connection created in the previous section, connect the debugger to the FVP. Observe that the processor is stopped before the first instruction has been executed. +## Connect the debugger to RSE (Cortex-M55) -In fact, the FVP is configured to have the vector table (`VTOR_S`) start at `0x11000000`, and if you inspect memory at that address the vector table will be populated. However no debug information is visible. Debug information must be loaded. +Use the **RSE** model connection you created earlier to attach the debugger. The processor is stopped before the first instruction. -In the `Debug Pane`, select `Load...` from the pane menu, and select `Add Symbols file`. +The FVP configures the secure vector table (**VTOR_S**) at `0x11000000`. If you inspect memory at that address, the vector table is populated, but source is not visible until you load symbols. -Browse to the `bl1_1.axf` file which is likely at: - -``` bash -/arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/build/bin/bl1_1.axf -``` -Debug symbols will be loaded, but likely no source will be displayed. This is because the build was performed within the virtual environment but the debugger is running outside of that. +Load TF‑M symbols and map sources: -You will be prompted to enter a path substitution to locate the sources. You can refer to the lowest common path so that all subsequent source files will also be located successfully. +- In **Debug Control**, open the pane menu and choose **Load...** +- Select **Add Symbols file**. +- Choose the TF‑M image, for example: + ```bash + /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/build/bin/bl1_1.axf + ``` +When prompted for **substitute path**, map build-time paths to your local sources, for example: + ```bash + /usr/src/debug/trusted-firmware-m/2.1.0/ + /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/git/tfm/ + ``` -``` bash -/usr/src/debug/trusted-firmware-m/2.1.0/ -/arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/git/tfm/" +Step one instruction to fetch the reset handler and stop there: +```text +stepi ``` -Finally, to perform a single instruction step (`stepi`) to allow the processor to fetch the address of the `Reset_Handler` and stop there. - -You can now step through the code, set breakpoints, and inspect the target as the code proceeds. -### Automate setup +You can now step through code, set breakpoints, and inspect the target as the code proceeds. -For convenience, it is possible to automate these actions every time you connect by entering them as `Debugger Commands` in the `.launch` configuration. +{{% notice Note %}} +Paths vary by environment. Use your actual build output and source locations when adding symbols or configuring path substitution. 
+{{% /notice %}} -Open (double-click) the `.launch` file, and navigate to the `Debugger` pane. +## Automate setup with debugger commands -Enable `Execute debugger commands`, and enter the following (note pathing for your setup). You can copy the exact commands from the `Command` or `History` pane whilst performing the above GUI configuration. +Automate the connection steps by adding **Debugger Commands** to the `.launch` configuration so they run on every attach: -It is recommended to have an explicit `stop` command as symbols cannot be loaded whilst the target is running. +- Open (double-click) your **RSE.launch** file. +- Go to the **Debugger** tab. +- Enable **Execute debugger commands**. +- Add commands similar to the following (adjust paths as needed). -``` text +```text stop add-symbol-file /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/build/bin/bl1_1.axf set substitute-path /usr/src/debug/trusted-firmware-m/2.1.0/ /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/git/tfm/ stepi ``` -![Debugger pane](debugger_commands.png) + +![RSE.launch in Arm Development Studio showing Debugger pane with TF-M symbols loaded and path substitution mapping alt-text#center](debugger_commands.png "RSE Debugger pane with TF-M symbol loading and source path substitution") + diff --git a/content/learning-paths/automotive/zenacssdebug/safetyisland.md b/content/learning-paths/automotive/zenacssdebug/safetyisland.md index 951e973531..02663a4d34 100644 --- a/content/learning-paths/automotive/zenacssdebug/safetyisland.md +++ b/content/learning-paths/automotive/zenacssdebug/safetyisland.md @@ -9,19 +9,25 @@ layout: "learningpathall" --- ## Debug Safety Island code from beginning -The Safety Island (Cortex-R82AE) is released from reset by the RSE code, and so the RSE code must proceed to that point before the Safety Island core can execute. +The Safety Island subsystem based on the Cortex-R82AE is released from reset by RSE code. To debug Safety Island from first instruction, you must let the RSE (Cortex‑M55) code reach the point where it enables Safety Island on the Zena CSS FVP. -### Launch FVP +## Launch the FVP and reconnect RSE -If necessary, restart the FVP in the reset state as before, and reconnect `RSE`. +If necessary, start (or restart) the FVP held at reset and reconnect the RSE model connection in Arm Development Studio: ```command kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100" ``` -Set up the `SI` connection in a similar way as the `RSE` connection. Use the following commands in the `Debugger` pane. This will load debug symbols and perform the necessary path substitution. You can then set a breakpoint on the entry of the `SI` code, `arch_exception_reset`. +{{% notice Tip %}} +For remote debugging, add `-A` and ensure the chosen Iris port (default `7100`) is reachable. +{{% /notice %}} + +## Connect the debugger to Safety Island (Cortex-R82AE) -``` text +Configure the **SI** model connection similarly to **RSE**. 
Add the following **Debugger commands** to load symbols, set up source path substitution, and break at the Safety Island reset entry (`arch_exception_reset`): + +```text stop add-symbol-file /arm-auto-solutions/build/tmp_baremetal/deploy/images/fvp-rd-aspen/si0_ramfw.elf set substitute-path /usr/src/debug/scp-firmware/2.14.0/ /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/scp-firmware/2.14.0/git/ @@ -29,22 +35,22 @@ b arch_exception_reset ``` {{% notice Note %}} -Exact paths may differ for your set up. +Paths vary by environment. Use your actual build output and source locations when adding symbols or configuring path substitution. {{% /notice %}} -### Start execution +## Start execution to release Safety Island -Select the `RSE` connection in the `Debug Control` pane, and start execution (this will be unavailable in the `SI` connection, as that is currently powered down). +In **Debug Control**, select the **RSE** connection and start execution (run). The **SI** connection remains unavailable to run until Safety Island is powered up. -The `RSE` code will run until the point that the `SI` is enabled. This is reflected in the output log. +When RSE enables Safety Island, you will see a log message like: -``` output +```output [INF] BL2: SI CL0 post load start ``` -#### Full output log +## Full output log -The full output lof is shown here for your reference: +The full output log is shown here for your reference: ``` output Trying ::1... diff --git a/content/learning-paths/automotive/zenacssdebug/zena.md b/content/learning-paths/automotive/zenacssdebug/zena.md index fffc489bbb..a6a8edb1e9 100644 --- a/content/learning-paths/automotive/zenacssdebug/zena.md +++ b/content/learning-paths/automotive/zenacssdebug/zena.md @@ -8,66 +8,66 @@ weight: 2 # 1 is first, 2 is second, etc. layout: "learningpathall" --- -# Arm Zena Compute Subsystem +## Arm Zena Compute Subsystem -The Arm Zena Compute Subsystem (CSS) consists of a high-performance Arm Cortex-A720AE Application Processor (Primary Compute) system augmented with an Arm Cortex-R82AE based Safety Island (SI) and real-time domain to host additional system safety monitoring and real-time services. +The Arm Zena Compute Subsystem (CSS) consists of a high-performance Arm Cortex-A720AE application processor system (primary compute), augmented with an Arm Cortex-R82AE–based Safety Island (SI) and a real-time domain to host additional system-safety monitoring and real-time services. -The system additionally includes a Runtime Security Engine (RSE) used for the secure boot of the system elements and the runtime secure services. +The system also includes a Runtime Security Engine (RSE), which is used for secure boot of the system elements and to provide runtime secure services. -The Arm Zena CSS software stack provides an open-source, integrated solution running on a Fixed Virtual Platform (FVP). +The Arm Zena CSS Reference Software Stack provides an open-source, integrated solution running on a Fixed Virtual Platform (FVP). Both the reference software stack and the FVP are freely available. -The reference software stack and the FVP are freely available. +For more information, see [Arm Zena Compute Subsystem (CSS)](https://developer.arm.com/Compute%20Subsystems/Arm%20Zena%20Compute%20Subsystem). -For more information, see [Arm Zena Compute Subsystem (CSS)](https://developer.arm.com/Compute%20Subsystems/Arm%20Zena%20Compute%20Subsystem) and associated links. 
+## Build the software stack -## Build software stack +Follow the steps to download and build the software stack in the [Arm Zena CSS User Guide](https://arm-auto-solutions.docs.arm.com/en/v2.0/rd-aspen/user_guide/reproduce.html). -Follow the steps to download and build the software stack in the [User Guide](https://arm-auto-solutions.docs.arm.com/en/v2.0/rd-aspen/user_guide/reproduce.html). - -The default `Arm Automotive Solutions Demo` build is used. +Here the default **Arm Automotive Solutions Demo** build is used. {{% notice Note %}} -The focus of this Learning Path is to demonstrate the **debug** of the software stack. +The primary focus of this Learning Path is to demonstrate how to debug the software stack. {{% /notice %}} -## Verify correct build and execution +## Verify the build and execution -Once the software stack has been built, you can verify that it runs successfully with the command: +After you build the software stack, verify that it runs successfully: -``` command +```bash kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose" ``` -The system will run through the boot process until a Linux prompt is available (in `terminal_ns_uart0`). +The system runs through the boot process until a Linux prompt is available (in `terminal_ns_uart0`). -Use `Ctrl+C` on the command terminal to terminate. +Press **Ctrl+C** in the command terminal to terminate the process. ## Install FVP (optional) -The FVP is downloaded and installed as part of the build process above. +The FVP is downloaded and installed as part of the build process. -The `Arm-Zena-CSS-FVP` can also be independently downloaded from the Arm Developer [website](https://developer.arm.com/Tools%20and%20Software/Fixed%20Virtual%20Platforms/Automotive%20FVPs). +You can also separately download Arm-Zena-CSS-FVP from the Arm Developer [website](https://developer.arm.com/Tools%20and%20Software/Fixed%20Virtual%20Platforms/Automotive%20FVPs). See also the Arm Ecosystem FVPs and Architecture Envelope Models [Install Guide](/install-guides/fm_fvp/eco_fvp/). {{% notice Note %}} -For legacy reasons the FVP is named is `FVP_RD_Aspen`. +For legacy reasons, the FVP is named `FVP_RD_Aspen`. {{% /notice %}} -# Arm Development Studio +## Arm Development Studio + +Arm Development Studio is a software development environment with multicore debug support for Arm CPUs. It provides early support for the latest processors and works seamlessly with FVPs. -Arm Development Studio is a software development solution with support of multicore debug for Arm CPUs. It provides the earliest support for the latest processors. +The CPUs implemented within Arm Zena CSS are supported by Arm Development Studio 2024.0 and later; however, 2024.1 or later is recommended for Linux OS debug support. At the time of writing, the latest version is 2025.0, which is used for this Learning Path. -The CPUs implemented within Arm Zena CSS are supported by Arm Development Studio 2024.0 and later, though 2024.1 or later is recommended for appropriate Linux OS support. At time of writing the latest version available is 2025.0, and that is the version used for this learning path. +For more information, see [Arm Development Studio](https://developer.arm.com/Tools%20and%20Software/Arm%20Development%20Studio). -For more information see [Arm Development Studio](https://developer.arm.com/Tools%20and%20Software/Arm%20Development%20Studio). +Arm Development Studio is a commercial, license-managed product. 
For installation and setup instructions, see the [Arm Development Studio Install Guide](/install-guides/armds/). -Arm Development Studio is a commercial, license managed, product. For installation and set up instructions, see this [Install Guide](/install-guides/armds/). +Launch the IDE and create a new workspace folder. -Launch the IDE. It is recommended to create a new workspace folder. +If you’re prompted by the launcher (this prompt is disabled by default), create a new folder there. -If prompted by the launcher (this is disabled by default) create a new folder there, else select `File` > `Switch Workspace` > `Other...`. +Otherwise, select **File** > **Switch Workspace** > **Other**. {{% notice Note %}} -To enable this prompt by default, navigate to `Window` > `Preferences` > `General` > `Startup and Shutdown` > `Workspaces`, and enable `Prompt for workspace on startup`. +To enable the workspace prompt, go to **Window** > **Preferences** > **General** > **Startup and Shutdown** > **Workspaces**, and enable **Prompt for workspace on startup**. {{% /notice %}} diff --git a/content/learning-paths/cross-platform/intrinsics/_index.md b/content/learning-paths/cross-platform/intrinsics/_index.md index 9f286a16ca..d6eea54443 100644 --- a/content/learning-paths/cross-platform/intrinsics/_index.md +++ b/content/learning-paths/cross-platform/intrinsics/_index.md @@ -23,8 +23,7 @@ author: Jason Andrews test_images: - amd64/ubuntu:latest - arm64v8/ubuntu:latest -test_link: https://github.com/armflorentlebeau/arm-learning-paths/actions/runs/4312122327 -test_maintenance: true +test_maintenance: false ### Tags skilllevels: Advanced diff --git a/content/learning-paths/cross-platform/topdown-compare/1-top-down.md b/content/learning-paths/cross-platform/topdown-compare/1-top-down.md new file mode 100644 index 0000000000..de65d5cd6f --- /dev/null +++ b/content/learning-paths/cross-platform/topdown-compare/1-top-down.md @@ -0,0 +1,197 @@ +--- +title: Top-down performance analysis +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## What are the differences between Arm and x86 PMU counters? + +This is a common question from software developers and performance engineers. + +Both Arm and x86 CPUs provide sophisticated Performance Monitoring Units (PMUs) with hundreds of hardware counters. Instead of trying to list all available counters and compare microarchitecture, it makes more sense to focus on the performance methodologies they enable and the calculations used for performance metrics. + +While the specific counter names and formulas differ between architectures, both have converged on top-down performance analysis methodologies that categorize performance bottlenecks into four buckets: Retiring, Bad Speculation, Frontend Bound, and Backend Bound. + +This Learning Path provides a comparison of how Arm and x86 processors implement top-down +analysis, highlighting the similarities in approach while explaining the architectural differences in counter events and formulas. + +## Introduction to top-down performance analysis + +Top-down methodology makes performance analysis easier by shifting focus from individual performance +counters to pipeline slot utilization. Instead of trying to interpret dozens of seemingly unrelated metrics, you can systematically identify bottlenecks by attributing each CPU pipeline slot to one of four categories. 
+ +- Retiring: pipeline slots that successfully complete useful work +- Bad Speculation: slots wasted on mispredicted branches +- Frontend Bound: slots stalled due to instruction fetch/decode limitations +- Backend Bound: slots stalled due to execution resource constraints + +The methodology uses a hierarchical approach that allows you to drill down only into the dominant bottleneck category, and avoid the complexity of analyzing all possible performance issues at the same time. + +The next sections compare the Intel x86 methodology with the Arm top-down methodology. AMD also has an equivalent top-down methodology which is similar to Intel, but uses different counters and calculations. + +## Intel x86 top-down methodology + +Intel uses a slot-based accounting model where each CPU cycle provides multiple issue slots. A slot is a hardware resource needed to process operations. More slots means more work can be done. The number of slots depends on the design but current processor designs have 4, 6, or 8 slots. + +### Hierarchical Structure + +Intel uses a multi-level hierarchy that typically extends to 4 levels of detail. + +**Level 1 (Top-Level):** + +At Level 1, all pipeline slots are attributed to one of four categories, providing a high-level view of whether the CPU is doing useful work or stalling. + +- Retiring = `UOPS_RETIRED.RETIRE_SLOTS / SLOTS` +- Bad Speculation = `(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + N * RECOVERY_CYCLES) / SLOTS` +- Frontend Bound = `IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS` +- Backend Bound = `1 - (Frontend + Bad Spec + Retiring)` + +Where `SLOTS = 4 * CPU_CLK_UNHALTED.THREAD` on most Intel cores. + +**Level 2 breakdown:** + +Level 2 drills into each of these to identify broader causes, such as distinguishing between frontend latency and bandwidth limits, or between memory and core execution stalls in the backend. + +- Frontend Bound covers frontend latency vs. frontend bandwidth +- Backend Bound covers memory bound vs. core bound +- Bad Speculation covers branch mispredicts vs. machine clears +- Retiring covers base vs. microcode sequencer + +**Level 3 breakdown:** + +Level 3 provides fine-grained attribution, pinpointing specific bottlenecks like DRAM latency, cache misses, or port contention, which makes it possible to identify the exact root cause and apply targeted optimizations. + +- Memory Bound includes L1 Bound, L2 Bound, L3 Bound, DRAM Bound, Store Bound +- Core Bound includes Divider, Ports Utilization +- And many more specific categories + +**Level 4 breakdown:** + +Level 4 provides the specific microarchitecture events that cause the inefficiencies. 
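+
+As an illustration of how the Level 1 formulas fit together, the short C program below computes the four categories from a set of raw counter values. The counter numbers are hypothetical placeholders (roughly the magnitudes you might read back from `perf stat` for a backend-bound program), not measurements; only the formulas come from the description above, with `N = 4` used as the issue width in the Bad Speculation term.
+
+```C
+#include <stdio.h>
+
+int main(void) {
+    /* Hypothetical raw counter readings for one measurement interval */
+    double uops_retired_slots = 3.275e9;   /* UOPS_RETIRED.RETIRE_SLOTS   */
+    double uops_issued_any    = 3.279e9;   /* UOPS_ISSUED.ANY             */
+    double idq_uops_not_deliv = 2.8e7;     /* IDQ_UOPS_NOT_DELIVERED.CORE */
+    double recovery_cycles    = 1.0e6;     /* INT_MISC.RECOVERY_CYCLES    */
+    double cpu_clk_unhalted   = 1.92e10;   /* CPU_CLK_UNHALTED.THREAD     */
+
+    /* SLOTS = 4 * CPU_CLK_UNHALTED.THREAD on a 4-wide core */
+    double slots = 4.0 * cpu_clk_unhalted;
+
+    double retiring        = uops_retired_slots / slots;
+    double bad_speculation = (uops_issued_any - uops_retired_slots
+                              + 4.0 * recovery_cycles) / slots;
+    double frontend_bound  = idq_uops_not_deliv / slots;
+    double backend_bound   = 1.0 - (retiring + bad_speculation + frontend_bound);
+
+    printf("Retiring:        %5.1f%%\n", 100.0 * retiring);
+    printf("Bad speculation: %5.1f%%\n", 100.0 * bad_speculation);
+    printf("Frontend bound:  %5.1f%%\n", 100.0 * frontend_bound);
+    printf("Backend bound:   %5.1f%%\n", 100.0 * backend_bound);
+    return 0;
+}
+```
+
+With these placeholder values the program reports a workload that is almost entirely backend bound, which is the pattern you will reproduce with `perf stat` later in this Learning Path.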
+ +### Key Performance Events + +Intel processors expose hundreds of performance events, but top-down analysis relies on a core set: + +| Event Name | Purpose | +| :---------------------------------------------- | :----------------------------------------------------------------------------------- | +| `UOPS_RETIRED.RETIRE_SLOTS` | Count retired micro-operations (Retiring) | +| `UOPS_ISSUED.ANY` | Count issued micro-operations (helps quantify Bad Speculation) | +| `IDQ_UOPS_NOT_DELIVERED.CORE` | Frontend delivery failures (Frontend Bound) | +| `CPU_CLK_UNHALTED.THREAD` | Core clock cycles (baseline for normalization) | +| `BR_MISP_RETIRED.ALL_BRANCHES` | Branch mispredictions (Bad Speculation detail) | +| `MACHINE_CLEARS.COUNT` | Pipeline clears due to memory ordering or faults (Bad Speculation detail) | +| `CYCLE_ACTIVITY.STALLS_TOTAL` | Total stall cycles (baseline for backend breakdown) | +| `CYCLE_ACTIVITY.STALLS_MEM_ANY` | Aggregate stalls from memory hierarchy misses (Backend → Memory Bound) | +| `CYCLE_ACTIVITY.STALLS_L1D_MISS` | Stalls due to L1 data cache misses | +| `CYCLE_ACTIVITY.STALLS_L2_MISS` | Stalls waiting on L2 cache misses | +| `CYCLE_ACTIVITY.STALLS_L3_MISS` | Stalls waiting on last-level cache misses | +| `MEM_LOAD_RETIRED.L1_HIT` / `L2_HIT` / `L3_HIT` | Track where loads are satisfied in the cache hierarchy | +| `MEM_LOAD_RETIRED.L3_MISS` | Loads missing LLC and going to memory | +| `MEM_LOAD_RETIRED.DRAM_HIT` | Loads serviced by DRAM (DRAM Bound detail) | +| `OFFCORE_RESPONSE.*` | Detailed classification of off-core responses (L3 vs. DRAM, local vs. remote socket) | + + +Using the above levels of metrics you can find out which of the 4 top-level categories are causing bottlenecks. + +### Arm top-down methodology + +Arm developed a similar top-down methodology for Neoverse server cores. The Arm architecture uses an 8-slot rename unit for pipeline bandwidth accounting. + +### Two-Stage Approach + +Unlike Intel's hierarchical model, Arm employs a two-stage methodology: + +**Stage 1: Topdown analysis** + +- Identifies high-level bottlenecks using the same four categories +- Uses Arm-specific PMU events and formulas +- Slot-based accounting similar to Intel but with Arm event names + +**Stage 2: Micro-architecture exploration** + +- Resource-specific effectiveness metrics grouped by CPU component +- Industry-standard metrics like MPKI (Misses Per Kilo Instructions) +- Detailed breakdown without strict hierarchical drilling + +### Stage 1 formulas + +Arm uses different top-down metrics based on different events but the concept is similar. 
+ +| Metric | Formula | Purpose | +| :-- | :-- | :-- | +| Backend bound | `100 * (STALL_SLOT_BACKEND / (CPU_CYCLES * 8))` | Backend resource constraints | +| Frontend bound | `100 * ((STALL_SLOT_FRONTEND / (CPU_CYCLES * 8)) - (BR_MIS_PRED / (4 * CPU_CYCLES)))` | Frontend delivery issues | +| Bad speculation | `100 * (1 - (OP_RETIRED/OP_SPEC)) * (1 - (STALL_SLOT/(CPU_CYCLES * 8))) + (BR_MIS_PRED / (4 * CPU_CYCLES))` | Misprediction recovery | +| Retiring | `100 * (OP_RETIRED/OP_SPEC) * (1 - (STALL_SLOT/(CPU_CYCLES * 8)))` | Useful work completed | + +### Stage 2 resource groups + +Instead of hierarchical levels, Arm organizes detailed metrics into effectiveness groups as shown below: + +- Branch Effectiveness: Misprediction rates, MPKI +- ITLB/DTLB Effectiveness: Translation lookaside buffer efficiency +- L1I/L1D/L2/LL Cache Effectiveness: Cache hit ratios and MPKI +- Operation Mix: Breakdown of instruction types (SIMD, integer, load/store) +- Cycle Accounting: Frontend vs. backend stall percentages + +### Key performance events + +Neoverse cores expose approximately 100 hardware events optimized for server workloads, including: + +| Event Name | Purpose / Usage | +| :-------------------- | :--------------------------------------------------------------------------------------- | +| `CPU_CYCLES` | Core clock cycles (baseline for normalization). | +| `OP_SPEC` | Speculatively executed micro-operations (used as slot denominator). | +| `OP_RETIRED` | Retired micro-operations (used to measure useful work). | +| `INST_RETIRED` | Instructions retired (architectural measure; used for IPC, MPKI normalization). | +| `INST_SPEC` | Instructions speculatively executed (needed for operation mix and speculation analysis). | +| `STALL_SLOT` | Total stall slots (foundation for efficiency metrics). | +| `STALL_SLOT_FRONTEND` | Stall slots due to frontend resource constraints. | +| `STALL_SLOT_BACKEND` | Stall slots due to backend resource constraints. | +| `BR_RETIRED` | Branches retired (baseline for branch misprediction ratio). | +| `BR_MIS_PRED_RETIRED` | Mispredicted branches retired (branch effectiveness, speculation waste). | +| `L1I_CACHE_REFILL` | Instruction cache refills (frontend stalls due to I-cache misses). | +| `ITLB_WALK` | Instruction TLB walks (frontend stalls due to translation). | +| `L1D_CACHE_REFILL` | Data cache refills (backend stalls due to L1D misses). | +| `L2D_CACHE_REFILL` | Unified L2 cache refills (backend stalls from L2 misses). | +| `LL_CACHE_MISS_RD` | Last-level/system cache read misses (backend stalls from LLC/memory). | +| `DTLB_WALK` | Data TLB walks (backend stalls due to translation). | +| `MEM_ACCESS` | Total memory accesses (baseline for cache/TLB effectiveness ratios). | + + +## Arm compared to x86 + +### Conceptual similarities + +Both architectures adhere to the same fundamental top-down performance analysis philosophy: + +1. Four-category classification: Retiring, Bad Speculation, Frontend Bound, Backend Bound +2. Slot-based accounting: Pipeline utilization measured in issue or rename slots +3. Hierarchical analysis: Broad classification followed by drill-down into dominant bottlenecks +4. 
Resource attribution: Map performance issues to specific CPU micro-architectural components + +### Key differences + +| Aspect | x86 Intel | Arm Neoverse | +| :-- | :-- | :-- | +| Hierarchy Model | Multi-level tree (Level 1 → Level 2 → Level 3+) | Two-stage: Topdown Level 1 + Resource Groups | +| Slot Width | 4 issue slots per cycle (typical) | 8 rename slots per cycle (Neoverse V1) | +| Formula Basis | Micro-operation (uop) centric | Operation and cycle centric | +| Event Naming | Intel-specific mnemonics | Arm-specific mnemonics | +| Drill-down Strategy | Strict hierarchical descent | Exploration by resource groups | + +### Event mapping examples + +| Performance Question | x86 Intel Events | Arm Neoverse Events | +| :-- | :-- | :-- | +| Frontend bound? | `IDQ_UOPS_NOT_DELIVERED.*` | `STALL_SLOT_FRONTEND` | +| Bad speculation? | `BR_MISP_RETIRED.*` | `BR_MIS_PRED_RETIRED` | +| Memory bound? | `CYCLE_ACTIVITY.STALLS_L3_MISS` | `L1D_CACHE_REFILL`, `L2D_CACHE_REFILL` | +| Cache effectiveness? | `MEM_LOAD_RETIRED.L3_MISS_PS` | Cache refill metrics / Cache access metrics | + +While it doesn't make sense to directly compare PMU counter values between the Arm and x86 architectures, understanding the top-down methodology for each lets you perform effective performance analysis and compare how your code behaves on each architecture. + +Continue to the next step to try a code example. \ No newline at end of file diff --git a/content/learning-paths/cross-platform/topdown-compare/2-code-examples.md b/content/learning-paths/cross-platform/topdown-compare/2-code-examples.md new file mode 100644 index 0000000000..1050cceb5b --- /dev/null +++ b/content/learning-paths/cross-platform/topdown-compare/2-code-examples.md @@ -0,0 +1,268 @@ +--- +title: Performance analysis code example +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Example code + +To compare top-down analysis on Arm and x86, you can run a small example and gain some practical experience. + +You can prepare the application and test it on both x86 and Arm Linux systems. You will need a C compiler, either [GCC](/install-guides/gcc/native/) or Clang, and [Perf](/install-guides/perf/) installed on each system. Refer to the package manager for your Linux distribution for installation information. + +Use a text editor to copy the code below to a file named `test.c`: + +```C +#include <stdio.h> +#include <stdlib.h> + +int main(int argc, char *argv[]) { + if (argc != 2) { + fprintf(stderr, "Usage: %s <num_iterations>\n", argv[0]); + return 1; + } + + long long num_iterations = atoll(argv[1]); + if (num_iterations <= 0) { + fprintf(stderr, "Number of iterations must be a positive integer.\n"); + return 1; + } + + // Using volatile tells the compiler not to optimize this variable away. + // We initialize it to a non-trivial value. + volatile double result = 1.23456789; + + printf("Performing %lld dependent floating-point divisions...\n", num_iterations); + + // This loop creates a long dependency chain of floating-point divisions. + // Division is a high-latency operation. The dependency between iterations + // means the CPU backend will be stalled waiting for the result of the + // previous division before it can start the next one. This creates a + // classic backend-bound scenario, specifically core-bound. + for (long long i = 0; i < num_iterations; ++i) { + result /= 1.00000001; + } + + printf("Done. Final result: %f\n", (double)result); + + return 0; +} +``` + +This program takes a single command-line argument specifying the number of iterations to run.
It performs that many sequential floating-point divisions in a loop, using a volatile variable to prevent compiler optimization, and prints the final result. + +It's a contrived example used to create a dependency chain of high-latency operations (divisions), simulating a CPU-bound workload where each iteration must wait for the previous one to finish. + +Build the application using GCC: + +```console +gcc -O3 -march=native -o test test.c +``` + +You can also use Clang by substituting `clang` instead of `gcc` in the command above. + +Run the application and pin it to one core to make the numbers more consistent: + +```console +taskset -c 1 ./test 1000000000 +``` + +The output is similar to: + +```output +Performing 1000000000 dependent floating-point divisions... +Done. Final result: 0.000056 +``` + +## Collect x86 top-down level 1 metrics + +Linux Perf computes the top-down level 1 breakdown described in the previous section: Retiring, Bad Speculation, Frontend Bound, and Backend Bound. + +Use `perf stat` on the pinned core to collect the metrics: + +```console +taskset -c 1 perf stat -C 1 --topdown ./test 1000000000 +``` + +The output is similar to: + +```output +Performing 1000000000 dependent floating-point divisions... +Done. Final result: 0.000056 + + Performance counter stats for 'CPU(s) 1': + + retiring bad speculation frontend bound backend bound +S0-D0-C1 1 8.5% 0.0% 0.1% 91.4% + + 6.052117775 seconds time elapsed +``` + +You see a very large `backend bound` component for this program. + +You can also run with the `-M topdownl1` argument on Perf. + +```console +taskset -c 1 perf stat -C 1 -M topdownl1 ./test 1000000000 +``` + +The output is similar to: + +```output +Performing 1000000000 dependent floating-point divisions... +Done. Final result: 0.000056 + + Performance counter stats for 'CPU(s) 1': + + 3,278,902,619 uops_issued.any # 0.00 Bad_Speculation (14.30%) + 19,185,808,092 cpu_clk_unhalted.thread # 0.04 Retiring (14.30%) + 3,275,536,897 uops_retired.retire_slots (14.30%) + 1,065,517 int_misc.recovery_cycles (14.30%) + 3,263,874,383 uops_issued.any # 0.96 Backend_Bound (14.33%) + 28,107,558 idq_uops_not_delivered.core (28.64%) + 631,768 int_misc.recovery_cycles (42.90%) + 19,173,526,414 cpu_clk_unhalted.thread (57.17%) + 19,176,373,078 cpu_clk_unhalted.thread # 0.00 Frontend_Bound (42.79%) + 25,090,380 idq_uops_not_delivered.core (42.79%) + cpu_clk_unhalted.thread + + 6.029283206 seconds time elapsed +``` + +Again, the `Backend_Bound` value is very high (0.96). + +If you want to learn more, you can continue with the level 2 and level 3 analysis. + + +## Use the Arm top-down methodology + +Make sure you have the Arm top-down tool installed. + +Use the [Telemetry Solution install guide](/install-guides/topdown-tool/) for information about installing `topdown-tool`. + +Collect instructions per cycle (IPC): + +```console +taskset -c 1 topdown-tool -m General ./test 1000000000 +``` + +The output is similar to: + +```output +Performing 1000000000 dependent floating-point divisions... +Done. Final result: 0.000056 +Stage 2 (uarch metrics) +======================= +[General] +Instructions Per Cycle 0.355 per cycle +``` + +Collect the stage 1 metrics: + +```console +taskset -c 1 topdown-tool -m Cycle_Accounting ./test 1000000000 +``` + +The output is similar to: + +```output +Performing 1000000000 dependent floating-point divisions... +Done.
Final result: 0.000056 +Stage 1 (Topdown metrics) +========================= +[Cycle Accounting] +Frontend Stalled Cycles 0.04% cycles +Backend Stalled Cycles. 88.15% cycles +``` + +This confirms that the example has a high proportion of backend stalls, just as on x86. + +You can continue to use `topdown-tool` for additional microarchitecture exploration. + +For L1 data cache: + +```console +taskset -c 1 topdown-tool -m L1D_Cache_Effectiveness ./test 1000000000 +``` + +The output is similar to: + +```output +Performing 1000000000 dependent floating-point divisions... +Done. Final result: 0.000056 +Stage 2 (uarch metrics) +======================= +[L1 Data Cache Effectiveness] +L1D Cache MPKI............... 0.023 misses per 1,000 instructions +L1D Cache Miss Ratio......... 0.000 per cache access +``` + +To examine the L1 instruction cache, run the same command with the `L1I_Cache_Effectiveness` metric group. + +For last level cache: + +```console +taskset -c 1 topdown-tool -m LL_Cache_Effectiveness ./test 1000000000 +``` + +The output is similar to: + +```output +Performing 1000000000 dependent floating-point divisions... +Done. Final result: 0.000056 +Stage 2 (uarch metrics) +======================= +[Last Level Cache Effectiveness] +LL Cache Read MPKI.............. 0.017 misses per 1,000 instructions +LL Cache Read Miss Ratio........ 0.802 per cache access +LL Cache Read Hit Ratio......... 0.198 per cache access +``` + +For operation mix: + +```console +taskset -c 1 topdown-tool -m Operation_Mix ./test 1000000000 +``` + +The output is similar to: + +```output +Performing 1000000000 dependent floating-point divisions... +Done. Final result: 0.000056 +Stage 2 (uarch metrics) +======================= +[Speculative Operation Mix] +Load Operations Percentage.......... 16.70% operations +Store Operations Percentage......... 16.59% operations +Integer Operations Percentage....... 33.61% operations +Advanced SIMD Operations Percentage. 0.00% operations +Floating Point Operations Percentage 16.45% operations +Branch Operations Percentage........ 16.65% operations +Crypto Operations Percentage........ 0.00% operations +``` + + +## Summary + +Both Arm Neoverse and modern x86 cores expose hardware events that Perf aggregates into the same top-down categories. The names of the PMU counters differ, but the level 1 categories are the same. + +If you work on both architectures, you can use the same framework to systematically identify and resolve performance bottlenecks, with only minor differences between Intel's hierarchical structure and Arm's two-stage resource groups. + diff --git a/content/learning-paths/cross-platform/topdown-compare/_index.md b/content/learning-paths/cross-platform/topdown-compare/_index.md new file mode 100644 index 0000000000..275149ff08 --- /dev/null +++ b/content/learning-paths/cross-platform/topdown-compare/_index.md @@ -0,0 +1,59 @@ +--- +title: "Compare Arm and x86 Top-Down Performance Analysis" + +minutes_to_complete: 30 + +draft: true +cascade: + draft: true + +who_is_this_for: This is an advanced topic for software developers who want to understand the similarities and differences between Arm and x86 top-down performance analysis.
+ +learning_objectives: + - Describe the similarities and differences between top-down performance analysis on x86 and Arm Linux systems. + - Run applications on both architectures and understand how performance analysis is done on each system. + +prerequisites: + - Familiarity with performance analysis on Linux systems using Perf. + - Arm and x86 Linux systems to try code examples. + +author: + - Jason Andrews + +### Tags +skilllevels: Advanced +subjects: Performance and Architecture +armips: + - Neoverse +operatingsystems: + - Linux +tools_software_languages: + - GCC + - Clang + +shared_path: true +shared_between: + - servers-and-cloud-computing + - automotive + +further_reading: + - resource: + title: Arm Neoverse V1 Top-down Methodology for Performance Analysis & Telemetry Specification + link: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/arm-neoverse-v1-top-down-methodology + type: blog + - resource: + title: Performance Analysis and Tuning on Modern CPUs + link: https://www.amazon.com/Performance-Analysis-Tuning-Modern-CPUs/dp/B0DNQZJ92S + type: documentation + - resource: + title: How to use the Arm Performance Monitoring Unit and System Counter + link: https://learn.arm.com/learning-paths/servers-and-cloud-computing/arm_pmu/ + type: website + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/cross-platform/topdown-compare/_next-steps.md b/content/learning-paths/cross-platform/topdown-compare/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/cross-platform/topdown-compare/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/cross-platform/vectorization-comparison/1-vectorization.md b/content/learning-paths/cross-platform/vectorization-comparison/1-vectorization.md index d17480fcbc..b83ed43fbc 100644 --- a/content/learning-paths/cross-platform/vectorization-comparison/1-vectorization.md +++ b/content/learning-paths/cross-platform/vectorization-comparison/1-vectorization.md @@ -1,28 +1,30 @@ --- -title: Migrating SIMD code to the Arm architecture +title: "Migrate SIMD code to the Arm architecture" weight: 3 -### FIXED, DO NOT MODIFY -layout: learningpathall +# FIXED, DO NOT MODIFY +layout: "learningpathall" --- -## Vectorization on x86 vs. Arm +## Vectorization on x86 and Arm -Migrating SIMD (Single Instruction, Multiple Data) code from x86 extensions to Arm extensions is an important task for software developers aiming to optimize performance on Arm platforms.
+Migrating SIMD (Single Instruction, Multiple Data) code from x86 extensions to Arm extensions is a key task for software developers aiming to optimize performance on Arm platforms. -Understanding the mapping between x86 instruction sets like SSE, AVX, and AMX to Arm's NEON, SVE, and SME extensions is essential for ensuring portability and high performance. This Learning Path provides an overview to help you design a migration plan, leveraging Arm features such as scalable vector lengths and advanced matrix operations, to effectively adapt your code. +Understanding the mapping from x86 instruction sets such as SSE, AVX, and AMX to Arm’s NEON, SVE, and SME extensions is essential for achieving portability and high performance. This Learning Path provides an overview to help you design a migration plan in which you can leverage Arm features such as scalable vector lengths and advanced matrix operations to adapt your code effectively. -Vectorization is a key optimization strategy where one instruction processes multiple data elements simultaneously. It drives performance in HPC, AI/ML, signal processing, and data analytics. +Vectorization is a key optimization strategy where one instruction processes multiple data elements simultaneously. It drives performance in High-Performance Computing (HPC), AI and ML, signal processing, and data analytics. -Both x86 and Arm processors offer rich SIMD capabilities, but they differ in philosophy and design. The x86 architecture provides fixed-width vector units of 128, 256, and 512 bits. The Arm architecture offers a mix of fixed-width, for NEON, and scalable vectors for SVE and SME ranging from 128 to 2048 bits. +Both x86 and Arm processors offer rich SIMD capabilities, but they differ in philosophy and design. The x86 architecture provides fixed-width vector units of 128, 256, and 512 bits. The Arm architecture offers fixed-width vectors for NEON and scalable vectors for SVE and SME, ranging from 128 to 2048 bits. -If you are interested in migrating SIMD software to Arm, understanding these differences ensures portable, high-performance code. +If you are migrating SIMD software to Arm, understanding these differences will help you write portable, high-performance code. ## Arm vector and matrix extensions +This section provides some more information about the Arm vector and matrix extensions and shows you when to use each, how they map from SSE/AVX/AMX, and what changes in your programming model (predication, gather/scatter, tiles, streaming mode). + ### NEON -NEON is a 128-bit SIMD extension available across all Armv8 cores, including both mobile and Neoverse platforms. It is particularly well-suited for multimedia processing, digital signal processing (DSP), and packet processing workloads. Conceptually, NEON is equivalent to x86 SSE or AVX-128, making it the primary target for migrating SSE workloads. Compiler support for auto-vectorization to NEON is mature, simplifying the migration process for developers. +NEON is a 128-bit SIMD extension available across Armv8-A cores, including Neoverse and mobile. It is well suited to multimedia, DSP, and packet processing. Conceptually, NEON is closest to x86 SSE and AVX used in 128-bit mode, making it the primary target when migrating many SSE workloads. Compiler auto-vectorization to NEON is mature, reducing the need for manual intrinsics. 
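Where intrinsics are still needed, many SSE operations map one-to-one onto NEON. The sketch below is a minimal illustration of that mapping for a multiply-accumulate loop; the function name is only illustrative, and the SSE intrinsics mentioned in the comments are there for comparison:

```c
#include <arm_neon.h>

/* Compute y[i] += a * x[i], four floats per iteration.
 * An SSE version of the same loop would use _mm_loadu_ps, _mm_mul_ps and
 * _mm_add_ps (or _mm_fmadd_ps with FMA), and _mm_storeu_ps. */
void axpy_f32(float a, const float *x, float *y, int n) {
    int i = 0;
    for (; i + 4 <= n; i += 4) {
        float32x4_t vx = vld1q_f32(x + i);   /* like _mm_loadu_ps */
        float32x4_t vy = vld1q_f32(y + i);
        vy = vfmaq_n_f32(vy, vx, a);         /* fused multiply-add with a scalar broadcast */
        vst1q_f32(y + i, vy);                /* like _mm_storeu_ps */
    }
    for (; i < n; ++i) {                     /* scalar tail for any leftover elements */
        y[i] += a * x[i];
    }
}
```

Header libraries such as sse2neon can perform this kind of mapping automatically for existing SSE code; they are covered in the migration tools section later in this Learning Path.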
### Scalable Vector Extension (SVE) @@ -30,116 +32,100 @@ SVE introduces a revolutionary approach to SIMD with its vector-length agnostic ### Scalable Matrix Extension (SME) -SME is designed to accelerate matrix multiplication and is similar to AMX. Unlike AMX, which relies on dot-product-based operations, SME employs outer-product-based operations, providing greater flexibility for custom AI and HPC kernels. SME integrates seamlessly with SVE, utilizing scalable tiles and a streaming mode to optimize performance. It is particularly well-suited for AI training and inference workloads, as well as dense linear algebra in HPC applications. +SME accelerates matrix multiplication and is similar in intent to AMX. Unlike AMX, which often uses dot-product oriented operations, SME employs outer-product oriented operations. SME integrates with SVE, using scalable tiles and a streaming mode to optimize performance. It is well suited to AI training and inference, as well as dense linear algebra in HPC applications. ## x86 vector and matrix extensions +Here is a brief overview of the x86 families you’ll likely port from: SSE (128-bit), AVX/AVX-512 (256/512-bit with masking), and AMX (tile-based matrix compute). Use this to identify feature equivalents before mapping kernels to NEON, SVE/SVE2, or SME on Arm. + ### Streaming SIMD Extensions (SSE) The SSE instruction set provides 128-bit XMM registers and supports both integer and floating-point SIMD operations. Despite being an older technology, SSE remains a baseline for many libraries due to its widespread adoption. -However, its fixed-width design and limited throughput make it less competitive compared to more modern extensions like AVX. When migrating code from SSE to Arm, developers will find that SSE maps well to Arm NEON, enabling a relatively straightforward transition. +However, its fixed-width design can constrain throughput compared with newer extensions like AVX. When migrating code from SSE to Arm, developers will find that SSE maps well to Arm NEON, enabling a relatively straightforward transition. ### Advanced Vector Extensions (AVX) -The AVX extensions introduce 256-bit YMM registers with AVX and 512-bit ZMM registers with AVX-512, offering significant performance improvements over SSE. Key features include Fused Multiply-Add (FMA) operations, masked operations in AVX-512, and VEX/EVEX encodings that allow for more operands and flexibility. - -Migrating AVX code to Arm requires careful consideration, as AVX maps to NEON for up to 128-bit operations or to SVE for scalable-width operations. Since SVE is vector-length agnostic, porting AVX code often involves refactoring to accommodate this new paradigm. +AVX provides 256-bit YMM registers, and AVX-512 adds 512-bit ZMM registers. Features include FMA, per-lane masking in AVX-512, and VEX or EVEX encodings. When moving AVX workloads to Arm, 128-bit paths often translate to NEON, while algorithms that scale with vector width are good candidates for SVE. Because SVE is vector-length agnostic, refactor for predication and scalable loops to maintain portability and performance. ### Advanced Matrix Extensions (AMX) -AMX is a specialized instruction set designed for accelerating matrix operations using dedicated matrix-tile registers, effectively treating 2D arrays as first-class citizens. It is particularly well-suited for AI workloads, such as convolutions and General Matrix Multiplications (GEMMs). 
- -When migrating AMX workloads to Arm, you can leverage Arm SME, which conceptually aligns with AMX but employs a different programming model based on outer products rather than dot products. This difference requires you to adapt their code to fully exploit SME's capabilities. +AMX accelerates matrix operations with tile registers configured using a tile palette. It suits AI workloads such as GEMM and convolutions. When migrating AMX kernels to Arm, target SME. While both target matrix compute, AMX commonly expresses dot products, while SME focuses on outer products, so porting often entails algorithmic adjustments. ## Comparison tables -## SSE vs. NEON - -| Feature | SSE | NEON | -|-----------------------|---------------------------------------------------------------|----------------------------------------------------------------| -| **Register width** | 128-bit (XMM registers) | 128-bit (Q registers) | -| **Vector length model**| Fixed 128 bits | Fixed 128 bits | -| **Predication / masking**| Minimal predication; SSE lacks full mask registers | Conditional select instructions; no hardware mask registers | -| **Gather / Scatter** | No native gather/scatter (introduced in AVX2 and later) | No native gather/scatter; requires software emulation | -| **Instruction set scope**| Arithmetic, logical, shuffle, blend, conversion, basic SIMD | Arithmetic, logical, shuffle, saturating ops, multimedia, crypto extensions (AES, SHA)| -| **Floating-point support**| Single and double precision floating-point SIMD operations | Single and double precision floating-point SIMD operations | -| **Typical applications**| Legacy SIMD workloads; general-purpose vector arithmetic | Multimedia processing, DSP, cryptography, embedded compute | -| **Extensibility** | Extended by AVX/AVX2/AVX-512 for wider vectors and advanced features| NEON fixed at 128-bit vectors; ARM SVE offers scalable vectors but is separate | -| **Programming model** | Intrinsics supported in C/C++; assembly used for optimization | Intrinsics widely used; inline assembly less common | - - -## AVX vs. SVE (SVE2) - -| Feature | x86: AVX / AVX-512 | ARM: SVE / SVE2 | -|-----------------------|---------------------------------------------------------|---------------------------------------------------------------| -| **Register width** | Fixed: 256-bit (YMM), 512-bit (ZMM) | Scalable: 128 to 2048 bits (in multiples of 128 bits) | -| **Vector length model**| Fixed vector length; requires multiple code paths or compiler dispatch for different widths | Vector-length agnostic; same binary runs on any hardware vector width | -| **Predication / masking**| Mask registers for per-element operations (AVX-512) | Rich predication with per-element predicate registers | -| **Gather/Scatter** | Native gather/scatter support (AVX2 and AVX-512) | Native gather/scatter with efficient implementation across vector widths | -| **Key operations** | Wide SIMD, fused multiply-add (FMA), conflict detection, advanced masking | Wide SIMD, fused multiply-add (FMA), predicated operations, gather/scatter, reduction operations, bit manipulation | -| **Best suited for** | HPC, AI workloads, scientific computing, data analytics | HPC, AI, scientific compute, cloud and scalable workloads | -| **Limitations** | Power and thermal throttling on heavy 512-bit usage; complex software ecosystem | Requires vector-length agnostic programming style; ecosystem and hardware adoption still maturing | +Use these side-by-side tables to pick the right Arm target and plan refactors. 
They compare register width, predication/masking, gather/scatter, key operations, typical workloads, and limitations for SSE ↔ NEON, AVX/AVX-512 ↔ SVE/SVE2, and AMX ↔ SME. -## AMX vs. SME +### A comparison of SSE and NEON -| Feature | x86: AMX | ARM: SME | -|-----------------------|---------------------------------------------------------|------------------------------------------------------------| -| **Register width** | Tile registers with fixed dimensions: 16×16 for BF16, 64×16 for INT8 (about 1 KB total) | Scalable matrix tiles integrated with SVE, implementation-dependent tile dimensions | -| **Vector length model**| Fixed tile dimensions based on data type | Implementation-dependent tile dimensions, scales with SVE vector length | -| **Predication / masking**| No dedicated predication or masking in AMX tiles | Predication integrated through SVE predicate registers | -| **Gather/Scatter** | Not supported within AMX; handled by other instructions | Supported via integration with SVE’s gather/scatter features | -| **Key operations** | Focused on dot-product based matrix multiplication, optimized for GEMM and convolutions | Focus on outer-product matrix multiplication with streaming mode for dense linear algebra | -| **Best suited for** | AI/ML workloads such as training and inference, specifically GEMM and convolution kernels | AI/ML training and inference, scientific computing, dense linear algebra workloads | -| **Limitations** | Hardware and software ecosystem currently limited (primarily Intel Xeon platforms) | Emerging hardware support; compiler and library ecosystem in development | +| Feature | SSE | NEON | +|---|---|---| +| **Register width** | 128-bit (XMM) | 128-bit (Q) | +| **Vector length model** | Fixed 128 bits | Fixed 128 bits | +| **Predication or masking** | Minimal, no dedicated mask registers | No dedicated mask registers; use bitwise selects and conditionals | +| **Gather/scatter** | No native gather/scatter; gather in AVX2 and scatter in AVX-512 | No native gather/scatter; emulate in software | +| **Instruction set scope** | Arithmetic, logical, shuffle, convert, basic SIMD | Arithmetic, logical, shuffle, saturating ops; cryptography via Armv8 Cryptography Extensions (AES and SHA) | +| **Floating-point support** | Single and double precision | Single and double precision | +| **Typical applications** | Legacy SIMD, general vector arithmetic | Multimedia, DSP, cryptography, embedded compute | +| **Extensibility** | Extended by AVX, AVX2, and AVX-512 | Fixed at 128-bit; scalable vectors provided by SVE as a separate extension | +| **Programming model** | Intrinsics in C/C++; assembly for hotspots | Intrinsics widely used; inline assembly less common | +### A comparison of AVX and SVE (SVE2) -## Key Differences for Developers +| Feature | x86: AVX or AVX-512 | Arm: SVE or SVE2 | +|---|---|---| +| **Register width** | Fixed: 256-bit YMM, 512-bit ZMM | Scalable: 128 to 2048 bits in 128-bit steps | +| **Vector length model** | Fixed; often multiple code paths for different widths | Vector-length agnostic; same binary adapts to hardware width | +| **Predication or masking** | Mask registers in AVX-512 | Rich predication via predicate registers | +| **Gather or scatter** | Gather in AVX2 and scatter in AVX-512 | Native gather and scatter across widths | +| **Key operations** | Wide SIMD, FMA, conflict detection, advanced masking | Wide SIMD, FMA, predication, gather or scatter, reductions, bit manipulation | +| **Best suited for** | HPC, AI and ML, scientific computing, 
analytics | HPC, AI and ML, scientific computing, cloud and scalable workloads | +| **Limitations** | Power and thermal headroom under heavy 512-bit use; ecosystem complexity | Requires VLA programming style; SVE or SVE2 hardware availability varies by platform | -When migrating from x86 SIMD extensions to Arm SIMD, there are several important architectural and programming differences for you to consider. +{{% notice Note %}} +SVE2 extends SVE with richer integer and DSP capabilities for general-purpose and media workloads. +{{% /notice %}} -### Vector Length Model +### A comparison of AMX and SME -x86 SIMD extensions such as SSE, AVX, and AVX-512 operate on fixed vector widths, 128, 256, or 512 bits. This often necessitates multiple code paths or compiler dispatch techniques to efficiently exploit available hardware SIMD capabilities. Arm NEON, similar to SSE, uses a fixed 128-bit vector width, making it a familiar, fixed-size SIMD baseline. +| Feature | x86: AMX | Arm: SME | +|---|---|---| +| **Register model** | Tile registers configured via a palette; fixed per type limits | Scalable matrix tiles integrated with SVE; implementation-dependent dimensions | +| **Vector length model** | Fixed tile geometry per configuration | Scales with SVE vector length and streaming mode | +| **Predication or masking** | Predication not inherent to tiles | Predication via SVE predicate registers | +| **Gather or scatter** | Not provided in AMX tiles; handled elsewhere | Via SVE integration with gather or scatter | +| **Key operations** | Dot-product oriented GEMM and convolution | Outer-product matrix multiply; streaming mode for dense linear algebra | +| **Best suited for** | AI and ML training and inference, GEMM and convolution kernels | AI and ML training and inference, scientific and HPC dense linear algebra | +| **Limitations** | Hardware and software availability limited to specific CPUs | Emerging hardware support; compiler and library support evolving | -In contrast, Arm’s Scalable Vector Extension (SVE) and Scalable Matrix Extension (SME) introduce a vector-length agnostic model. This allows vectors to scale from 128 bits up to 2048 bits depending on the hardware, enabling the same binary to run efficiently across different implementations without modification. +## The key differences for developers -### Programming and Intrinsics +The most significant changes when porting include moving from fixed-width SIMD to vector-length-agnostic loop structures, replacing mask-register control with predicate-driven control, and adjusting memory access patterns and compiler flags. Review this section first to minimize rework and preserve portable performance. -x86 offers a comprehensive and mature set of SIMD intrinsics that increase in complexity especially with AVX-512 due to advanced masking and lane-crossing operations. Arm NEON intrinsics resemble SSE intrinsics and are relatively straightforward for porting existing SIMD code. However, Arm SVE and SME intrinsics are designed for a more predicated and vector-length agnostic style of programming. +### Vector length model -When migrating to SVE/SME you are encouraged to leverage compiler auto-vectorization with predication support, moving away from heavy reliance on low-level intrinsics to achieve scalable, portable performance. +x86 SIMD (SSE, AVX, and AVX-512) uses fixed widths of 128, 256, or 512 bits. This often requires multiple code paths or dispatch strategies. Arm NEON is also fixed at 128-bit and is a familiar baseline. 
SVE and SME introduce vector-length agnostic execution from 128 to 2048 bits so the same binary scales across implementations. -### Matrix Acceleration +### Programming and intrinsics -For matrix computation, AMX provides fixed-size tile registers optimized for dot-product operations such as GEMM and convolutions. In comparison, Arm SME extends the scalable vector compute model with scalable matrix tiles designed around outer-product matrix multiplication and novel streaming modes. +x86 intrinsics are extensive, and AVX-512 adds masks and lane controls that increase complexity. NEON intrinsics look familiar to SSE developers. SVE and SME use predication and scalable loops. Prefer auto-vectorization and VLA-friendly patterns over heavy hand-written intrinsics when portability matters. -SME’s flexible, hardware-adaptable tile sizes and tight integration with SVE’s predication model provide a highly adaptable platform for AI training, inference, and scientific computing. +### Matrix acceleration -Both AMX and SME are currently available on limited set of platforms. +AMX provides fixed-geometry tile compute optimized for dot products. SME extends Arm’s scalable model with outer-product math, scalable tiles, and streaming mode. Both AMX and SME are currently available on a limited set of platforms. -### Overall Summary +## Summary -Migrating from x86 SIMD to Arm SIMD entails embracing Arm’s scalable and predicated SIMD programming model embodied by SVE and SME, which supports future-proof, portable code across a wide range of hardware. - -NEON remains important for fixed-width SIMD similar to SSE but may be less suited for emerging HPC and AI workloads that demand scale and flexibility. - -You need to adapt to Arm’s newer vector-length agnostic programming and tooling to fully leverage scalable SIMD and matrix architectures. - -Understanding these key differences in vector models, programming paradigms, and matrix acceleration capabilities helps you migrate and achieve good performance on Arm. +Migrating from x86 SIMD to Arm entails adopting Arm’s scalable and predicated programming model with SVE and SME for forward-portable performance, while continuing to use NEON for fixed-width SIMD similar to SSE. ## Migration tools -There are tools and libraries that help translate SSE intrinsics to NEON intrinsics, which can shorten the migration effort and produce efficient Arm code. These libraries enable many SSE operations to be mapped to NEON equivalents, but some SSE features have no direct NEON counterparts and require workarounds or redesign. - -Overall, NEON is the standard for SIMD on Arm much like SSE for x86, making it the closest analogue for porting SIMD-optimized software from x86 to ARM. - -[sse2neon](https://github.com/DLTcollab/sse2neon) is an open-source header library that provides a translation layer from Intel SSE2 intrinsics to Arm NEON intrinsics. It enables many SSE2-optimized codebases to be ported to Arm platforms with minimal code modification by mapping familiar SSE2 instructions to their NEON equivalents. - - -[SIMD Everywhere (SIMDe)](https://github.com/simd-everywhere/simde) is a comprehensive, header-only library designed to ease the transition of SIMD code between different architectures. It provides unified implementations of SIMD intrinsics across x86 SSE/AVX, Arm NEON, and other SIMD instruction sets, facilitating portable and maintainable SIMD code. 
SIMDe supports a wide range of SIMD extensions and includes implementations that fall back to scalar code when SIMD is unavailable, maximizing compatibility. +Several libraries help translate or abstract SIMD intrinsics to speed up migration. Coverage varies, and some features have no direct analogue. +Here are some of the tools available and their key features: -[Google Highway](https://github.com/google/highway) is a high-performance SIMD optimized vector hashing and data processing library designed by Google. It leverages platform-specific SIMD instructions, including Arm NEON and x86 AVX, to deliver fast, portable, and scalable hashing functions and vector operations. Highway is particularly well-suited for large-scale data processing, machine learning, and performance-critical applications requiring efficient SIMD usage across architectures. +- Sse2neon: an open-source header that maps many SSE2 intrinsics to NEON equivalents. Good for getting code building quickly. Review generated code for performance. See the [sse2neon GitHub repository](https://github.com/DLTcollab/sse2neon). +- SIMD Everywhere (SIMDe): a header-only portability layer that implements many x86 and Arm intrinsics across ISAs, with scalar fallbacks when SIMD is unavailable. See the [simde-everywhere GitHub repository](https://github.com/simd-everywhere/simde). +- Google Highway (hwy): a portable SIMD library and APIs that target multiple ISAs, including NEON, SVE where supported, and AVX, without per-ISA code paths. See the [Google highway GitHub repository](https://github.com/google/highway). -You can also review [Porting architecture specific intrinsics](/learning-paths/cross-platform/intrinsics/) for more information. \ No newline at end of file +For more on cross-platform intrinsics, see the Learning Path [Porting architecture-specific intrinsics](/learning-paths/cross-platform/intrinsics/). diff --git a/content/learning-paths/cross-platform/vectorization-comparison/2-code-examples.md b/content/learning-paths/cross-platform/vectorization-comparison/2-code-examples.md index 015060804c..5d4881fa2d 100644 --- a/content/learning-paths/cross-platform/vectorization-comparison/2-code-examples.md +++ b/content/learning-paths/cross-platform/vectorization-comparison/2-code-examples.md @@ -1,24 +1,25 @@ --- -title: Vector extension code examples +title: "Explore vector extension code examples" weight: 4 -### FIXED, DO NOT MODIFY -layout: learningpathall +# FIXED, DO NOT MODIFY +layout: "learningpathall" --- -## SAXPY Example code +## SAXPY example code -As a way to provide some hands-on experience, you can study and run example code to better understand the vector extensions. The example used here is SAXPY. +This page walks you through a SAXPY (Single-Precision A·X Plus Y) kernel implemented in plain C and with vector extensions on both Arm (NEON, SVE) and x86 (AVX2, AVX-512). You will see how to build and run each version and how the vector width affects throughput. -SAXPY stands for "Single-Precision A·X Plus Y" and is a fundamental operation in linear algebra. It computes the result of the equation `y[i] = a * x[i] + y[i]` for all elements in the arrays `x` and `y`. +SAXPY computes `y[i] = a * x[i] + y[i]` across arrays `x` and `y`. It is widely used in numerical computing and is an accessible way to compare SIMD behavior across ISAs. -SAXPY is widely used in numerical computing, particularly in vectorized and parallelized environments, due to its simplicity and efficiency. 
+{{% notice Tip %}} +If a library already provides a tuned SAXPY (for example, BLAS), use that over hand-written kernels. These examples are for learning and porting. +{{% /notice %}} -### Reference version -Below is a plain C implementation of SAXPY without any vector extensions. +## Reference C version (no SIMD intrinsics) -This serves as a reference for the optimized examples provided later. +Below is a plain C implementation of SAXPY without any vector extensions which serves as a reference baseline for the optimized examples provided later: ```c #include @@ -56,7 +57,7 @@ int main() { } ``` -Use a text editor to copy the code to a file `saxpy_plain.c` and build and run the code using: +Use a text editor to copy the code to a file called `saxpy_plain.c` and build and run the code using: ```bash gcc -O3 -o saxpy_plain saxpy_plain.c @@ -65,13 +66,11 @@ gcc -O3 -o saxpy_plain saxpy_plain.c You can use Clang for any of the examples by replacing `gcc` with `clang` on the command line. -### Arm NEON version (128-bit SIMD, 4 floats per operation) +## Arm NEON version (128-bit SIMD, 4 floats per operation) -NEON operates on fixed 128-bit registers, able to process 4 single-precision float values simultaneously in every vector instruction. +NEON uses fixed 128-bit registers, processing four `float` values per instruction. It is available on most Armv8-A devices and is excellent for accelerating loops and signal processing tasks in mobile and embedded workloads. -This extension is available on most Arm-based devices and is excellent for accelerating loops and signal processing tasks in mobile and embedded workloads. - -The example below processes 16 floats per iteration using four separate NEON operations to improve instruction-level parallelism and reduce loop overhead. +The example below processes 16 floats per iteration using four separate NEON operations to improve instruction-level parallelism and reduce loop overhead: ```c #include @@ -139,7 +138,13 @@ gcc -O3 -march=armv8-a+simd -o saxpy_neon saxpy_neon.c ./saxpy_neon ``` -### AVX2 (256-bit SIMD, 8 floats per operation) +{{% notice Note %}} +On AArch64, NEON is mandatory; the flag is shown for clarity. +{{% /notice %}} + + + +## x86 AVX2 version (256-bit SIMD, 8 floats per operation) AVX2 doubles the SIMD width compared to NEON, processing 8 single-precision floats at a time in 256-bit registers. @@ -203,7 +208,7 @@ gcc -O3 -mavx2 -mfma -o saxpy_avx2 saxpy_avx2.c ./saxpy_avx2 ``` -### Arm SVE (hardware dependent: 4 to 16+ floats per operation) +## Arm SVE (hardware dependent: 4 to 16+ floats per operation) Arm SVE lets the hardware determine the register width, which can range from 128 up to 2048 bits. This means each operation can process from 4 to 64 single-precision floats at a time, depending on the implementation. @@ -214,6 +219,7 @@ SVE encourages writing vector-length agnostic code: the compiler automatically h ```c #include #include +#include #include #include @@ -270,13 +276,13 @@ gcc -O3 -march=armv8-a+sve -o saxpy_sve saxpy_sve.c ./saxpy_sve ``` -### AVX-512 (512-bit SIMD, 16 floats per operation) +## x86 AVX-512 version (512-bit SIMD, 16 floats per operation) AVX-512 provides the widest SIMD registers of mainstream x86 architectures, processing 16 single-precision floats per 512-bit operation. AVX-512 availability varies across x86 processors. It's found on Intel Xeon server processors and some high-end desktop processors, as well as select AMD EPYC models. 
-For very large arrays and high-performance workloads, AVX-512 delivers extremely high throughput, with additional masking features for efficient tail processing. +For large arrays and high-performance workloads, AVX-512 delivers extremely high throughput, with additional masking features for efficient tail processing. ```c #include @@ -341,7 +347,7 @@ gcc -O3 -mavx512f -o saxpy_avx512 saxpy_avx512.c ./saxpy_avx512 ``` -### Summary +## Summary Wider data lanes mean each operation processes more elements, offering higher throughput on supported hardware. However, actual performance depends on factors like memory bandwidth, the number of execution units, and workload characteristics. @@ -349,4 +355,4 @@ Processors also improve performance by implementing multiple SIMD execution unit Each vector extension requires different intrinsics, compilation flags, and programming approaches. While x86 and Arm vector extensions serve similar purposes and achieve comparable performance gains, you will need to understand the options and details to create portable code. -You should also look for existing libraries that already work across vector extensions before you get too deep into code porting. This is often a good way to leverage the available SIMD capabilities on your target hardware. +You can also look for existing libraries that already work across vector extensions before you get too deep into code porting. This is often a good way to leverage the available SIMD capabilities on your target hardware. diff --git a/content/learning-paths/cross-platform/vectorization-comparison/_index.md b/content/learning-paths/cross-platform/vectorization-comparison/_index.md index d2a54fe293..a925bb0166 100644 --- a/content/learning-paths/cross-platform/vectorization-comparison/_index.md +++ b/content/learning-paths/cross-platform/vectorization-comparison/_index.md @@ -1,21 +1,22 @@ --- -title: "Mapping x86 vector extensions to Arm: a migration overview" - -minutes_to_complete: 30 +title: "Migrate x86-64 SIMD to Arm64" draft: true cascade: draft: true -who_is_this_for: This is an advanced topic for software developers who want to learn how to migrate vectorized code to Arm. +minutes_to_complete: 30 + +who_is_this_for: This is an advanced topic for developers migrating vectorized (SIMD) code from x86-64 to Arm64. learning_objectives: - - Understand how Arm vector extensions, including NEON, Scalable Vector Extension (SVE), and Scalable Matrix Extension (SME) map to vector extensions from other architectures. - - Start planning how to migrate your SIMD code to the Arm architecture. + - Identify how Arm vector extensions including NEON, Scalable Vector Extension (SVE), and Scalable Matrix Extension (SME) map to vector extensions from other architectures + - Plan a migration strategy using autovectorization, intrinsics, or library substitution + prerequisites: - - Familiarity with vector extensions, SIMD programming, and compiler intrinsics. - - Access to Linux systems with NEON and SVE support. 
+ - Familiarity with vector extensions, SIMD programming, and compiler intrinsics + - Access to Linux systems with NEON and SVE support author: - Jason Andrews @@ -40,11 +41,11 @@ shared_between: further_reading: - resource: - title: SVE Programming Examples + title: SVE programming examples link: https://developer.arm.com/documentation/dai0548/latest type: documentation - resource: - title: Port Code to Arm Scalable Vector Extension (SVE) + title: Port code to Arm Scalable Vector Extension (SVE) link: https://learn.arm.com/learning-paths/servers-and-cloud-computing/sve type: website - resource: @@ -64,22 +65,24 @@ further_reading: link: https://developer.arm.com/documentation/109246/latest type: documentation - resource: - title: Compiler Intrinsics + title: Compiler intrinsics (overview) link: https://en.wikipedia.org/wiki/Intrinsic_function type: website - resource: - title: ACLE - Arm C Language Extension + title: ACLE - Arm C Language Extensions link: https://github.com/ARM-software/acle type: website - resource: - title: Application Binary Interface for the Arm Architecture + title: Application Binary Interface for the Arm Architecture (AAPCS64) link: https://github.com/ARM-software/abi-aa type: website - ### FIXED, DO NOT MODIFY # ================================================================================ weight: 1 # _index.md always has weight of 1 to order correctly layout: "learningpathall" # All files under learning paths have this same wrapper learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. --- + + + diff --git a/content/learning-paths/embedded-and-microcontrollers/_index.md b/content/learning-paths/embedded-and-microcontrollers/_index.md index 945b031b43..e5d9c4f74d 100644 --- a/content/learning-paths/embedded-and-microcontrollers/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/_index.md @@ -45,11 +45,12 @@ tools_software_languages_filter: - CMSIS-DSP: 1 - CMSIS-Toolbox: 3 - CNN: 1 +- Computer Vision: 1 - Containerd: 1 - DetectNet: 1 - Docker: 10 - DSTREAM: 2 -- Edge AI: 1 +- Edge AI: 2 - Edge Impulse: 1 - ExecuTorch: 3 - FastAPI: 1 diff --git a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/2-env-setup.md b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/2-env-setup.md index 85b368fbed..6c1ff55547 100644 --- a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/2-env-setup.md +++ b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/2-env-setup.md @@ -44,6 +44,7 @@ From within the Python virtual environment, run the commands below to download t cd $HOME git clone https://github.com/pytorch/executorch.git cd executorch +git checkout 188312844ebfb499f92ab5a02137ed1a4abca782 ``` Run the commands below to set up the ExecuTorch internal dependencies: @@ -70,7 +71,7 @@ pip list | grep executorch ``` ```output -executorch 0.6.0a0+3eea1f1 +executorch 1.1.0a0+1883128 ``` ## Next Steps diff --git a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/3-env-setup-fvp.md b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/3-env-setup-fvp.md index c717770259..c554c0a575 100644 --- a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/3-env-setup-fvp.md +++ 
b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/3-env-setup-fvp.md @@ -16,11 +16,11 @@ The Corstone reference system is provided free of charge, although you will have ## Corstone-320 FVP Setup for ExecuTorch -Navigate to the Arm examples directory in the ExecuTorch repository. Run the following command. +Run the FVP setup script in the ExecuTorch repository. ```bash -cd $HOME/executorch/examples/arm -./setup.sh --i-agree-to-the-contained-eula +cd $HOME/executorch +./examples/arm/setup.sh --i-agree-to-the-contained-eula ``` After the script has finished running, it prints a command to run to finalize the installation. This step adds the FVP executables to your system path. diff --git a/content/learning-paths/embedded-and-microcontrollers/tfm/_index.md b/content/learning-paths/embedded-and-microcontrollers/tfm/_index.md index e23ea09452..b3fb6106d3 100644 --- a/content/learning-paths/embedded-and-microcontrollers/tfm/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/tfm/_index.md @@ -18,8 +18,7 @@ author: Pareena Verma test_images: - armswdev/arm-tools:bare-metal-compilers -test_link: null -test_maintenance: true +test_maintenance: false ### Tags skilllevels: Introductory diff --git a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/_index.md b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/_index.md index 967482a761..ad8a8ade3f 100644 --- a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/_index.md @@ -1,26 +1,23 @@ --- -title: Edge AI with PyTorch & ExecuTorch - Tiny Sentiment Analysis on Arm +title: Edge AI with PyTorch & ExecuTorch - Tiny Rock-Paper-Scissors on Arm -draft: true -cascade: - draft: true +minutes_to_complete: 60 -minutes_to_complete: 90 +who_is_this_for: This learning path is for machine learning developers interested in deploying TinyML models on Arm-based edge devices. You will learn how to train and deploy a machine learning model for the classic game "Rock-Paper-Scissors" on edge devices. You'll use PyTorch and ExecuTorch, frameworks designed for efficient on-device inference, to build and run a small-scale computer vision model. -who_is_this_for: This topic is for machine learning engineers, embedded AI developers, and researchers interested in deploying TinyML models for NLP on Arm-based edge devices using PyTorch and ExecuTorch. learning_objectives: - - Train a custom CNN-based sentiment classification model implemented in PyTorch. - - Optimize and convert the model using ExecuTorch for Arm-based edge devices. - - Deploy and run inference on the Corstone-320 FVP. + - Train a small Convolutional Neural Network (CNN) for image classification using PyTorch. + - Understand how to use synthetic data generation for training a model when real-world data is limited. + - Optimize and convert a PyTorch model into an ExecuTorch program (.pte) for Arm-based devices. + - Run the trained model on a local machine to play an interactive mini-game, demonstrating model inference. -prerequisites: - - Basic knowledge of machine learning concepts. - - It is advised to complete The Learning Path, [Introduction to TinyML on Arm using PyTorch and ExecuTorch](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm) before starting this learning path. - - Familiarity with Python and PyTorch. 
- - A Linux host machine or VM running Ubuntu 22.04 or higher. - - An Arm license to run the examples on the Corstone-320 Fixed Virtual Platform (FVP), for hands-on deployment. +prerequisites: + - A basic understanding of machine learning concepts. + - Familiarity with Python and the PyTorch library. + - Having completed [Introduction to TinyML on Arm using PyTorch and ExecuTorch](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm). + - An x86 Linux host machine or VM running Ubuntu 22.04 or higher. author: Dominica Abena O. Amanfo @@ -28,9 +25,12 @@ author: Dominica Abena O. Amanfo skilllevels: Introductory subjects: ML armips: - - Cortex-A + - Cortex-M + - Ethos-U tools_software_languages: - tinyML + - Computer Vision + - Edge AI - CNN - PyTorch - ExecuTorch @@ -38,7 +38,6 @@ tools_software_languages: operatingsystems: - Linux - further_reading: - resource: title: Run Llama 3 on a Raspberry Pi 5 using ExecuTorch @@ -56,4 +55,4 @@ further_reading: weight: 1 # _index.md always has weight of 1 to order correctly layout: "learningpathall" # All files under learning paths have this same wrapper learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. ---- +--- \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/env-setup-1.md b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/env-setup-1.md index 6f62990675..ac6b5e10a2 100644 --- a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/env-setup-1.md +++ b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/env-setup-1.md @@ -6,17 +6,31 @@ weight: 2 layout: learningpathall --- -## Overview -In this course, you will learn how to train and run inference using a Tiny Sentiment Classifier. You'll deploy the model on the Arm Corstone-320 FVP for sentiment analysis. +## Overview +This learning path (LP) is a direct follow-up to the [Introduction to TinyML on Arm using PyTorch and ExecuTorch](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm) learning path. While the previous one introduced you to the core concepts and the toolchain, this one puts that knowledge into practice with a fun, real-world example. You will move from the simple [Feedforward Neural Network](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/4-build-model) in the previous LP, to a more practical computer vision task: A tiny Rock-Paper-Scissors game, to demonstrate how these tools can be used to solve a tangible problem and run efficiently on Arm-based edge devices. -We will train a lightweight convolutional neural network (CNN)-based sentiment classifier using synthetic text data. This model is optimized for small devices, using embedding layers and 1D convolutions for efficient text classification. +You will train a lightweight CNN to classify images of the letters R, P, and S as "rock," "paper," or "scissors." The script uses a synthetic data renderer to create a large dataset of these images with various transformations and noise, eliminating the need for a massive real-world dataset. +### What is a Convolutional Neural Network (CNN)? +A Convolutional Neural Network (CNN) is a type of deep neural network primarily used for analyzing visual imagery. 
Unlike traditional neural networks, CNNs are designed to process pixel data by using a mathematical operation called **convolution**. This allows them to automatically and adaptively learn spatial hierarchies of features from input images, from low-level features like edges and textures to high-level features like shapes and objects. -## Environment Setup -Setup your development environment for TinyML by following the first 3 chapters of the [Introduction to TinyML on Arm using PyTorch and ExecuTorch](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm) Learning Path (LP). +![Image of a convolutional neural network architecture](image.png) +[Image credits](https://medium.com/@atul_86537/learning-ml-from-first-principles-c-linux-the-rick-and-morty-way-convolutional-neural-c76c3df511f4). + +CNNs are the backbone of many modern computer vision applications, including: + +- **Image Classification:** Identifying the main object in an image, like classifying a photo as a "cat" or "dog". +- **Object Detection:** Locating specific objects within an image and drawing a box around them. +- **Facial Recognition:** Identifying and verifying individuals based on their faces. + +For the Rock-Paper-Scissors game, you'll use a tiny CNN to classify images of the letters R, P, and S as the corresponding hand gestures. -If you just followed the LP above, you should already have your virtual environment activated. If not, activate it using: + +## Environment Setup +To get started, follow the first three chapters of the [Introduction to TinyML on Arm using PyTorch and ExecuTorch](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm) Learning Path. This will set up your development environment and install the necessary tools. Return to this LP once you've run the `./examples/arm/run.sh` script in the ExecuTorch repository. + +If you just followed the LP above, you should already have your virtual environment activated. If not, activate it using: ```console source $HOME/executorch-venv/bin/activate @@ -26,8 +40,7 @@ The prompt of your terminal now has `(executorch-venv)` as a prefix to indicate Run the commands below to install the dependencies. ```bash -pip install argparse json +pip install argparse numpy pillow torch ``` -You are now ready to build the model - +You are now ready to create the model. diff --git a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/fine-tune-2.md b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/fine-tune-2.md index 77c0bd59c2..e9ffd439ec 100644 --- a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/fine-tune-2.md +++ b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/fine-tune-2.md @@ -1,5 +1,5 @@ --- -title: Train and Test the Sentiment Classifier +title: Train and Test the Rock-Paper-Scissors Model weight: 3 ### FIXED, DO NOT MODIFY @@ -14,288 +14,477 @@ Navigate to the Arm examples directory in the ExecuTorch repository. cd $HOME/executorch/examples/arm ``` -Using a file editor of your choice, create a file named tiny_sentiment.py with the code shown below: +Using a file editor of your choice, create a file named `rps_tiny.py`, copy and paste the code shown below: ```python +#!/usr/bin/env python3 +""" +Tiny Rock–Paper–Scissors CNN (PyTorch) + ExecuTorch export + CLI mini-game. 
+ +Usage: + # Train (fast) + export .pte + play + python rps_tiny.py --epochs 8 --export --play + + # Just train (no export) + python rps_tiny.py --epochs 8 + + # Export previously trained weights to .pte + python rps_tiny.py --export + + # Play the mini-game (uses the best weights on disk) + python rps_tiny.py --play + +Outputs: + - rps_best.pt (best PyTorch weights) + - rps_labels.json (label map) + - rps_tiny.pte (ExecuTorch program, if --export) +""" + +import argparse, json, math, os, random, sys +from dataclasses import dataclass +from typing import Tuple, List + +import numpy as np +from PIL import Image,ImageOps,ImageDraw, ImageFont, ImageFilter + import torch import torch.nn as nn import torch.optim as optim -import json -import numpy as np from torch.utils.data import Dataset, DataLoader -from sklearn.model_selection import train_test_split - - -class SentimentDataset(Dataset): - def __init__(self, texts, labels, vocab=None, max_length=50): - self.texts = texts - self.labels = labels - self.max_length = max_length - - if vocab is None: - # Build vocabulary from training data - self.vocab = {'': 0, '': 1} - for text in texts: - for word in text.lower().split(): - if word not in self.vocab: - self.vocab[word] = len(self.vocab) - else: - self.vocab = vocab - - def __len__(self): - return len(self.texts) - - def __getitem__(self, idx): - text = self.texts[idx].lower().split() - # Convert words to indices and pad/truncate to max_length - indices = [self.vocab.get(word, self.vocab['']) for word in text] - if len(indices) < self.max_length: - indices += [self.vocab['']] * (self.max_length - len(indices)) - else: - indices = indices[:self.max_length] - - return torch.tensor(indices), torch.tensor(self.labels[idx]) - - -class SentimentClassifier(nn.Module): - def __init__(self, vocab_size, embed_dim=100, hidden_dim=128, num_classes=2): - super().__init__() - self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0) - self.conv1 = nn.Conv1d(embed_dim, hidden_dim, kernel_size=3, padding=1) - self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1) - self.pool = nn.AdaptiveMaxPool1d(1) - self.fc1 = nn.Linear(hidden_dim, hidden_dim) - self.fc2 = nn.Linear(hidden_dim, num_classes) - self.dropout = nn.Dropout(0.5) - - def forward(self, x): - # x shape: (batch_size, seq_len) - x = self.embedding(x) # (batch_size, seq_len, embed_dim) - x = x.transpose(1, 2) # (batch_size, embed_dim, seq_len) - x = torch.relu(self.conv1(x)) - x = self.dropout(x) - x = torch.relu(self.conv2(x)) - x = self.pool(x).squeeze(-1) - x = torch.relu(self.fc1(x)) - x = self.dropout(x) - x = self.fc2(x) - return x - - -def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=20): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - model = model.to(device) - best_val_acc = 0 - - for epoch in range(num_epochs): - # Training phase - model.train() - train_loss = 0 - train_correct = 0 - train_total = 0 - - for inputs, labels in train_loader: - inputs, labels = inputs.to(device), labels.to(device) - optimizer.zero_grad() - outputs = model(inputs) - loss = criterion(outputs, labels) - loss.backward() - optimizer.step() - - train_loss += loss.item() - _, predicted = outputs.max(1) - train_total += labels.size(0) - train_correct += predicted.eq(labels).sum().item() - - # Validation phase - model.eval() - val_loss = 0 - val_correct = 0 - val_total = 0 - - with torch.no_grad(): - for inputs, labels in val_loader: - inputs, labels = inputs.to(device), 
labels.to(device) - outputs = model(inputs) - loss = criterion(outputs, labels) - - val_loss += loss.item() - _, predicted = outputs.max(1) - val_total += labels.size(0) - val_correct += predicted.eq(labels).sum().item() - - train_acc = 100. * train_correct / train_total - val_acc = 100. * val_correct / val_total - - print(f'Epoch {epoch+1}/{num_epochs}:') - print(f'Train Loss: {train_loss/len(train_loader):.4f}, Train Acc: {train_acc:.2f}%') - print(f'Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {val_acc:.2f}%') - print('-' * 60) - - # Save best model - if val_acc > best_val_acc: - best_val_acc = val_acc - torch.save(model.state_dict(), 'best_sentiment_model.pt') +# --------------------------- +# Config +# --------------------------- +SEED = 7 +random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED) + +LABELS = ["rock", "paper", "scissors"] # indexes: 0,1,2 +IMG_SIZE = 28 +TRAIN_SAMPLES_PER_CLASS = 1000 +VAL_SAMPLES_PER_CLASS = 200 +BATCH = 64 +LR = 2e-3 +EPOCHS_DEFAULT = 6 +WEIGHTS = "rps_best.pt" +LABELS_JSON = "rps_labels.json" +PTE_OUT = "rps_tiny.pte" + + +# --------------------------- +# Synthetic R/P/S renderer +# --------------------------- +def _rand(a, b): + return a + random.random()*(b-a) + +def render_rps(label: str) -> Image.Image: + """ + Render a 28x28 grayscale image for 'rock'/'paper'/'scissors' + using the letters R/P/S with random transforms + noise. + """ + ch = {"rock":"R","paper":"P","scissors":"S"}[label] + img = Image.new("L", (IMG_SIZE, IMG_SIZE), color=0) + d = ImageDraw.Draw(img) + + # Try to get a default truetype; fallback to PIL default bitmap font + font = None + try: + # Use a generic font size that fills the canvas + font = ImageFont.truetype(font="Arial.ttf", size=int(_rand(18,24))) + except Exception: + font = ImageFont.load_default() + + # Random text position + bbox = d.textbbox((0, 0), ch, font=font) # (left, top, right, bottom) + w = bbox[2] - bbox[0] + h = bbox[3] - bbox[1] + x = (IMG_SIZE - w)//2 + int(_rand(-2, 2)) + y = (IMG_SIZE - h)//2 + int(_rand(-2, 2)) + + # Random brightness for foreground + fg = int(_rand(180, 255)) + d.text((x,y), ch, fill=fg, font=font) + + # Slight blur/rotate/shear + if random.random()<0.6: + img = img.filter(ImageFilter.GaussianBlur(radius=_rand(0.0, 0.7))) + if random.random()<0.8: + angle = _rand(-18, 18) + img = img.rotate(angle, resample=Image.BILINEAR, expand=False, fillcolor=0) + + # Add mild elastic-ish jitter by affine + if random.random()<0.5: + dx, dy = _rand(-1.0, 1.0), _rand(-1.0, 1.0) + ax = 1 + _rand(-0.05, 0.05) + img = img.transform( + img.size, + Image.AFFINE, + (ax, _rand(-0.05,0.05), dx, _rand(-0.05,0.05), 1+_rand(-0.05,0.05), dy), + resample=Image.BILINEAR, + fillcolor=0 + ) + + # Salt & pepper noise + if random.random()<0.8: + arr = np.array(img, dtype=np.float32) + noise = np.random.randn(*arr.shape)*_rand(3, 12) + arr = np.clip(arr + noise, 0, 255).astype(np.uint8) + img = Image.fromarray(arr, mode="L") + + return img + + +# --------------------------- +# Dataset +# --------------------------- +@dataclass +class RPSItem: + image: torch.Tensor # [1,28,28] float32 0..1 + label: int + +class RPSDataset(Dataset): + def __init__(self, n_per_class: int, train: bool): + self.items: List[RPSItem] = [] + for idx, name in enumerate(LABELS): + for _ in range(n_per_class): + img = render_rps(name) + # Slightly different augments for train vs val + if train and random.random()<0.15: + img = ImageOps.invert(img) + t = torch.from_numpy(np.array(img, dtype=np.float32)/255.0)[None, ...] 
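+                # Pair the normalized [1, 28, 28] float tensor with its class index (0=rock, 1=paper, 2=scissors)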
+ self.items.append(RPSItem(t, idx)) + random.shuffle(self.items) + + def __len__(self): return len(self.items) + def __getitem__(self, i): + it = self.items[i] + return it.image, torch.tensor(it.label, dtype=torch.long) + + +# --------------------------- +# Model: Tiny CNN (Ethos-friendly) +# --------------------------- +class TinyRPS(nn.Module): + """ + Simple ConvNet: + [B,1,28,28] -> Conv3x3(16) -> ReLU -> Conv3x3(32) -> ReLU + -> MaxPool2d(2) -> Conv3x3(64) -> ReLU -> MaxPool2d(2) + -> flatten -> Linear(128) -> ReLU -> Linear(3) + """ + def __init__(self): + super().__init__() + self.body = nn.Sequential( + nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(inplace=True), + nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(inplace=True), + nn.MaxPool2d(2), + nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(inplace=True), + nn.MaxPool2d(2), + ) + self.head = nn.Sequential( + nn.Flatten(), + nn.Linear(64*7*7, 128), nn.ReLU(inplace=True), + nn.Linear(128, 3) + ) + def forward(self, x): # x: [B,1,28,28] + return self.head(self.body(x)) + +# AOT entry points expected by aot_arm_compiler +ModelUnderTest = TinyRPS() +ModelInputs = (torch.zeros(1, 1, IMG_SIZE, IMG_SIZE, dtype=torch.float32),) + +# --------------------------- +# Train / Eval +# --------------------------- +def run_epoch(dl, model, crit, opt=None): + train = opt is not None + model.train() if train else model.eval() + totl=totc=cnt=0 + with torch.set_grad_enabled(train): + for x,y in dl: + if train: opt.zero_grad(set_to_none=True) + out = model(x) + loss = crit(out, y) + if train: + loss.backward() + opt.step() + totl += float(loss)*x.size(0) + totc += (out.argmax(1)==y).sum().item() + cnt += x.size(0) + return totl/cnt, totc/cnt + + +# --------------------------- +# Export to ExecuTorch (.pte) +# --------------------------- +def export_to_pte(model: nn.Module, out_path=PTE_OUT): + model.eval() + example = torch.zeros(1,1,IMG_SIZE,IMG_SIZE, dtype=torch.float32) + exported = None + try: + try: + from torch.export import export + except Exception: + import torch._export as _export + export = _export.export + exported = export(model, (example,)) + except Exception: + # Fallback: some older builds expose exir.capture + from executorch.exir import capture + exported = capture(model, (example,)) + from executorch import exir + edge = exir.to_edge(exported) + prog = edge.to_executorch() + with open(out_path, "wb") as f: + f.write(prog.buffer) + print(f"[export] wrote {out_path}") + + +# --------------------------- +# CLI mini-game +# --------------------------- +def ascii_show(img: torch.Tensor) -> str: + """Convert [1,28,28] tensor into tiny ASCII block for fun.""" + chars = " .:-=+*#%@" + arr = (img.squeeze(0).numpy()*255).astype(np.uint8) + h, w = arr.shape + lines=[] + for y in range(0,h,2): + row=[] + for x in range(0,w,1): + v = arr[y, x] + row.append(chars[min(len(chars)-1, int(v)*len(chars)//256)]) + lines.append("".join(row)) + return "\n".join(lines) + +def beats(a: int, b: int) -> int: + """Return +1 if a beats b, 0 if tie, -1 if loses.""" + # 0=rock beats 2=scissors, 1=paper beats 0, 2=scissors beats 1 + if a == b: return 0 + if (a==0 and b==2) or (a==1 and b==0) or (a==2 and b==1): return +1 + return -1 + +def play_game(model: nn.Module): + print("\n=== Rock–Paper–Scissors: Play vs Tiny CNN ===") + print("Type one of: rock / paper / scissors / quit\n") + while True: + s = input("Your move> ").strip().lower() + if s in ("quit","q","exit"): break + if s not in LABELS: + print("Invalid. 
Try: rock / paper / scissors / quit") + continue + # Generate an image of YOUR move and one for OPPONENT + your_idx = LABELS.index(s) + your_img = render_rps(s) + opp_idx = random.randint(0,2) + opp_img = render_rps(LABELS[opp_idx]) + + # Classify both with the model on CPU + def to_tensor(im): + return torch.from_numpy(np.array(im, dtype=np.float32)/255.0)[None,None,...] + with torch.no_grad(): + y_logits = model(to_tensor(your_img)) + o_logits = model(to_tensor(opp_img)) + y_pred = int(y_logits.argmax(1).item()) + o_pred = int(o_logits.argmax(1).item()) + y_conf = torch.softmax(y_logits,1)[0,y_pred].item() + o_conf = torch.softmax(o_logits,1)[0,o_pred].item() + + print("\nYou played:", s) + print(ascii_show(to_tensor(your_img)[0])) + print(f"Model thinks you played: {LABELS[y_pred]} ({y_conf*100:.1f}%)") + + print("\nOpponent played (hidden):") + print(ascii_show(to_tensor(opp_img)[0])) + print(f"Model thinks opponent played: {LABELS[o_pred]} ({o_conf*100:.1f}%)") + + outcome = beats(y_pred, o_pred) + if outcome>0: print("\n🎉 You win!") + elif outcome<0: print("\n😅 You lose!") + else: print("\n🤝 It's a tie!") + print("-"*50) + + +# --------------------------- +# Main +# --------------------------- def main(): - # Sample balanced dataset (After successfully completing this LP, you can try your own samples) - texts = [ - "I am very happy today", - "This is wonderful", - "I love this movie", - "Great experience", - "I am feeling fantastic", - "This is awesome", - "I am very sad today", - "This is terrible", - "I hate this movie", - "Worst experience ever", - "I am feeling depressed", - "This is awful" - ] - labels = [1] * 6 + [0] * 6 # 1 for positive, 0 for negative - - # Split dataset - train_texts, val_texts, train_labels, val_labels = train_test_split( - texts, labels, test_size=0.2, random_state=42, stratify=labels - ) - - # Create datasets - train_dataset = SentimentDataset(train_texts, train_labels) - val_dataset = SentimentDataset(val_texts, val_labels, vocab=train_dataset.vocab) - - # Create dataloaders - train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True) - val_loader = DataLoader(val_dataset, batch_size=4) - - # Initialize model and training components - model = SentimentClassifier(len(train_dataset.vocab)) - criterion = nn.CrossEntropyLoss() - optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4) - - # Train model - train_model(model, train_loader, val_loader, criterion, optimizer) - - # Save vocabulary - with open('sentiment_vocab.json', 'w') as f: - json.dump(train_dataset.vocab, f) - - # Test mode - model.eval() - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - model = model.to(device) - - while True: - text = input("Enter text to analyze (or 'quit' to exit): ") - if text.lower() == 'quit': - break - - # Preprocess input - indices = [train_dataset.vocab.get(word.lower(), train_dataset.vocab['']) - for word in text.split()] - if len(indices) < train_dataset.max_length: - indices += [train_dataset.vocab['']] * (train_dataset.max_length - len(indices)) - else: - indices = indices[:train_dataset.max_length] - - # Get prediction - with torch.no_grad(): - input_tensor = torch.tensor(indices).unsqueeze(0).to(device) - output = model(input_tensor) - probabilities = torch.softmax(output, dim=1) - prediction = torch.argmax(output).item() - confidence = probabilities[0][prediction].item() * 100 - - sentiment = "Positive" if prediction == 1 else "Negative" - print(f"Sentiment: {sentiment}") - print(f"Confidence: 
{confidence:.2f}%") + ap = argparse.ArgumentParser() + ap.add_argument("--epochs", type=int, default=EPOCHS_DEFAULT) + ap.add_argument("--no-train", action="store_true", help="skip training (use saved weights)") + ap.add_argument("--export", action="store_true", help="export ExecuTorch .pte after training") + ap.add_argument("--play", action="store_true", help="play the mini-game after (or without) training") + args = ap.parse_args() + + # Always save label map for runners + with open(LABELS_JSON, "w") as f: + json.dump({"labels": LABELS}, f, indent=2) + + model = TinyRPS() + + if not args.no_train: + print("== Building synthetic datasets ==") + tr = RPSDataset(TRAIN_SAMPLES_PER_CLASS, train=True) + va = RPSDataset(VAL_SAMPLES_PER_CLASS, train=False) + train_loader = DataLoader(tr, batch_size=BATCH, shuffle=True, num_workers=0) + val_loader = DataLoader(va, batch_size=BATCH, shuffle=False, num_workers=0) + + print(f"Train size: {len(tr)} | Val size: {len(va)}") + + crit = nn.CrossEntropyLoss() + opt = optim.Adam(model.parameters(), lr=LR, weight_decay=1e-4) + + best = -1.0 + for e in range(1, args.epochs+1): + tl, ta = run_epoch(train_loader, model, crit, opt) + vl, vaa = run_epoch(val_loader, model, crit, None) + print(f"Epoch {e:02d}/{args.epochs} | train {ta*100:5.2f}% | val {vaa*100:5.2f}%") + if vaa > best: + best = vaa + torch.save(model.state_dict(), WEIGHTS) + print(f" ↑ saved {WEIGHTS} (val {vaa*100:.2f}%)") + print("Training done.") + else: + print("--no-train: skipping training") + + # Load best weights if present + if os.path.exists(WEIGHTS): + model.load_state_dict(torch.load(WEIGHTS, map_location="cpu")) + model.eval() + print(f"Loaded weights from {WEIGHTS}") + else: + print(f"[warn] No weights file {WEIGHTS}; using random init.") + + if args.export: + try: + export_to_pte(model, PTE_OUT) + except Exception as e: + print("[export] failed:", e) + + if args.play: + play_game(model) if __name__ == "__main__": - main() + main() ``` -### How This Script Works: -- Generates a synthetic dataset of positive and negative sentiment samples. -- Encodes text into numerical format using an embedding layer. -- Trains a compact CNN model for sentiment classification. -- Saves the trained model and vocabulary for inference. -- Once training is complete, the model is saved as tiny_sentiment.pt, and vocabulary is saved to tiny_sentiment_vocab.json. +### About the Script +The script handles the entire workflow: data generation, model training, and a simple command-line game. +- **Synthetic Data Generation:** The script includes a function `render_rps()` that generates 28x28 grayscale images of the letters 'R', 'P', and 'S' with random rotations, blurs, and noise. This creates a diverse dataset that's used to train the model. +- **Model Architecture:** The model, a TinyRPS class, is a simple Convolutional Neural Network (CNN). It uses a series of 2D convolutional layers, followed by pooling layers to reduce spatial dimensions, and finally, fully connected linear layers to produce a final prediction. This architecture is efficient and well-suited for edge devices. +- **Training:** The script generates synthetic training and validation datasets. It then trains the CNN model using the **Adam optimizer** and **Cross-Entropy Loss**. It tracks validation accuracy and saves the best-performing model to `rps_best.pt`. +- **ExecuTorch Export:** A key part of the script is the `export_to_pte()` function. 
This function uses the `torch.export module` (or a fallback) to trace the trained PyTorch model and convert it into an ExecuTorch program (`.pte`). This compiled program is highly optimized for deployment on any target hardware, for example Cortex-M or Cortex-A CPUs for embedded devices. +- **CLI Mini-Game**: After training, you can play an interactive game. The script generates an image of your move and a random opponent's move. It then uses the trained model to classify both images and determines the winner based on the model's predictions. +### Running the Script: + +To train the model, export it, and play the game, run the following command: -To train and test the model with your own inputs, run: ```bash -python ~/executorch/examples/arm/tiny_sentiment.py +python rps_tiny.py --epochs 8 --export --play ``` +You'll see the training progress, where the model's accuracy rapidly improves on the synthetic data. + +```output +== Building synthetic datasets == +Train size: 3000 | Val size: 600 + totl += float(loss)*x.size(0) +Epoch 01/8 | train 80.03% | val 98.67% + ↑ saved rps_best.pt (val 98.67%) +Epoch 02/8 | train 99.57% | val 100.00% + ↑ saved rps_best.pt (val 100.00%) +Epoch 03/8 | train 99.83% | val 99.83% +Epoch 08/8 | train 100.00% | val 100.00% +Training done. +Loaded weights from rps_best.pt +[export] wrote rps_tiny.pte +``` +After training and export, the game will start. Type rock, paper, or scissors and see the model's predictions and what your opponent played. +```output +=== Rock–Paper–Scissors: Play vs Tiny CNN === +Type one of: rock / paper / scissors / quit -{{% notice Note %}} -The output has been truncated -{{% /notice %}} +Your move> rock -The output should look like: -```bash -=== Sentiment Analysis Classifier === -This program demonstrates text sentiment classification using PyTorch - -Loading dataset... -Total examples: 12 -Positive examples: 6 -Negative examples: 6 - -Building vocabulary from training data... -Vocabulary size: 19 words - -Initializing model... -Starting training... -Training on device: cpu - -Epoch 1/20 -Training: 100%|██████████| 3/3 [00:00<00:00, 62.94it/s, loss=0.2385, acc=44.44%] -Validation: 100%|███████| 1/1 [00:00<00:00, 633.87it/s, loss=0.2302, acc=66.67%] - -Epoch Summary: -Train Loss: 0.7154, Train Acc: 44.44% -Val Loss: 0.6906, Val Acc: 66.67% -New best validation accuracy: 66.67%! Saving model... - -. -. -. -Saving vocabulary... - -=== Interactive Testing Mode === -Enter text to analyze sentiment. Type 'quit' to exit. -================================================== -Enter text to analyze (or 'quit' to exit): I am happy - -Processing text: "I am happy" -Tokenization: i am happy -Padding: Added 47 padding tokens - -Analyzing sentiment... - -Result: -Sentiment: Positive -Confidence: 76.67% -================================================== -Enter text to analyze (or 'quit' to exit): I am sad - -Processing text: "I am sad" -Tokenization: i am sad -Padding: Added 47 padding tokens - -Analyzing sentiment... - -Result: -Sentiment: Negative -Confidence: 63.98% -================================================== -Enter text to analyze (or 'quit' to exit): quit +You played: rock + + + + + + .=##*++=-:. + :**-:-=++**+: + .=#+. :+#=. + :*%%#*++==+**-. + -*+::-+#%*+-. + :+*-. -*+- + -*+: -**: + .. .=*+. + .::. +Model thinks you played: rock (100.0%) + +Opponent played (hidden): + + + + + + ..:--*###**- + -#**--. .:+#*. . + .+#- +#+ + -*+. :+#- + .+#+=**###+-. . + -##=:. . + . .+*: + .-** + . 
:== +Model thinks opponent played: paper (100.0%) + +😅 You lose! +-------------------------------------------------- +Your move> paper + +You played: paper + + + + + + .--:. + .=*+++***+=: + :++. :+*- + -+- .-+- + .=*-.. .=+=. + :**+++**+++- + -*- + .++: + :+- +Model thinks you played: paper (100.0%) + +Opponent played (hidden): + + + . + + + .:::::-:::. + .+*=======+*= + .**. +*- . + . .=+. :++: + .=*#*###**+=: + .=+- :=+-. + .=*: .-+=: + . -#-. :=*= + :*: .-+- +Model thinks opponent played: rock (100.0%) + +🎉 You win! +-------------------------------------------------- +Your move> ``` -Do not forget to type 'quit' once you are done testing the model. You are now ready to optimize and convert the model using ExecuTorch. \ No newline at end of file +Type `quit` to exit the game. In the next chapter, you'll prepare the model to run on the FVP. \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/fvp-3.md b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/fvp-3.md index 0b0a9f1ac2..b26333edb0 100644 --- a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/fvp-3.md +++ b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/fvp-3.md @@ -1,66 +1,84 @@ --- -title: Run the model on Corstone-320 FVP +title: Run the model on Corstone-320 FVP weight: 4 ### FIXED, DO NOT MODIFY layout: learningpathall --- -TODO: Ask Annie to try from her end +This section guides you through the process of compiling your trained Rock-Paper-Scissors model and running it on a simulated Arm-based edge device, the Corstone-320 Fixed Virtual Platform (FVP). This final step demonstrates the end-to-end workflow of deploying a TinyML model for on-device inference. + ## Compile and build the executable -Start by setting some environment variables that are used by ExecuTorch. +First, you'll use the Ahead-of-Time (AOT) Arm compiler to convert your PyTorch model into a format optimized for the Arm architecture and the Ethos-U NPU. This process, known as delegation, offloads parts of the neural network graph that are compatible with the NPU, allowing for highly efficient inference. + +Set up your environment variables by running the following commands in your terminal: ```bash export ET_HOME=$HOME/executorch export executorch_DIR=$ET_HOME/build ``` - -Then, generate a `.pte` file using the Arm examples. The Ahead-of-Time (AoT) Arm compiler will enable optimizations for edge devices like the Raspberry Pi and the Corstone-320 FVP. Run it from the ExecuTorch root directory. - -Navigate to the root directory using: +Use the AOT Arm compiler to generate the optimized `.pte` file. This command delegates the model to the Ethos-U85 NPU, applies quantization to reduce model size and improve performance, and specifies the memory configuration. Run it from the ExecuTorch root directory. ```bash -cd ../../ +cd $ET_HOME +python -m examples.arm.aot_arm_compiler --model_name=examples/arm/rps_tiny.py \ +--delegate --quantize --target=ethos-u85-128 \ +--system_config=Ethos_U85_SYS_DRAM_Mid --memory_mode=Dedicated_Sram ``` -You are now in $HOME/executorch and ready to create the model file for ExecuTorch. 
-```bash -cd $ET_HOME -python -m examples.arm.aot_arm_compiler --model_name=examples/arm/tiny_sentiment.py \ ---delegate --quantize --target=ethos-u85-256 \ ---so_library=cmake-out-aot-lib/kernels/quantized/libquantized_ops_aot_lib.so \ ---system_config=Ethos_U85_SYS_DRAM_Mid --memory_mode=Sram_Only +You should see: + +```output +PTE file saved as rps_tiny_arm_delegate_ethos-u85-128.pte ``` -From the Arm Examples directory, you build an embedded Arm runner with the `.pte` included. This allows you to get the most performance out of your model, and ensures compatibility with the CPU kernels on the FVP. Finally, generate the executable `arm_executor_runner`. +Next, you'll build the **Ethos-U runner**, which is a bare-metal executable that includes the ExecuTorch runtime and your compiled model. This runner is what the FVP will execute. Navigate to the runner's directory and use CMake to configure the build. ```bash cd $HOME/executorch/examples/arm/executor_runner - cmake -DCMAKE_BUILD_TYPE=Release \ --DCMAKE_TOOLCHAIN_FILE=$ET_HOME/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake \ --DTARGET_CPU=cortex-m85 \ --DET_DIR_PATH:PATH=$ET_HOME/ \ --DET_BUILD_DIR_PATH:PATH=$ET_HOME/cmake-out \ --DET_PTE_FILE_PATH:PATH=$ET_HOME/tiny_sentiment_arm_delegate_ethos-u85-256.pte \ --DETHOS_SDK_PATH:PATH=$ET_HOME/examples/arm/ethos-u-scratch/ethos-u \ --DETHOSU_TARGET_NPU_CONFIG=ethos-u85-256 \ --DPYTHON_EXECUTABLE=$HOME/executorch-venv/bin/python3 \ --DSYSTEM_CONFIG=Ethos_U85_SYS_DRAM_Mid \ --B $ET_HOME/examples/arm/executor_runner/cmake-out - -cmake --build $ET_HOME/examples/arm/executor_runner/cmake-out --parallel -- arm_executor_runner + -S "$ET_HOME/examples/arm/executor_runner" \ + -B "$ET_HOME/examples/arm/executor_runner/cmake-out" \ + -DCMAKE_TOOLCHAIN_FILE="$ET_HOME/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake" \ + -DTARGET_CPU=cortex-m85 \ + -DET_DIR_PATH="$ET_HOME" \ + -DET_BUILD_DIR_PATH="$ET_HOME/arm_test/cmake-out" \ + -DET_PTE_FILE_PATH="$ET_HOME/rps_tiny_arm_delegate_ethos-u85-128.pte" \ + -DETHOS_SDK_PATH="$ET_HOME/examples/arm/ethos-u-scratch/ethos-u" \ + -DETHOSU_TARGET_NPU_CONFIG=ethos-u85-128 \ + -DSYSTEM_CONFIG=Ethos_U85_SYS_DRAM_Mid +``` + +You should see output similar to this, indicating a successful configuration: +```bash +-- ******************************************************* +-- PROJECT_NAME : ethos-u-corstone-320 +-- TR_ARENA_SIZE : +-- MESSAGE_HANDLER_ARENA_SIZE : +-- ******************************************************* +-- ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE = 0x200000 +-- ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE = +-- Configuring done (17.1s) +-- Generating done (0.2s) +-- Build files have been written to: ~/executorch/examples/arm/executor_runner/cmake-out ``` -Run the model on the Corstone-320 with the following command: +Now, build the executable with CMake: + +```bash +cmake --build "$ET_HOME/examples/arm/executor_runner/cmake-out" -j --target arm_executor_runner +``` + +### Run the Model on the FVP +With the `arm_executor_runner` executable ready, you can now run it on the Corstone-320 FVP to see the model on a simulated Arm device. 
```bash FVP_Corstone_SSE-320 \ --C mps4_board.subsystem.ethosu.num_macs=256 \ +-C mps4_board.subsystem.ethosu.num_macs=128 \ -C mps4_board.visualisation.disable-visualisation=1 \ -C vis_hdlcd.disable_visualisation=1 \ -C mps4_board.telnetterminal0.start_telnet=0 \ @@ -70,37 +88,30 @@ FVP_Corstone_SSE-320 \ ``` {{% notice Note %}} - The argument `mps4_board.visualisation.disable-visualisation=1` disables the FVP GUI. This can speed up launch time for the FVP. - {{% /notice %}} -Observe that the FVP loads the model file. +Observe the output from the FVP. You'll see messages indicating that the model file has been loaded and the inference is running. This confirms that your ExecuTorch program is successfully executing on the simulated Arm hardware. + ```output telnetterminal0: Listening for serial connection on port 5000 telnetterminal1: Listening for serial connection on port 5001 telnetterminal2: Listening for serial connection on port 5002 telnetterminal5: Listening for serial connection on port 5003 -I [executorch:arm_executor_runner.cpp:412] Model in 0x70000000 $ -I [executorch:arm_executor_runner.cpp:414] Model PTE file loaded. Size: 3360 bytes. +I [executorch:arm_executor_runner.cpp:489 main()] PTE in 0x70000000 $ Size: 433968 bytes +I [executorch:arm_executor_runner.cpp:514 main()] PTE Model data loaded. Size: 433968 bytes. +I [executorch:arm_executor_runner.cpp:527 main()] Model buffer loaded, has 1 methods +I [executorch:arm_executor_runner.cpp:535 main()] Running method forward +I [executorch:arm_executor_runner.cpp:546 main()] Setup Method allocator pool. Size: 62914560 bytes. +I [executorch:arm_executor_runner.cpp:563 main()] Setting up planned buffer 0, size 3920. +I [executorch:EthosUBackend.cpp:116 init()] data:0x70000070 ``` -You can now test the model. - -## Test the Model -Test the model with your own inputs with the following command: - - -TODO: Add commands - -```bash - -``` - - -You've successfully trained and tested a CNN model for sentiment analysis on Arm hardware using Executorch. +{{% notice Note %}} +The inference itself may take a longer to run with a model this size - note that this is not a reflection of actual execution time. +{{% /notice %}} -Experiment with different inputs and data samples. This hands-on course showcases the power of TinyML and NLP on resource-constrained devices. +You've now successfully built, optimized, and deployed a computer vision model on a simulated Arm-based system. This hands-on exercise demonstrates the power and practicality of TinyML and ExecuTorch for resource-constrained devices. -In the next Learning Path, we would compare different model performances and inference times, before and after optimization using ExecuTorch. We would also analyze CPU and memory usage during inference. +In a future learning path, you can explore comparing different model performances and inference times before and after optimization. You could also analyze CPU and memory usage during inference, providing a deeper understanding of how the ExecuTorch framework optimizes your model for edge deployment. 
\ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/image.png b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/image.png new file mode 100644 index 0000000000..b548f79463 Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/image.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr/_index.md b/content/learning-paths/embedded-and-microcontrollers/zephyr/_index.md index d1b21d9810..83505cecbd 100644 --- a/content/learning-paths/embedded-and-microcontrollers/zephyr/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/zephyr/_index.md @@ -19,7 +19,7 @@ author: Pareena Verma test_images: - amd64/ubuntu:latest test_link: null -test_maintenance: true +test_maintenance: false ### Tags skilllevels: Introductory diff --git a/content/learning-paths/laptops-and-desktops/_index.md b/content/learning-paths/laptops-and-desktops/_index.md index 25ff68127d..b0c3d43298 100644 --- a/content/learning-paths/laptops-and-desktops/_index.md +++ b/content/learning-paths/laptops-and-desktops/_index.md @@ -9,13 +9,13 @@ maintopic: true operatingsystems_filter: - Android: 2 - ChromeOS: 2 -- Linux: 33 +- Linux: 34 - macOS: 9 -- Windows: 44 +- Windows: 45 subjects_filter: - CI-CD: 5 - Containers and Virtualization: 7 -- Migration to Arm: 28 +- Migration to Arm: 29 - ML: 2 - Performance and Architecture: 27 subtitle: Create and migrate apps for power efficient performance @@ -28,6 +28,7 @@ tools_software_languages_filter: - Arm Performance Libraries: 2 - Arm64EC: 1 - Assembly: 1 +- Bash: 1 - C: 8 - C#: 6 - C++: 11 @@ -48,6 +49,7 @@ tools_software_languages_filter: - Intrinsics: 1 - JavaScript: 2 - Kubernetes: 1 +- KVM: 1 - Linux: 1 - LLM: 1 - LLVM: 2 @@ -61,7 +63,9 @@ tools_software_languages_filter: - OpenCV: 1 - perf: 4 - Python: 6 +- QEMU: 1 - Qt: 2 +- RDP: 1 - Remote.It: 1 - RME: 1 - Runbook: 18 diff --git a/content/learning-paths/laptops-and-desktops/win11-vm-automation/_index.md b/content/learning-paths/laptops-and-desktops/win11-vm-automation/_index.md new file mode 100644 index 0000000000..e6ca39d6f4 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win11-vm-automation/_index.md @@ -0,0 +1,53 @@ +--- +title: Windows on Arm virtual machine creation using Arm Linux, QEMU, and KVM + +draft: true +cascade: + draft: true + +minutes_to_complete: 90 + +who_is_this_for: This is for developers and system administrators who want to automate Windows on Arm virtual machine (VM) creation on Arm Linux systems using QEMU and KVM. + +learning_objectives: + - Understand the process of creating Windows on Arm virtual machine using Bash scripts. + - Run scripts for VM creation and management. + - Troubleshoot common VM setup and runtime issues. + - Use Windows on Arm virtual machines for software development and testing. + +prerequisites: + - An Arm Linux system with KVM support and a minimum of 8GB RAM and 50GB free disk space. 
+ +author: Jason Andrews + +### Tags +skilllevels: Introductory +subjects: Migration to Arm +armips: + - Neoverse + - Cortex-A +operatingsystems: + - Linux + - Windows +tools_software_languages: + - QEMU + - KVM + - Bash + - RDP + +further_reading: + - resource: + title: Linaro Wiki - Windows on Arm + link: https://wiki.linaro.org/LEG/Engineering/Kernel/WindowsOnArm + type: documentation + - resource: + title: Botspot Virtual Machine (BVM) Project + link: https://github.com/Botspot/bvm + type: website + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/laptops-and-desktops/win11-vm-automation/_next-steps.md b/content/learning-paths/laptops-and-desktops/win11-vm-automation/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win11-vm-automation/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/laptops-and-desktops/win11-vm-automation/images/win11arm.png b/content/learning-paths/laptops-and-desktops/win11-vm-automation/images/win11arm.png new file mode 100644 index 0000000000..4f31c8b4f5 Binary files /dev/null and b/content/learning-paths/laptops-and-desktops/win11-vm-automation/images/win11arm.png differ diff --git a/content/learning-paths/laptops-and-desktops/win11-vm-automation/prerequisites-1.md b/content/learning-paths/laptops-and-desktops/win11-vm-automation/prerequisites-1.md new file mode 100644 index 0000000000..47b42a4298 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win11-vm-automation/prerequisites-1.md @@ -0,0 +1,80 @@ +--- +title: System requirements +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +If you are building and testing Windows on Arm software you have a variety of options to run Windows on Arm. You can use local laptops, cloud virtual machines, and CI/CD platforms like GitHub Actions for development tasks. + +You can also use a local Arm Linux server to create virtual machines for Windows on Arm software development tasks. This Learning Path explains how to install and use Windows on Arm virtual machines on an Arm Linux system. Two scripts are provided to create and run Windows on Arm virtual machines to make the process easy. + +Before creating a Windows on Arm virtual machine, ensure your Arm Linux system meets the hardware and software requirements. This section covers everything you need to prepare to create a Windows on Arm virtual machine using QEMU and KVM. + +## Hardware requirements + +You need an Arm Linux system with enough performance, memory, and storage to run a Windows on Arm virtual machine. 
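+
+You can get a quick view of the cores, memory, and disk space available on your system with standard Linux commands (these are not part of the provided scripts):
+
+```console
+nproc
+free -g
+df -h $HOME
+```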
+ +The provided scripts have been tested on a [Thelio Astra](https://system76.com/desktops/thelio-astra-a1.1-n1/configure?srsltid=AfmBOoplXbwXifyxppxFe_oyahYMJHUT0bp2BnIBSH5ADjqgZxB7wW75) running Ubuntu 24.04. + +Thelio Astra is an Arm-based desktop computer designed by System76 for autonomous vehicle development and other general-purpose Arm software development. It uses the Ampere Altra processor, which is based on the Arm Neoverse N1 CPU, and ships with the Ubuntu operating system. + +Other Arm Linux systems and other Linux distributions are possible, but have not been tested. General hardware requirements are listed below. + +The minimum hardware requirements for the Arm Linux system are: + +- 8 cores with hardware virtualization support +- 8 GB RAM +- 50 GB free disk space + +The scripts automatically allocate resources as listed below, but the details can be customized for your system. + +- CPU: half of available cores (minimum 4 cores) +- Memory: half of available RAM (minimum 4 GB) +- Disk: 40 GB VM disk + +## KVM support + +Kernel-based Virtual Machine (KVM) support is required for hardware-accelerated virtualization and good VM performance. + +KVM is a virtualization infrastructure built into the Linux kernel that allows you to run virtual machines with near-native performance. It leverages Arm's hardware virtualization extensions to provide efficient CPU virtualization, while QEMU handles device emulation and management. Without KVM, virtual machines run much slower using software emulation. + +Verify your system supports KVM by running: + +```console +sudo apt install cpu-checker -y +sudo kvm-ok +``` + +If KVM is available, you will see the messages: + +```output +INFO: /dev/kvm exists +KVM acceleration can be used +``` + +This confirms that: +- Your CPU supports hardware virtualization +- The KVM kernel module is loaded +- The `/dev/kvm` device exists + +## Required software + +The scripts require several software packages. + +Install the packages using the Linux package manager. + +```console +sudo apt update +sudo apt install qemu-system-arm qemu-utils genisoimage wget curl jq uuid-runtime -y +``` + +If needed, the [Remmina](https://remmina.org/) remote desktop (RDP) client is automatically installed by the run script so you don't need to install it now, but you can install it using the command below. + +```console +sudo apt install remmina remmina-plugin-rdp -y +``` + +Proceed to the next section to learn about the scripts. + diff --git a/content/learning-paths/laptops-and-desktops/win11-vm-automation/understanding-scripts-2.md b/content/learning-paths/laptops-and-desktops/win11-vm-automation/understanding-scripts-2.md new file mode 100644 index 0000000000..b6103b6733 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win11-vm-automation/understanding-scripts-2.md @@ -0,0 +1,100 @@ +--- +title: Understanding the virtual machine scripts + +weight: 3 + +layout: "learningpathall" +--- + +A GitHub project provides two Bash scripts. Understanding their architecture and design will help you use them effectively and enable you to customize the options for your specific needs. + +Start by cloning the project repository from GitHub to your Arm Linux system. + +```bash +git clone https://github.com/jasonrandrews/win11arm.git +cd win11arm +``` + +The remainder of this section explains the structure of the scripts, and the next section provides details to run the scripts to create a Windows virtual machine. + +## Project overview + +The project includes two Bash scripts. 
+ +- VM create script: `create-win11-vm.sh` handles all VM creation tasks +- VM run script: `run-win11-vm.sh` manages VM execution and connectivity + +All configuration is available using command-line options. + +The VM create script also allows you to perform the entire VM creation with a single command or run each individual step to learn and monitor the process. + +This modular approach allows you to understand each component while maintaining the simplicity of automated execution. + +## Virtual machine creation + +The creation script, `create-win11-vm.sh` is responsible for building a complete Windows 11 on Arm VM from scratch. It handles everything from directory setup to Windows installation, with each step clearly defined and independently executable. + +The script handles resource detection and allocation, provides unattended Windows installation, and has a flexible command line to change default values. + +Virtual machine creation includes the following steps: + +- Download the Windows 11 for Arm ISO from Microsoft +- Configure VirtIO drivers for optimal performance +- Set up automated installation with custom credentials +- Create optimized disk images + +### Virtual machine creation details + +The `create-win11-vm.sh` script implements a four-step process that builds a Windows VM incrementally: + +### Step 1: Create VM directory + +Step 1 initializes the VM directory structure and configuration. It creates the VM directory, copies initial configuration files, and sets up the basic environment. As a result, the VM directory, configuration files, and connection profiles are created. + +### Step 2: Download Windows + +Step 2 downloads the Windows 11 ISO and VirtIO drivers. It downloads the Windows 11 Arm ISO from Microsoft, fetches VirtIO drivers, and prepares unattended installation files. The files created during this step include `installer.iso`, `virtio-win.iso`, and the unattended installation directory. This step takes some time as the Windows ISO download is large, but if you already have the file the script will save time and not repeat the download. + +### Step 3: Prepare VM disk image + +Step 3 creates the VM disk image and finalizes the installation setup. It builds the unattended installation ISO, creates the main VM disk image, and configures all installation media. The files created during this step include `disk.qcow2` and `unattended.iso`. + +{{% notice Note %}} +The product key used in the scripts is a generic key provided by Microsoft, which allows installation. This key is for testing purposes only and does not activate Windows. If you plan to continue using Windows beyond installation, you should replace it with a genuine product key. +{{% /notice %}} + +### Step 4: First Windows boot + +Step 4 executes the Windows installation. It boots the VM with installation media, runs the automated Windows setup, and completes the initial configuration. The result is a fully installed and configured Windows on Arm VM. + +Each step builds on the previous one, and you can run them individually for debugging or customization purposes. + +## Virtual machine execution + +The `run-win11-vm.sh` script runs virtual machines by managing their execution and connectivity. + +The script begins by checking if the VM is already active by validating QEMU processes and PID files. If the VM is running, it skips to establishing an RDP connection; otherwise, it proceeds to start the VM. 
+ +Next, the script launches the VM in headless mode, optimized for RDP access, by configuring QEMU with a headless display, setting up port forwarding, and starting the VM as a background daemon process. + +Once the VM is running, the script waits for the RDP service to become available, configures the Remmina client, and establishes a desktop connection. + +This process ensures seamless access to the VM with proper display scaling and input handling. + +## Automatic resource detection and allocation + +The scripts try to manage resources based on your system. + +For CPU allocation, `/proc/cpuinfo` is used to determine the total number of CPU cores and use half of the available cores for the VM. A minimum of 2 cores for creation and 4 cores for runtime are required. + +For memory allocation, `/proc/meminfo` is used to determine total system RAM and allocate half of the available memory for the VM. A minimum of 2GB is required and memory usage is based on system capacity, with an option to override using a command line parameter. + +For storage, the default VM disk size is 40GB in QCOW2 format. The available disk space is validated before creation. + +All settings are customizable using command line arguments. + +## Script Integration and Workflow + +The create and run scripts share the same configuration files. Separating creation from execution enables you to create a VM once and then use the run script repeatedly. + +The next section explains how to create and run a Windows on Arm virtual machine. \ No newline at end of file diff --git a/content/learning-paths/laptops-and-desktops/win11-vm-automation/vm-creation-3.md b/content/learning-paths/laptops-and-desktops/win11-vm-automation/vm-creation-3.md new file mode 100644 index 0000000000..e5572a56ed --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win11-vm-automation/vm-creation-3.md @@ -0,0 +1,262 @@ +--- +title: "Create a Windows on Arm virtual machine" +weight: 4 +layout: "learningpathall" +--- + +## What is the fastest way to create a new Windows on Arm virtual machine? + +The virtual machine creation script creates a complete Windows 11 on Arm virtual machine with the `all` option. The default values are used for all configurable parameters. The location to store the VM files is also provided as an argument. + +To create a new VM, run the command: + +```console +./create-win11-vm.sh all $HOME/win11-vm +``` + +This single command executes all required virtual machine creation steps as explained in the previous section. + +The VM data is stored in the `$HOME/win11-vm` directory, and Windows will install automatically without any user intervention. + +Once the VM creation is complete, you'll see: + +```output +QEMU closed successfully. +Windows installation should be complete! +You can now use: ./run-win11-vm.sh $HOME/win11-vm +``` + +Your Windows on Arm VM is now ready to use. You can proceed to the next section to run the VM or continue on this page to review additional information about modifying default values, running the individual steps of VM creation, and fixing common errors. + +## Configuration options + +The creation script supports several options to customize your virtual machine setup. + +For example, you can change the Windows user, password, and disk size using the arguments shown below. + +```console +./create-win11-vm.sh all $HOME/win11-vm --username MyUser --password MyPass --disksize 60 +``` + +The table below lists the configuration options and default values. 
+
+| Flag | Description | Default Value | Example |
+|------|-------------|---------------|---------|
+| `--username <name>` | Windows user name | `win11arm` | `--username Admin` |
+| `--password <password>` | Windows user password | `win11arm` | `--password MySecurePass` |
+| `--disksize <size>` | Disk size in GB | `40` | `--disksize 60` |
+| `--rdp-port <port>` | RDP port for remote connections | `3389` | `--rdp-port 3390` |
+| `--language <language>` | Windows language | `"English (United States)"` | `--language "English International"` |
+| `--vm-mem <GB>` | VM memory in GB | half of system RAM | `--vm-mem 8` |
+
+
+### Disk space requirements
+
+The creation script checks available disk space before starting.
+
+An estimate of required disk space is shown in the table below.
+
+| Component | Size | Description |
+|-----------|------|-------------|
+| Windows 11 ISO | ~5GB | Downloaded from Microsoft |
+| VirtIO drivers | ~500MB | Performance drivers |
+| VM disk image | Variable | Default is 40 GB |
+| Temporary files | ~1GB | Installation workspace |
+| Total needed | ~7GB + disk size | Example: 47GB for default 40GB disk |
+
+### Configuration examples
+
+Create a VM with custom disk size and network port:
+
+```console
+./create-win11-vm.sh all $HOME/win11-vm --disksize 60 --rdp-port 3390 --username Admin
+```
+
+Set up a VM with English International language:
+
+```console
+./create-win11-vm.sh all $HOME/win11-vm --language "English International"
+```
+
+## Alternative four-step creation process
+
+The VM creation process consists of four distinct steps that can be run individually. Understanding each step helps with troubleshooting and customization.
+
+### Step 1: Create VM directory structure
+
+```console
+./create-win11-vm.sh create $HOME/win11-vm
+```
+
+Command summary:
+- Creates the VM directory at the specified path
+- Sets up the initial directory structure for VM files
+- Creates a `vm-config.txt` file with your configuration settings for reference
+- Copies the Remmina connection template if available
+
+Files created:
+- `vm-config.txt` - Configuration reference file
+- `connect.remmina` - RDP connection template (if available)
+
+Each VM stores its configuration in `vm-config.txt`:
+
+```bash
+# VM Configuration (for reference)
+# Generated by create-win11-vm.sh v2.0.0
+VM_PATH=$HOME/win11-vm
+USERNAME=win11arm
+PASSWORD=win11arm
+DISKSIZE=40
+RDP_PORT=3389
+LANGUAGE=English (United States)
+VM_MEM=8
+CREATED=Thu Aug 28 10:30:45 UTC 2025
+```
+
+This step is lightweight and completes quickly. It establishes the workspace where all VM files will be stored.
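+
+If you want to confirm what this step produced before continuing, you can list the VM directory and print the configuration file. The exact set of files can vary slightly with the script version:
+
+```console
+ls -l $HOME/win11-vm
+cat $HOME/win11-vm/vm-config.txt
+```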
+ +### Step 2: Download Windows 11 and drivers + +```console +./create-win11-vm.sh download $HOME/win11-vm +``` + +Command summary: +- Downloads the Windows 11 Arm64 ISO directly from Microsoft's servers +- Patches the ISO to boot automatically without requiring a keypress +- Downloads VirtIO drivers for optimal VM performance +- Extracts and organizes drivers for the unattended installation +- Creates unattended installation configuration files +- Sets up the autounattend.xml with your specified username, password, and language + +Files created: +- `installer.iso` - Windows 11 Arm64 installation media +- `unattended/` directory - Contains drivers and installation automation files +- `unattended/autounattend.xml` - Windows unattended installation configuration +- `unattended/firstlogin.ps1` - Post-installation script + +If you already downloaded the Windows 11 installer ISO, you can copy it to your VM directory as `installer.iso` before running this step. The script will detect the existing file and ask if you want to use it or download a fresh copy: + +```output +installer.iso already exists. Delete it and download a fresh copy? [Y/n] +``` + +Choosing 'n' will skip the download and use your existing ISO, saving significant time and bandwidth. + +Download Process Details: +The script uses an automated process to download Windows 11 from Microsoft's official servers: + +1. Parse Microsoft's download page - Extracts product edition information +2. Get language SKU ID - Identifies the correct language variant +3. Obtain download link - Retrieves the direct download URL for Arm64 +4. Download and verify - Downloads the ISO and verifies its integrity + +### Step 3: Prepare VM disk + +```console +./create-win11-vm.sh prepare $HOME/win11-vm +``` + +Command summary: +- Creates the `unattended.iso` containing drivers and installation files +- Sets up the main VM hard drive as a QCOW2 disk image +- Allocates the specified disk space with optimized settings +- Prepares all components needed for the automated installation + +Files created: +- `unattended.iso` - ISO containing drivers and automation scripts +- `disk.qcow2` - Main VM hard drive (empty, ready for Windows installation) + +Disk Creation Details: +The script creates a QCOW2 disk image with these optimizations: +- Cluster size: 2 MB for better performance +- No copy-on-write: Disabled for improved I/O performance +- Metadata preallocation: Reduces fragmentation during VM operation + +Important Note: If `disk.qcow2` already exists, the script will warn you that proceeding will delete the existing VM's hard drive and start over with a clean installation. 
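+
+For reference, the disk settings described above map to `qemu-img` options along these lines. This is an illustrative command, not the script's exact invocation:
+
+```console
+qemu-img create -f qcow2 -o cluster_size=2M,preallocation=metadata,nocow=on $HOME/win11-vm/disk.qcow2 40G
+```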
+ +### Step 4: First boot and Windows installation + +```console +./create-win11-vm.sh firstboot $HOME/win11-vm +``` + +Command summary: +- Launches QEMU with the Windows installer +- Boots from the Windows 11 ISO with unattended installation +- Automatically installs Windows with your specified settings +- Installs VirtIO drivers for optimal performance +- Configures the user account and system settings +- Completes the entire Windows setup process without user intervention + +System Requirements Check: +Before starting, the script verifies: +- Desktop environment is available (DISPLAY or WAYLAND_DISPLAY) +- All required files exist (installer.iso, unattended.iso, disk.qcow2) +- Sufficient system resources are available + +Automatic Resource Allocation: +If you don't specify `--vm-mem`, the script automatically allocates: +- Memory: Half of your system's total RAM (minimum 2GB) +- CPU cores: Half of your system's total cores (minimum 2 cores) + +For example, on a system with 16GB RAM and 8 CPU cores: +- VM gets 8GB RAM and 4 CPU cores +- Host system retains 8GB RAM and 4 CPU cores for other tasks + +The script launches QEMU with these settings: +- Machine type: `virt` with KVM acceleration +- CPU: Host CPU passthrough for best performance +- Graphics: RamFB with GTK display and OpenGL acceleration +- Input: USB keyboard and tablet for proper mouse integration +- Network: User-mode networking with virtio-net for performance +- Storage: VirtIO block device with optimized caching +- Random number generator: Hardware entropy for security + +The installation process performs the following steps: +1. UEFI boot - VM starts with UEFI firmware +2. Windows installer loads - Boots from installer.iso +3. Unattended installation begins - Uses autounattend.xml configuration +4. Driver installation - VirtIO drivers installed automatically +5. User account creation - Your specified username and password +6. System configuration - Language, region, and basic settings +7. First login script - Runs firstlogin.ps1 for final setup + +The entire installation process typically takes 20-30 minutes depending on your system's performance. + +## Troubleshooting common problems + +### Insufficient disk space + +If you see an error about insufficient disk space: +```output +Error: Insufficient free disk space. 40 GB is needed, but you only have 25 GB. +``` + +Use the following options to correct the error: +- Free up disk space on your system +- Use a smaller disk size: `--disksize 30` +- Choose a different location with more space + +### Download failures + +If Windows ISO download fails: + +```output +Error: Failed to download Windows 11 installer.iso from Microsoft +``` + +Use the following options to correct the error: +- Check your internet connection +- Try again later (Microsoft can block frequent automated downloads) +- Manually download the ISO from Microsoft's website and save it as `installer.iso` + +### Memory allocation issues + +If the VM fails to start due to memory issues: + +Use the following options to correct the error: +- Reduce VM memory: `--vm-mem 4` +- Close other applications to free system memory + +You now have a good understanding of virtual machine creation. The next section will cover how to run and connect to your VM using the run script. 
\ No newline at end of file diff --git a/content/learning-paths/laptops-and-desktops/win11-vm-automation/vm-execution-4.md b/content/learning-paths/laptops-and-desktops/win11-vm-automation/vm-execution-4.md new file mode 100644 index 0000000000..e05656d100 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win11-vm-automation/vm-execution-4.md @@ -0,0 +1,282 @@ +--- +title: "Run a Windows on Arm virtual machine" +weight: 5 +layout: "learningpathall" +--- + +## Basic VM launch command + +After your Windows 11 Arm VM is created, launching it is simple with the unified run script: + +```console +./run-win11-vm.sh $HOME/win11-vm +``` + +This single command handles the entire VM startup and connection process automatically. The script performs three key steps: checks if the VM is already running, starts it in headless mode if needed, and connects you via RDP using Remmina. + +When the virtual machine starts you will see it on your Linux desktop: + +![Windows on Arm VM](./images/win11arm.png) + +## What does the run script do? + +Understanding the run script flow helps you troubleshoot issues and customize the VM runtime behavior. + +### Step 1: Check if VM is already running + +The script first checks if your VM is already running to avoid conflicts. + +Here is a fragment of the code: + +```bash +# Check for existing VM process +if [ -f "$vm_path/qemu.pid" ]; then + local vm_pid=$(cat "$vm_path/qemu.pid" 2>/dev/null) + if process_exists "$vm_pid"; then + status "VM is already running (PID: $vm_pid)" + fi +fi +``` + +The run script looks for the `qemu.pid` file in your VM directory, verifies the process ID is still active, cleans up stale PID files from previous sessions, and skips VM startup if already running. + +If this happens you will see output similar to: + +```output +VM is already running (PID: 12345) +Waiting for RDP service on port 3389... +``` + +### Step 2: Start VM in headless mode + +If the VM isn't running, the script starts it in headless mode (no GUI window) using QEMU. + +The arguments to QEMU are shown below: + +```bash +qemu-system-aarch64 \ + -M virt,accel=kvm \ + -cpu host \ + -m ${vm_mem}G \ + -smp $num_cores \ + -name "Windows on Arm" \ + -pidfile "$vm_path/qemu.pid" \ + -display none \ + -netdev user,id=nic,hostfwd=tcp:127.0.0.1:${rdp_port}-:3389 \ + -device virtio-net-pci,netdev=nic \ + -bios /usr/share/qemu-efi-aarch64/QEMU_EFI.fd \ + -drive file="$vm_path/disk.qcow2",if=virtio,discard=unmap,aio=threads,cache=none \ + -daemonize +``` + +The important arguments to QEMU are: +- `-M virt,accel=kvm` - Uses ARM virtualization with KVM acceleration +- `-cpu host` - Passes through your host CPU features for best performance +- `-display none` - Runs headless (no QEMU window) +- `-daemonize` - Runs QEMU as a background daemon +- `-netdev user,hostfwd=...` - Sets up port forwarding for RDP access +- `-pidfile` - Creates a PID file for process management + +The script automatically detects and allocates CPU and memory resources. + +The code is shown below: + +```bash +# Memory: Half of available RAM (minimum 2GB) +local total_ram_gb=$(awk '/MemTotal/ {print int($2/1048576)}' /proc/meminfo) +local vm_mem=$((total_ram_gb / 2)) +[ "$vm_mem" -lt 2 ] && vm_mem=2 + +# CPU: Half of available cores (minimum 4) +local total_cores=$(grep -c ^processor /proc/cpuinfo) +local num_cores=$((total_cores / 2)) +[ "$num_cores" -lt 4 ] && num_cores=4 +``` + +When the run script executes, you will see the CPU and RAM allocated: + +```output +Starting Windows VM in headless mode... 
+Using 8GB RAM and 4 CPU cores +VM started successfully +``` + +### Step 3: Connect via RDP + +Once the VM is running, the script waits for the RDP service and connects automatically. + +Here is the function which waits for the port to be ready: + +```bash +# Wait for RDP service to be available +wait_for_rdp() { + local port="$1" + local max_attempts=60 + + while [ $attempt -le $max_attempts ]; do + if timeout 3 bash -c "echo >/dev/tcp/localhost/$port" 2>/dev/null; then + return 0 + fi + sleep 2 + attempt=$((attempt + 1)) + done +} +``` + +Once the RDP service is ready, Remmina is started and connects. + +The related output is shown below: + +```output +Waiting for RDP service on port 3389... +RDP service is available! +Connecting to VM via RDP (localhost:3389)... +Username: win11arm +``` + +## Run script options and examples + +The run script supports several options for different use cases: + +### Custom RDP port + +```console +./run-win11-vm.sh /path/to/vm --rdp-port 3390 +``` +Uses a custom RDP port, useful when running multiple VMs or avoiding port conflicts. + +### Help information + +```console +./run-win11-vm.sh --help +``` +Displays usage information and all available options. + +## Remmina integration + +The script uses Remmina as the RDP client and creates a Remmina profile with the connection settings. + +The file name is `connect.remmina` and you can review and edit as needed. + +```ini +[remmina] +name=VM Connect +protocol=RDP +scale=2 +quality=9 +disable_fastpath=0 +glyph-cache=0 +multitransport=0 +relax-order-checks=1 +ignore-tls-errors=1 +cert_ignore=1 +window_width=1024 +window_height=768 +window_maximize=0 +disableautoreconnect=1 +viewmode=1 +network=lan #change viewmode=1 to viewmode=3 for fullscreen +sound=local #to get microphone input working, change to sound=remote, and USB passthrough your m +icrophone to the VM. +colordepth=63 +``` + + +## VM shutdown + +The preferred method is to shut down Windows normally from within the virtual machine. + +1. Click the Start button in Windows +2. Select Power → Shut down +3. Wait for Windows to complete shutdown +4. VM automatically stops when Windows finishes shutting down +5. Remmina exits automatically when the connection closes + +You should avoid killing QEMU directly as it may corrupt the VM disk as well as avoid exiting Remmina as it may leave the VM running in the background. + +## Runtime monitoring and management + +### Checking VM status + +To check if your VM is running without connecting: + +```console +# Check for VM process +ps aux | grep "Windows on Arm" + +# Check PID file +cat $HOME/win11-vm/qemu.pid + +# Test RDP connectivity +timeout 3 bash -c "echo >/dev/tcp/localhost/3389" +``` + +If the RDP connectivity fails the output is: + +```output +bash: connect: Connection refused +bash: line 1: /dev/tcp/localhost/3389: Connection refused +``` + +### Resource usage monitoring + +Monitor VM resource usage while running: + +```console +# CPU and memory usage +top -p $(cat $HOME/win11-vm/qemu.pid) + +# Detailed process information +ps -p $(cat $HOME/win11-vm/qemu.pid) -o pid,ppid,cmd,%cpu,%mem,etime +``` + +### Multiple VM management + +Running multiple VMs requires different RDP ports: + +```console +# First VM (default port 3389) +./run-win11-vm.sh $HOME/vm1 + +# Second VM (custom port 3390) +./run-win11-vm.sh $HOME/vm2 --rdp-port 3390 + +# Third VM (custom port 3391) +./run-win11-vm.sh $HOME/vm3 --rdp-port 3391 +``` + +Each VM needs its own directory and unique RDP port to avoid conflicts. 
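If you start the same group of VMs regularly, a short wrapper can keep the directory-to-port mapping consistent. The sketch below is not part of the provided scripts; it assumes the VM directories were already created with `create-win11-vm.sh` and simply calls `run-win11-vm.sh` with an incrementing RDP port for each one:

```bash
#!/usr/bin/env bash
# Launch several VMs, each with its own directory and unique RDP port.
port=3389
for vm_dir in "$HOME/vm1" "$HOME/vm2" "$HOME/vm3"; do
    if [ -d "$vm_dir" ]; then
        ./run-win11-vm.sh "$vm_dir" --rdp-port "$port" &
        port=$((port + 1))
    else
        echo "Skipping $vm_dir: directory not found"
    fi
done
wait
```

Each invocation opens its own RDP session, so shut each VM down from inside Windows, as described above, when you are finished.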
+ +## Troubleshooting runtime issues + +### RDP connection failures + +If RDP connection fails: + +```output +Error: RDP service did not become available after 120 seconds +``` + +Check VM is actually running: + +```console +ps aux | grep qemu-system-aarch64 +``` + +Verify RDP port: +```console +netstat -tlnp | grep 3389 +``` + +### Known Remmina crash issue + +When disconnecting from RDP, Remmina may crash with: + +```output +./run-win11-vm.sh: line 143: 60433 Aborted (core dumped) remmina -c "$remmina_file" $remmina_flags 2> /dev/null +RDP session ended +``` + +This is a known Remmina issue and does not affect VM functionality. + +You have learned how to create Windows on Arm virtual machines on an Arm Linux system with QEMU and KVM. You can use these virtual machines for software development and testing. You can speedup your development tasks by using an Arm Linux desktop or server with high processor count and plenty of RAM. \ No newline at end of file diff --git a/content/learning-paths/mobile-graphics-and-gaming/_index.md b/content/learning-paths/mobile-graphics-and-gaming/_index.md index aae0dcbb19..0ba3f637ac 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/_index.md @@ -9,7 +9,7 @@ key_ip: - Mali maintopic: true operatingsystems_filter: -- Android: 31 +- Android: 32 - Linux: 30 - macOS: 14 - Windows: 14 @@ -17,7 +17,7 @@ subjects_filter: - Gaming: 6 - Graphics: 6 - ML: 12 -- Performance and Architecture: 34 +- Performance and Architecture: 35 subtitle: Optimize Android apps and build faster games using cutting-edge Arm tech title: Mobile, Graphics, and Gaming tools_software_languages_filter: @@ -26,7 +26,7 @@ tools_software_languages_filter: - Android: 4 - Android NDK: 2 - Android SDK: 1 -- Android Studio: 10 +- Android Studio: 11 - Arm Development Studio: 1 - Arm Mobile Studio: 1 - Arm Performance Studio: 3 @@ -38,6 +38,7 @@ tools_software_languages_filter: - CCA: 1 - Clang: 12 - CMake: 1 +- Coding: 1 - Docker: 1 - ExecuTorch: 1 - Frame Advisor: 1 diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/01.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/01.png new file mode 100644 index 0000000000..98a272f84b Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/01.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/02.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/02.png new file mode 100644 index 0000000000..d0b8df7cb0 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/02.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/03.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/03.png new file mode 100644 index 0000000000..80e41973f2 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/03.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/04.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/04.png new file mode 100644 index 0000000000..d098da4e1a Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/04.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/05.png 
b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/05.png new file mode 100644 index 0000000000..8fa7609f69 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/05.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/06.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/06.png new file mode 100644 index 0000000000..a78e5ee6f7 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/06.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/07.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/07.png new file mode 100644 index 0000000000..5993f29b22 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/07.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/08.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/08.png new file mode 100644 index 0000000000..a01e883efc Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/08.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/09.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/09.png new file mode 100644 index 0000000000..64d714c262 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/09.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/10.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/10.png new file mode 100644 index 0000000000..571783c51e Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/10.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/_index.md b/content/learning-paths/mobile-graphics-and-gaming/android_halide/_index.md new file mode 100644 index 0000000000..b351d54846 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/android_halide/_index.md @@ -0,0 +1,56 @@ +--- +title: Halide Essentials From Basics to Android Integration + +draft: true +cascade: + draft: true + +minutes_to_complete: 180 + +who_is_this_for: This is an introductory topic for software developers interested in learning how to use Halide for image processing. + +learning_objectives: + - Understand foundational concepts of Halide and set up your development environment. + - Create a basic real-time image processing pipeline using Halide. + - Optimize image processing workflows by applying operation fusion in Halide. + - Integrate Halide pipelines into Android applications developed with Kotlin. 
+ +prerequisites: + - Basic C++ knowledge + - Android Studio with Android Emulator + +author: Dawid Borycki + +### Tags +skilllevels: Introductory +subjects: Performance and Architecture +armips: + - Cortex-A + - Cortex-X +operatingsystems: + - Android +tools_software_languages: + - Android Studio + - Coding + +further_reading: + - resource: + title: Halide 19.0.0 + link: https://halide-lang.org/docs/index.html + type: website + - resource: + title: Halide GitHub + link: https://github.com/halide/Halide + type: repository + - resource: + title: Halide Tutorials + link: https://halide-lang.org/tutorials/ + type: website + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/_next-steps.md b/content/learning-paths/mobile-graphics-and-gaming/android_halide/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/android_halide/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/android.md b/content/learning-paths/mobile-graphics-and-gaming/android_halide/android.md new file mode 100644 index 0000000000..3bb359a6fa --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/android_halide/android.md @@ -0,0 +1,419 @@ +--- +# User change +title: "Integrating Halide into an Android (Kotlin) Project" + +weight: 6 + +layout: "learningpathall" +--- + +## Objective +In this lesson, we’ll learn how to integrate a high-performance Halide image-processing pipeline into an Android application using Kotlin. + +## Overview of mobile integration with Halide +Android is the world’s most widely-used mobile operating system, powering billions of devices across diverse markets. This vast user base makes Android an ideal target platform for developers aiming to reach a broad audience, particularly in applications requiring sophisticated image and signal processing, such as augmented reality, photography, video editing, and real-time analytics. + +Kotlin, now the preferred programming language for Android development, combines concise syntax with robust language features, enabling developers to write maintainable, expressive, and safe code. It offers seamless interoperability with existing Java codebases and straightforward integration with native code via JNI, simplifying the development of performant mobile applications. + +## Benefits of using Halide on mobile +Integrating Halide into Android applications brings several key advantages: +1. Performance. 
Halide enables significant acceleration of complex image processing algorithms, often surpassing the speed of traditional Java or Kotlin implementations by leveraging optimized code generation. By generating highly optimized native code tailored for ARM CPUs or GPUs, Halide can dramatically increase frame rates and responsiveness, essential for real-time or interactive applications. +2. Efficiency. On mobile devices, resource efficiency translates directly to improved battery life and reduced thermal output. Halide’s scheduling strategies (such as operation fusion, tiling, parallelization, and vectorization) minimize unnecessary memory transfers, CPU usage, and GPU overhead. This optimization substantially reduces overall power consumption, extending battery life and enhancing the user experience by preventing overheating. +3. Portability. Halide abstracts hardware-specific details, allowing developers to write a single high-level pipeline that easily targets different processor architectures and hardware configurations. Pipelines can seamlessly run on various ARM-based CPUs and GPUs commonly found in Android smartphones and tablets, enabling developers to support a wide range of devices with minimal platform-specific modifications. +4. Custom Algorithm Integration. Halide allows developers to easily integrate their bespoke image-processing algorithms that may not be readily available or optimized in common libraries, providing full flexibility and control over application-specific performance and functionality. + +In short, Halide delivers high-performance image processing without sacrificing portability or efficiency, a balance particularly valuable on resource-constrained mobile devices. + +### Android development ecosystem and challenges +While Android presents abundant opportunities for developers, the mobile development ecosystem brings its own set of challenges, especially for performance-intensive applications: +1. Limited Hardware Resources. Unlike desktop or server environments, mobile devices have significant constraints on processing power, memory capacity, and battery life. Developers must optimize software meticulously to deliver smooth performance while carefully managing hardware resource consumption. Leveraging tools like Halide allows developers to overcome these constraints by optimizing computational workloads, making resource-intensive tasks feasible on constrained hardware. +2. Cross-Compilation Complexities. Developing native code for Android requires handling multiple hardware architectures (such as armv8-a, ARM64, and sometimes x86/x86_64). Cross-compilation introduces complexities due to different instruction sets, CPU features, and performance characteristics. Managing this complexity involves careful use of the Android NDK, understanding toolchains, and correctly configuring build systems (e.g., Gradle, CMake). Halide helps mitigate these issues by abstracting away many platform-specific optimizations, automatically generating code optimized for target architectures. +3. Image-Format Conversions (Bitmap ↔ Halide Buffer). Android typically handles images through the Bitmap class or similar platform-specific constructs, whereas Halide expects image data to be in raw, contiguous buffer formats. Developers must bridge the gap between Android-specific image representations (Bitmaps, YUV images from camera APIs, etc.) and Halide’s native buffer format. 
Proper management of these conversions—including considerations for pixel formats, stride alignment, and memory copying overhead—can significantly impact performance and correctness, necessitating careful design and efficient implementation of buffer-handling routines. + +## Project requirements +Before integrating Halide into your Android application, ensure you have the necessary tools and libraries. + +### Tools and prerequisites +1. Android Studio. [Download link](https://developer.android.com/studio). +2. Android NDK (Native Development Kit). Can be easily installed from Android Studio (Tools → SDK Manager → SDK Tools → Android NDK). + +## Setting up the Android project +### Creating the project +1. Open Android Studio. +2. Select New Project > Native C++. +![img4](Figures/04.png) + +### Configure the project +1. Set the project Name to Arm.Halide.AndroidDemo. +2. Choose Kotlin as the language. +3. Set Minimum SDK to API 24. +4. Click Next. +![img5](Figures/05.png) +5. Select C++17 from the C++ Standard dropdown list. +![img6](Figures/06.png) +6. Click Finish. + +## Configuring the Android project +Next, configure your Android project to use the files generated in the previous step. First, copy blur_threshold_android.a and blur_threshold_android.h into ArmHalideAndroidDemo/app/src/main/cpp. Ensure your cpp directory contains the following files: +* native-lib.cpp +* blur_threshold_android.a +* blur_threshold_android.h +* CMakeLists.txt + +Open CMakeLists.txt and modify it as follows (replace /path/to/halide with your Halide installation directory): +```cpp +cmake_minimum_required(VERSION 3.22.1) + +project("armhalideandroiddemo") +include_directories( + /path/to/halide/include +) + +add_library(blur_threshold_android STATIC IMPORTED) +set_target_properties(blur_threshold_android PROPERTIES IMPORTED_LOCATION + ${CMAKE_CURRENT_SOURCE_DIR}/blur_threshold_android.a +) + +add_library(${CMAKE_PROJECT_NAME} SHARED native-lib.cpp) + +target_link_libraries(${CMAKE_PROJECT_NAME} + blur_threshold_android + android + log) +``` + +Open build.gradle.kts and modify it as follows: + +```console +plugins { + alias(libs.plugins.android.application) + alias(libs.plugins.kotlin.android) +} + +android { + namespace = "com.arm.armhalideandroiddemo" + compileSdk = 35 + + defaultConfig { + applicationId = "com.arm.armhalideandroiddemo" + minSdk = 24 + targetSdk = 34 + versionCode = 1 + versionName = "1.0" + ndk { + abiFilters += "arm64-v8a" + } + testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" + externalNativeBuild { + cmake { + cppFlags += "-std=c++17" + } + } + } + + buildTypes { + release { + isMinifyEnabled = false + proguardFiles( + getDefaultProguardFile("proguard-android-optimize.txt"), + "proguard-rules.pro" + ) + } + } + compileOptions { + sourceCompatibility = JavaVersion.VERSION_11 + targetCompatibility = JavaVersion.VERSION_11 + } + kotlinOptions { + jvmTarget = "11" + } + externalNativeBuild { + cmake { + path = file("src/main/cpp/CMakeLists.txt") + version = "3.22.1" + } + } + buildFeatures { + viewBinding = true + } +} + +dependencies { + + implementation(libs.androidx.core.ktx) + implementation(libs.androidx.appcompat) + implementation(libs.material) + implementation(libs.androidx.constraintlayout) + testImplementation(libs.junit) + androidTestImplementation(libs.androidx.junit) + androidTestImplementation(libs.androidx.espresso.core) +} +``` + +Click the Sync Now button at the top. 
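Before building, you can confirm from a terminal that all four files referenced by `CMakeLists.txt` are in place. The path below assumes the project directory is `ArmHalideAndroidDemo` in your current working directory:

```console
ls ArmHalideAndroidDemo/app/src/main/cpp
```

You should see `CMakeLists.txt`, `blur_threshold_android.a`, `blur_threshold_android.h`, and `native-lib.cpp` listed.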
To verify that everything is configured correctly, click Build > Make Project in Android Studio. + +## UI +Now, you'll define the application's User Interface, consisting of two buttons and an ImageView. One button loads the image, the other processes it, and the ImageView displays both the original and processed images. +1. Open the res/layout/activity_main.xml file, and modify it as follows: +```XML + + + + +