diff --git a/.wordlist.txt b/.wordlist.txt index 306ba0bdeb..79c3f66d0f 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -4667,7 +4667,7 @@ Sommelier chromeos linuxcontainers XPS -NIC's +NIC’s offlines passthrough SLOs @@ -4722,4 +4722,94 @@ ATtestation CoCo procedureS NIC’s +httpbin +proxying +OpenBMC +PoC +PoCs +evb +ipmitool +openbmc +poc +IPMI +integrators +KCS +PLDM +MCTP +Redfish +hyperscalers +BMCs +OEM +NetFn +RDv +CSSv +penBmc +BMC's +socat +ZooKeeper +IRQs +IRQS +Friedt +namespaces +atlascli +benchmarkDB +cursorTest +replset +testCollection +Namespaces +mongotop +Mongotop +baselineDB +ef +netstat +tulnp +mongostat +arw +conn +getmore +qrw +vsize +conn +WiredTiger +GLE +getLastError +createIndex +getMore +getmore +RoT +lkvm +JMH +jmh +UseG +Xmx +Xms +JavaServer +servlets +RMSNorm +RoPE +FFN +ukernel +libstreamline +prefill +OpenCL +subgraphs +threadpool +worksize +Zhilong +Denoiser +RGGB +denoised +YGGV +Mohamad +Najem +kata +svl +svzero +anf +DynamIQ +Zena +learnt +lof +BalenaOS +balenaCloud diff --git a/assets/contributors.csv b/assets/contributors.csv index e65a28cb1c..ef6f06ea90 100644 --- a/assets/contributors.csv +++ b/assets/contributors.csv @@ -100,4 +100,5 @@ Ann Cheng,Arm,anncheng-arm,hello-ann,, Fidel Makatia Omusilibwa,,,,, Ker Liu,,,,, Rui Chang,,,,, - +Alejandro Martinez Vicente,Arm,,,, +Mohamad Najem,Arm,,,, diff --git a/content/learning-paths/automotive/openadkit2_safetyisolation/_index.md b/content/learning-paths/automotive/openadkit2_safetyisolation/_index.md index 75f3bf2cbe..16ad88740d 100644 --- a/content/learning-paths/automotive/openadkit2_safetyisolation/_index.md +++ b/content/learning-paths/automotive/openadkit2_safetyisolation/_index.md @@ -27,7 +27,7 @@ armips: tools_software_languages: - Python - Docker - - ROS2 + - ROS 2 - DDS operatingsystems: - Linux diff --git a/content/learning-paths/automotive/system76-auto/_index.md b/content/learning-paths/automotive/system76-auto/_index.md index f6a98adfd5..b317cf01c7 100644 --- a/content/learning-paths/automotive/system76-auto/_index.md +++ b/content/learning-paths/automotive/system76-auto/_index.md @@ -22,7 +22,6 @@ armips: operatingsystems: - Linux tools_software_languages: - - Automotive further_reading: - resource: diff --git a/content/learning-paths/automotive/zenacssdebug/_index.md b/content/learning-paths/automotive/zenacssdebug/_index.md new file mode 100644 index 0000000000..9539aaaf9d --- /dev/null +++ b/content/learning-paths/automotive/zenacssdebug/_index.md @@ -0,0 +1,54 @@ +--- +title: Debug Arm Zena CSS Reference Software Stack with Arm Development Studio + +draft: true +cascade: + draft: true + +minutes_to_complete: 60 + +who_is_this_for: This is an introductory topic for software developers who wish to use Arm Development Studio to explore and debug the Arm Zena CSS Reference Software Stack. 
learning_objectives:
  - Set up a debug configuration for the Arm Zena CSS FVP
  - Debug Runtime Security Engine (RSE) from boot time
  - Debug Safety Island (SI)
  - Debug Linux OS on Primary Compute cores

prerequisites:
  - Ubuntu 22.04 host machine
  - You will need [Arm Development Studio 2024.1 (or later)](/install-guides/armds) and an appropriate license
  - A basic understanding of the Arm Zena CSS software stack and Arm processors

author: Ronan Synnott

### Tags
skilllevels: Introductory
subjects: Performance and Architecture
armips:
  - Cortex-A
  - Cortex-R
operatingsystems:
  - Linux
tools_software_languages:
  - Arm Development Studio
  - Arm Zena CSS


further_reading:
  - resource:
      title: Arm Zena Compute Subsystem (CSS)
      link: https://developer.arm.com/Compute%20Subsystems/Arm%20Zena%20Compute%20Subsystem
      type: website
  - resource:
      title: Arm Development Studio
      link: https://developer.arm.com/Tools%20and%20Software/Arm%20Development%20Studio
      type: website


### FIXED, DO NOT MODIFY
# ================================================================================
weight: 1 # _index.md always has weight of 1 to order correctly
layout: "learningpathall" # All files under learning paths have this same wrapper
learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content.
---
diff --git a/content/learning-paths/cross-platform/floating-point-rounding-errors/_next-steps.md b/content/learning-paths/automotive/zenacssdebug/_next-steps.md
similarity index 100%
rename from content/learning-paths/cross-platform/floating-point-rounding-errors/_next-steps.md
rename to content/learning-paths/automotive/zenacssdebug/_next-steps.md
diff --git a/content/learning-paths/automotive/zenacssdebug/config.md b/content/learning-paths/automotive/zenacssdebug/config.md
new file mode 100644
index 0000000000..81e6367f40
--- /dev/null
+++ b/content/learning-paths/automotive/zenacssdebug/config.md
@@ -0,0 +1,63 @@
---
# User change
title: "Model Configuration"

weight: 4 # 1 is first, 2 is second, etc.

# Do not modify these elements
layout: "learningpathall"
---

# Debug Configuration

Arm Development Studio requires a `Debug Configuration` for the target that it will connect to.

As of Arm Development Studio version 2025.0, no such configuration is provided out of the box for the Zena CSS FVP. However, creating one is straightforward.

See the Arm Development Studio [Getting Started Guide](https://developer.arm.com/documentation/101469/latest/Migrating-from-DS-5-to-Arm-Development-Studio/Connect-to-new-or-custom-models) for full instructions; they are also summarized below.

## Launch FVP

As in the previous section, launch the FVP with the Iris server enabled:

```command
kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100"
```
or, if connecting to the FVP remotely:

```command
kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100 -A"
```
{{% notice Note %}}
A local connection is assumed for the remainder of this Learning Path.
{{% /notice %}}

## Configuration Database

Debug Configurations are stored in a configuration database. You must first create a local database in which to store the configuration.

Navigate to `File` > `New` > `Other`, and then select `Configuration Database` > `Configuration Database` from the drop-down list.
Click `Next`. Give the database a name, and click `Finish`.

## Debug Configuration

Navigate to the same wizard as above, and select `Model Configuration`.

Click `Next`, and you will be prompted to select the above `Configuration Database`. Click `Next` again, and you will be prompted to select a model interface.

Select `Iris` from the drop-down, and click `Next`.

You will then be prompted to locate the model to connect to.

Select `Browse for model running on local host`. The FVP will be detected and interrogated by the debugger.

{{% notice Note %}}
Use `Connect to model running on either local or remote host` if connecting remotely.
{{% /notice %}}

A `model.mdf` file will be created that identifies all CPUs within the FVP.

You can change the `Manufacturer Name` and `Platform Name` to something more meaningful (such as `Arm` and `Zena_CSS_FVP`), then `Save`, and `Import` into the configuration database.

The debugger is now aware of the FVP, and you are ready to debug.
diff --git a/content/learning-paths/automotive/zenacssdebug/configdb.png b/content/learning-paths/automotive/zenacssdebug/configdb.png
new file mode 100644
index 0000000000..819071a7ff
Binary files /dev/null and b/content/learning-paths/automotive/zenacssdebug/configdb.png differ
diff --git a/content/learning-paths/automotive/zenacssdebug/connect.md b/content/learning-paths/automotive/zenacssdebug/connect.md
new file mode 100644
index 0000000000..921f80b467
--- /dev/null
+++ b/content/learning-paths/automotive/zenacssdebug/connect.md
@@ -0,0 +1,75 @@
---
# User change
title: "Debug Connections"

weight: 5 # 1 is first, 2 is second, etc.

# Do not modify these elements
layout: "learningpathall"
---

## Debug Connections

You are now ready to create debug connections for each of the sub-systems within Zena CSS. In this section you will create the connections, which will then be enhanced in the following section. You may prefer to fully set up one connection before moving on to the others.

Arm Development Studio has full support for heterogeneous systems such as Zena CSS, and so you can connect to all processors simultaneously.

### Debug connection project

First, create a project in which to store these connections (`.launch` files).

Select `File` > `New...` > `Project` > `General` > `Project`, and give it a meaningful name (`Connections`).

### RSE (Cortex-M55)

The Runtime Security Engine (RSE) is based on the [Cortex-M55](https://developer.arm.com/Processors/Cortex-M55) core and is a security subsystem fulfilling the role of Root of Trust.

Select `File` > `New` > `Model Connection`.

{{% notice Note %}}
You can also use `File` > `New` > `Other` > `Arm Debugger` > `Model Connection`, or the `Create a debug connection...` shortcut in the `Debug Control` pane.
{{% /notice %}}

Specify a connection name (`RSE`), and associate it with the above `Connections` project. Click `Next`.

Locate the FVP based on the name you gave it previously (`Zena_CSS_FVP`). The text filter can help you locate it easily.

You will then be presented with the `Edit configuration` pane. In the `Connection` tab, scroll down to locate `Bare Metal Debug` > `Arm_Cortex-M55`.

Because you will later launch the FVP with the software stack loaded, select `Connect to an already running model`.

Assuming the same host will be running both the FVP and the debugger, specify the `Connection address` as the default `127.0.0.1:7100`.
{{% notice Note %}}
`127.0.0.1` is the same as `localhost`, that is, the same host machine that is running the FVP.

It is also possible to connect to a remote host by specifying an appropriate IP address and launching the FVP with the `-A` option.

`7100` is the default port number. You can change this if necessary.
{{% /notice %}}

Click `Apply` to save the connection information, and then `Close`. Observe that `RSE.launch` is created inside the `Connections` project.

### Safety Island (Cortex-R82AE)

The Safety Island is a subsystem based on the [Cortex-R82AE](https://developer.arm.com/Processors/Cortex-R82AE) core. The software running on the Safety Island is responsible for power, clock, and CMN control.

The procedure to create this connection is very similar to the above, except that you select `Bare Metal Debug` > `Arm_Cortex-R82AE` from the drop-down.

{{% notice %}}
For convenience, you can copy-and-paste `RSE.launch` as `SI.launch` and just modify the CPU.
{{% /notice %}}

### Primary Compute (Cortex-A720AE)

The Primary Compute consists of four processor clusters to run a rich OS such as Linux. Each processor cluster includes four [Cortex-A720AE](https://developer.arm.com/Processors/Cortex-A720AE) cores and a [DSU-120AE](https://developer.arm.com/Processors/DSU-120AE) DynamIQ Shared Unit.

The application processors will be debugged in an SMP configuration with Linux kernel awareness.

As shown above, create a `Primary_init.launch` connection and scroll to `Bare Metal Debug` > `ARM_Cortex-A720AE_0`. This will connect to just CPU0, leaving the other CPUs free to run.

To debug the Linux kernel, you can make use of the [OS awareness](https://developer.arm.com/documentation/101470/latest/Debugging-Embedded-Systems/About-OS-awareness) feature of the Arm Debugger.

Create a `Primary_Linux.launch` connection and scroll to `Linux Kernel Debug` > `ARM_Cortex-A720AEx16 SMP Cluster 1`. This will connect to all 16 `Cortex-A720AE` processors present in the FVP, though only cores 0-3 are used.
diff --git a/content/learning-paths/automotive/zenacssdebug/debugger_commands.png b/content/learning-paths/automotive/zenacssdebug/debugger_commands.png
new file mode 100644
index 0000000000..8bafab27da
Binary files /dev/null and b/content/learning-paths/automotive/zenacssdebug/debugger_commands.png differ
diff --git a/content/learning-paths/automotive/zenacssdebug/launch.md b/content/learning-paths/automotive/zenacssdebug/launch.md
new file mode 100644
index 0000000000..5aba66e2c8
--- /dev/null
+++ b/content/learning-paths/automotive/zenacssdebug/launch.md
@@ -0,0 +1,56 @@
---
# User change
title: "Launch FVP"

weight: 3 # 1 is first, 2 is second, etc.

# Do not modify these elements
layout: "learningpathall"
---

## Launch FVP

You can now launch the FVP within the virtual environment with the software stack loaded:

```command
kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose"
```
Refer to the [documentation](https://arm-auto-solutions.docs.arm.com/en/v2.0/rd-aspen/user_guide/reproduce.html#run-the-fvp) for more details.
While you can continue to use this method to launch the FVP whilst debugging, this command does not enable the Iris debug server inside the model, and so the model will not be debuggable.

Additional command options are necessary.

You will use the following options. See the output of `FVP_RD_Aspen --help` for the full list and explanations. Options are case-sensitive.
| Option                | Alias     | Notes                                          |
|---------------------- |--------- |---------------------------------------------- |
| `--iris-server`       | `-I`      | Start Iris Debug Server                        |
| `--iris-port`         |           | Specify a port number (default = `7100`)       |
| `--run`               | `-R`      | Run simulation when debug server started       |
| `--iris-allow-remote` | `-A`      | Allow remote connections (if different hosts)  |

### Launch FVP with additional options

To launch the FVP with additional options, modify the above command by adding `--` and then the options.

For example, to launch the model with the debug server and hold at the initial reset condition:

```command
kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100"
```

To launch the model and start running (so that it can start to boot up):

```command
kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100 --run"
```

To launch the model so that remote hosts can access it (not recommended unless needed), using the option aliases:

```command
kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- -I -A --iris-port 7100"
```

{{% notice Note %}}
It is recommended to specify the port number even when it is the default, as it must match the debug connection setting (described later).
{{% /notice %}}
diff --git a/content/learning-paths/automotive/zenacssdebug/primarycompute.md b/content/learning-paths/automotive/zenacssdebug/primarycompute.md
new file mode 100644
index 0000000000..ae48551864
--- /dev/null
+++ b/content/learning-paths/automotive/zenacssdebug/primarycompute.md
@@ -0,0 +1,71 @@
---
# User change
title: "Debug Primary Compute and Linux"

weight: 8 # 1 is first, 2 is second, etc.

# Do not modify these elements
layout: "learningpathall"
---

## Debug Primary Compute

The Primary Compute application processors (`Cortex-A720AE`) are the final processors to be enabled.

As before, you can connect whilst they are powered down and monitor the point at which they are enabled.

You can debug both the initialization code and the final Linux Operating System (OS) threads.

### Connect debugger to target

Use the following debugger commands in `Primary_init.launch` to load the symbols for the `BL2` initialization code, setting a breakpoint at `bl2_entrypoint`.

Note that an address "offset" is used to specify the exception level that the image is relevant to. If the processor changes exception level, the debug information would also need to be loaded to the corresponding EL address space.

For example, the processors start in `EL3` and move to `EL2N` when the Linux kernel is enabled.

``` text
stop
add-symbol-file /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-a/2.11.0+git/image/firmware/bl2.elf EL3:0x0
tbreak bl2_entrypoint
```
{{% notice Note %}}
Exact paths may differ for your setup.
{{% /notice %}}

Run the code to `bl2_entrypoint`, and you can debug as expected.

### Debug Linux kernel modules

To make use of the OS awareness feature, disconnect `Primary_init` and connect to `Primary_Linux` as created previously. Load the symbols from the `vmlinux` image.
+ +``` text +stop +add-symbol-file /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/linux-yocto/6.6.54+git/linux-fvp_rd_aspen-standard-build/vmlinux EL2N:0x0 +set substitute-path /usr/src/kernel/ /arm-auto-solutions/build/tmp_baremetal/work-shared/fvp-rd-aspen/kernel-source/ +``` +Run the FVP until the OS prompt appears. + +{{% notice %}} +If you are only interested in kernel debug, modify the launch command for the FVP to include `--run` to start execution immediately. + +``` command +kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100 --run" +``` +{{% /notice %}} + +You can now enable the `Threads` view in the `Debug Control` pane. + +Right-click on the connection, and select `Display Threads`. You can also do this by entering `thread` in the `Command` pane. + +The view will then change from listing the 16 application processors to the OS threads. + +{{% notice Note %}} +A warning of the form: +``` text +WARNING(ROS60): Could not enable OS support as the OS does not appear to be initialized. This might be caused by a mismatch between the loaded symbols and the code on the target or because the OS is not up and running. Enabling OS support will be re-attempted when the target next stops. +``` +may be emitted if the OS is not booted when you connect. It can safely be ignored. +{{% /notice %}} + +You have successfully learnt how to use Arm Development Studio to explore and debug the Arm Zena CSS Reference Software Stack. diff --git a/content/learning-paths/automotive/zenacssdebug/rse.md b/content/learning-paths/automotive/zenacssdebug/rse.md new file mode 100644 index 0000000000..007a7a17f9 --- /dev/null +++ b/content/learning-paths/automotive/zenacssdebug/rse.md @@ -0,0 +1,79 @@ +--- +# User change +title: "Debug RSE from reset" + +weight: 6 # 1 is first, 2 is second, etc. + +# Do not modify these elements +layout: "learningpathall" +--- + +## Debug RSE from reset + +Let us start by debugging the initial code that executes on the Cortex-M55 within the RSE block. + +### Launch FVP + +Start a new `tmux` session for the FVP (if necessary): +```command +tmux new-session -s arm-auto-solutions +``` +and navigate to your code repository. + +To debug from reset, launch the FVP with the Iris server but do not run. This will hold the FVP in the initial reset condition. + +```command +kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100" +``` +The FVP will start and generate various informational messages. Once initialized you should see something similar to: + +```output +... +Info: RD_Aspen: RD_Aspen.css.smb.rse_flashloader: FlashLoader: Saved 64MB to file '~/arm-auto-solutions/build/tmp_baremetal/deploy/images/fvp-rd-aspen/rse-flash-image.img' +Info: RD_Aspen: RD_Aspen.ros.flash_loader: FlashLoader: Saved 128MB to file '~/arm-auto-solutions/build/tmp_baremetal/deploy/images/fvp-rd-aspen/ap-flash-image.img' +``` + +Note that execution has not started. + +### Connect the debugger + +Using the `RSE` connection created in the previous section, connect the debugger to the FVP. Observe that the processor is stopped before the first instruction has been executed. + +In fact, the FVP is configured to have the vector table (`VTOR_S`) start at `0x11000000`, and if you inspect memory at that address the vector table will be populated. However no debug information is visible. Debug information must be loaded. 
In the `Debug Pane`, select `Load...` from the pane menu, and select `Add Symbols file`.

Browse to the `bl1_1.axf` file, which is likely at:

``` bash
/arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/build/bin/bl1_1.axf
```
Debug symbols will be loaded, but likely no source will be displayed. This is because the build was performed within the virtual environment, but the debugger is running outside of it.

You will be prompted to enter a path substitution to locate the sources. You can refer to the lowest common path so that all subsequent source files will also be located successfully.

``` bash
/usr/src/debug/trusted-firmware-m/2.1.0/
/arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/git/tfm/
```
Finally, perform a single instruction step (`stepi`) to allow the processor to fetch the address of the `Reset_Handler` and stop there.

You can now step through the code, set breakpoints, and inspect the target as the code proceeds.

### Automate setup

For convenience, it is possible to automate these actions every time you connect by entering them as `Debugger Commands` in the `.launch` configuration.

Open (double-click) the `.launch` file, and navigate to the `Debugger` pane.

Enable `Execute debugger commands`, and enter the following (note that paths may differ for your setup). You can copy the exact commands from the `Command` or `History` pane whilst performing the above GUI configuration.

It is recommended to have an explicit `stop` command, as symbols cannot be loaded whilst the target is running.

``` text
stop
add-symbol-file /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/build/bin/bl1_1.axf
set substitute-path /usr/src/debug/trusted-firmware-m/2.1.0/ /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/git/tfm/
stepi
```
![Debugger pane](debugger_commands.png)
diff --git a/content/learning-paths/automotive/zenacssdebug/safetyisland.md b/content/learning-paths/automotive/zenacssdebug/safetyisland.md
new file mode 100644
index 0000000000..951e973531
--- /dev/null
+++ b/content/learning-paths/automotive/zenacssdebug/safetyisland.md
@@ -0,0 +1,70 @@
---
# User change
title: "Debug Safety Island code"

weight: 7 # 1 is first, 2 is second, etc.

# Do not modify these elements
layout: "learningpathall"
---
## Debug Safety Island code from beginning

The Safety Island (Cortex-R82AE) is released from reset by the RSE code, and so the RSE code must proceed to that point before the Safety Island core can execute.

### Launch FVP

If necessary, restart the FVP in the reset state as before, and reconnect `RSE`.

```command
kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100"
```

Set up the `SI` connection in a similar way to the `RSE` connection. Use the following commands in the `Debugger` pane. This will load debug symbols and perform the necessary path substitution. You can then set a breakpoint on the entry point of the `SI` code, `arch_exception_reset`.

``` text
stop
add-symbol-file /arm-auto-solutions/build/tmp_baremetal/deploy/images/fvp-rd-aspen/si0_ramfw.elf
set substitute-path /usr/src/debug/scp-firmware/2.14.0/ /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/scp-firmware/2.14.0/git/
b arch_exception_reset
```

{{% notice Note %}}
Exact paths may differ for your setup.
{{% /notice %}}

### Start execution

Select the `RSE` connection in the `Debug Control` pane, and start execution (this will be unavailable in the `SI` connection, as that is currently powered down).

The `RSE` code will run until the point at which the `SI` is enabled. This is reflected in the output log.

``` output
[INF] BL2: SI CL0 post load start
```

#### Full output log

The full output log is shown here for your reference:

``` output
Trying ::1...
Trying 127.0.0.1...
Connected to localhost.
Escape character is '^]'.
[INF] Starting TF-M BL1_1
[INF] Jumping to BL1_2
[INF] Starting TF-M BL1_2
[INF] Attempting to boot image 0
[INF] BL2 image decrypted successfully
[INF] BL2 image validated successfully
[INF] Jumping to BL2
[INF] Starting bootloader
[INF] PSA Crypto init done, sig_type: EC-P256
[INF] BL2: SI CL0 pre load start
[INF] BL2: SI CL0 pre load complete
[INF] Primary slot: version=0.0.7+0
[INF] Secondary slot: version=0.0.7+0
[INF] Image 3 RAM loading to 0x70083c00 is succeeded.
[INF] Image 3 loaded from the primary slot
[INF] BL2: SI CL0 post load start
```
diff --git a/content/learning-paths/automotive/zenacssdebug/zena.md b/content/learning-paths/automotive/zenacssdebug/zena.md
new file mode 100644
index 0000000000..fffc489bbb
--- /dev/null
+++ b/content/learning-paths/automotive/zenacssdebug/zena.md
@@ -0,0 +1,73 @@
---
# User change
title: "Getting started"

weight: 2 # 1 is first, 2 is second, etc.

# Do not modify these elements
layout: "learningpathall"
---

# Arm Zena Compute Subsystem

The Arm Zena Compute Subsystem (CSS) consists of a high-performance Arm Cortex-A720AE Application Processor (Primary Compute) system augmented with an Arm Cortex-R82AE based Safety Island (SI) and real-time domain to host additional system safety monitoring and real-time services.

The system additionally includes a Runtime Security Engine (RSE) used for the secure boot of the system elements and the runtime secure services.

The Arm Zena CSS software stack provides an open-source, integrated solution running on a Fixed Virtual Platform (FVP).

The reference software stack and the FVP are freely available.

For more information, see [Arm Zena Compute Subsystem (CSS)](https://developer.arm.com/Compute%20Subsystems/Arm%20Zena%20Compute%20Subsystem) and associated links.

## Build software stack

Follow the steps to download and build the software stack in the [User Guide](https://arm-auto-solutions.docs.arm.com/en/v2.0/rd-aspen/user_guide/reproduce.html).

The default `Arm Automotive Solutions Demo` build is used.

{{% notice Note %}}
The focus of this Learning Path is to demonstrate the **debug** of the software stack.
{{% /notice %}}

## Verify correct build and execution

Once the software stack has been built, you can verify that it runs successfully with the command:

``` command
kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose"
```

The system will run through the boot process until a Linux prompt is available (in `terminal_ns_uart0`).

Use `Ctrl+C` on the command terminal to terminate.

## Install FVP (optional)

The FVP is downloaded and installed as part of the build process above.

The `Arm-Zena-CSS-FVP` can also be independently downloaded from the Arm Developer [website](https://developer.arm.com/Tools%20and%20Software/Fixed%20Virtual%20Platforms/Automotive%20FVPs).
See also the Arm Ecosystem FVPs and Architecture Envelope Models [Install Guide](/install-guides/fm_fvp/eco_fvp/).

{{% notice Note %}}
For legacy reasons, the FVP is named `FVP_RD_Aspen`.
{{% /notice %}}

# Arm Development Studio

Arm Development Studio is a software development solution with support for multicore debug of Arm CPUs. It provides the earliest support for the latest processors.

The CPUs implemented within Arm Zena CSS are supported by Arm Development Studio 2024.0 and later, though 2024.1 or later is recommended for appropriate Linux OS support. At the time of writing, the latest version available is 2025.0, and that is the version used for this Learning Path.

For more information, see [Arm Development Studio](https://developer.arm.com/Tools%20and%20Software/Arm%20Development%20Studio).

Arm Development Studio is a commercial, license-managed product. For installation and setup instructions, see this [Install Guide](/install-guides/armds/).

Launch the IDE. It is recommended to create a new workspace folder.

If prompted by the launcher (this is disabled by default), create a new folder there; otherwise, select `File` > `Switch Workspace` > `Other...`.

{{% notice Note %}}
To enable this prompt by default, navigate to `Window` > `Preferences` > `General` > `Startup and Shutdown` > `Workspaces`, and enable `Prompt for workspace on startup`.
{{% /notice %}}
diff --git a/content/learning-paths/cross-platform/avh_cicd/_index.md b/content/learning-paths/cross-platform/avh_cicd/_index.md
index f4c50b95a5..6d4354819f 100644
--- a/content/learning-paths/cross-platform/avh_cicd/_index.md
+++ b/content/learning-paths/cross-platform/avh_cicd/_index.md
@@ -22,7 +22,6 @@ armips:
operatingsystems:
  - Baremetal
tools_software_languages:
-  - Coding
  - Arm Virtual Hardware
  - GitHub
diff --git a/content/learning-paths/cross-platform/avh_cicd2/_index.md b/content/learning-paths/cross-platform/avh_cicd2/_index.md
index 50f3442f24..924ed550f7 100644
--- a/content/learning-paths/cross-platform/avh_cicd2/_index.md
+++ b/content/learning-paths/cross-platform/avh_cicd2/_index.md
@@ -23,7 +23,6 @@ armips:
operatingsystems:
  - Baremetal
tools_software_languages:
-  - Coding
  - Arm Virtual Hardware
  - GitHub
diff --git a/content/learning-paths/cross-platform/cca_rme/_index.md b/content/learning-paths/cross-platform/cca_rme/_index.md
index 7276ae2e9e..75ddca964f 100644
--- a/content/learning-paths/cross-platform/cca_rme/_index.md
+++ b/content/learning-paths/cross-platform/cca_rme/_index.md
@@ -28,7 +28,6 @@ operatingsystems:
  - Android

tools_software_languages:
-  - Coding
  - Trusted Firmware
  - Arm Development Studio
  - RME
diff --git a/content/learning-paths/cross-platform/dynamic-memory-allocator/_index.md b/content/learning-paths/cross-platform/dynamic-memory-allocator/_index.md
index 4ef03c2c09..54af0efaf1 100644
--- a/content/learning-paths/cross-platform/dynamic-memory-allocator/_index.md
+++ b/content/learning-paths/cross-platform/dynamic-memory-allocator/_index.md
@@ -41,8 +41,7 @@ armips:
operatingsystems:
  - Linux
tools_software_languages:
-  - C 
-  - Coding
+  - C
  - Runbook

### Cross-platform metadata only
diff --git a/content/learning-paths/cross-platform/eigen-linear-algebra-on-arm/_index.md b/content/learning-paths/cross-platform/eigen-linear-algebra-on-arm/_index.md
index da6864147c..0f2ebba76d 100644
--- a/content/learning-paths/cross-platform/eigen-linear-algebra-on-arm/_index.md
+++ b/content/learning-paths/cross-platform/eigen-linear-algebra-on-arm/_index.md
@@ -22,7 +22,6 @@
armips: tools_software_languages: - GCC - Clang - - Coding - Runbook operatingsystems: diff --git a/content/learning-paths/cross-platform/floating-point-rounding-errors/_index.md b/content/learning-paths/cross-platform/floating-point-behavior/_index.md similarity index 69% rename from content/learning-paths/cross-platform/floating-point-rounding-errors/_index.md rename to content/learning-paths/cross-platform/floating-point-behavior/_index.md index 923b028e69..0d9f1dad10 100644 --- a/content/learning-paths/cross-platform/floating-point-rounding-errors/_index.md +++ b/content/learning-paths/cross-platform/floating-point-behavior/_index.md @@ -1,19 +1,15 @@ --- -title: Explore floating-point differences between x86 and Arm - -draft: true -cascade: - draft: true +title: Understand floating-point behavior across x86 and Arm architectures minutes_to_complete: 30 -who_is_this_for: This is an introductory topic for developers who are porting applications from x86 to Arm and want to understand how floating-point behavior differs between these architectures - particularly in the context of numerical consistency, performance, and debugging subtle bugs. +who_is_this_for: This is an introductory topic for developers who are porting applications from x86 to Arm and want to understand floating-point behavior across these architectures. Both architectures provide reliable and consistent floating-point computation following the IEEE 754 standard. learning_objectives: - - Identify key differences in floating-point behavior between the x86 and Arm architectures. - - Recognize the impact of compiler optimizations and instruction sets on floating-point results. - - Apply compiler flags and best practices to ensure consistent floating-point behavior across - platforms. + - Understand that Arm and x86 produce identical results for all well-defined floating-point operations. + - Recognize that differences only occur in special undefined cases permitted by IEEE 754. + - Learn best practices for writing portable floating-point code across architectures. + - Apply appropriate precision levels for portable results. prerequisites: - Access to an x86 and an Arm Linux machine. @@ -47,8 +43,6 @@ further_reading: link: https://en.cppreference.com/w/cpp/numeric/fenv type: documentation - - ### FIXED, DO NOT MODIFY # ================================================================================ weight: 1 # _index.md always has weight of 1 to order correctly diff --git a/content/learning-paths/cross-platform/floating-point-behavior/_next-steps.md b/content/learning-paths/cross-platform/floating-point-behavior/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/cross-platform/floating-point-behavior/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/content/learning-paths/cross-platform/floating-point-rounding-errors/differences.png b/content/learning-paths/cross-platform/floating-point-behavior/differences.png similarity index 100% rename from content/learning-paths/cross-platform/floating-point-rounding-errors/differences.png rename to content/learning-paths/cross-platform/floating-point-behavior/differences.png diff --git a/content/learning-paths/cross-platform/floating-point-rounding-errors/floating-point-numbers.png b/content/learning-paths/cross-platform/floating-point-behavior/floating-point-numbers.png similarity index 100% rename from content/learning-paths/cross-platform/floating-point-rounding-errors/floating-point-numbers.png rename to content/learning-paths/cross-platform/floating-point-behavior/floating-point-numbers.png diff --git a/content/learning-paths/cross-platform/floating-point-rounding-errors/how-to-1.md b/content/learning-paths/cross-platform/floating-point-behavior/how-to-1.md similarity index 70% rename from content/learning-paths/cross-platform/floating-point-rounding-errors/how-to-1.md rename to content/learning-paths/cross-platform/floating-point-behavior/how-to-1.md index c2855ae849..feb1a513ba 100644 --- a/content/learning-paths/cross-platform/floating-point-rounding-errors/how-to-1.md +++ b/content/learning-paths/cross-platform/floating-point-behavior/how-to-1.md @@ -1,11 +1,19 @@ --- -title: "Floating-Point Representation" +title: "Floating-point representation" weight: 2 ### FIXED, DO NOT MODIFY layout: learningpathall --- +## Introduction + +This Learning Path explores floating-point behavior across x86 and Arm architectures. Both architectures fully implement the IEEE 754 standard and produce identical results for all well-defined floating-point operations. + +Any differences you encounter are limited to special undefined cases where the IEEE 754 standard explicitly permits different implementations. These cases represent edge conditions that can be avoided, not fundamental differences in floating-point results. + +Arm processors provide accurate floating-point computation that is comparable to x86 for all standard mathematical operations. By understanding the nuances of floating-point arithmetic and following best practices, you can write portable and robust code that performs consistently across platforms. + ## Review of floating-point numbers {{% notice Learning tip%}} @@ -47,8 +55,7 @@ Key takeaways: - ULP behavior impacts numerical stability and precision. {{% notice Learning tip %}} -Keep in mind that rounding and representation issues aren't bugs — they’re a consequence of how floating-point math works at the hardware level. Understanding these fundamentals is essential when porting numerical code across architectures like x86 and Arm. +Keep in mind that rounding and representation issues aren't bugs, they are a consequence of how floating-point math works at the hardware level. Understanding these fundamentals is useful when porting numerical code across architectures like x86 and Arm. {{% /notice %}} - -In the next section, you'll explore how x86 and Arm differ in how they implement and optimize floating-point operations — and why this matters for writing portable, accurate software. +In the next section, you'll explore why you may come across differences in undefined floating point operations and how you can use this information to write portable floating-point code. 
diff --git a/content/learning-paths/cross-platform/floating-point-behavior/how-to-2.md b/content/learning-paths/cross-platform/floating-point-behavior/how-to-2.md
new file mode 100644
index 0000000000..9b8e425bed
--- /dev/null
+++ b/content/learning-paths/cross-platform/floating-point-behavior/how-to-2.md
@@ -0,0 +1,132 @@
---
title: Overflow in floating-point to integer conversion
weight: 3

### FIXED, DO NOT MODIFY
layout: learningpathall
---

## Are there differences in behavior between x86 and Arm floating point?

Both the x86 and Arm architectures fully comply with the IEEE 754 standard for floating-point representation. For all well-defined operations, both architectures produce identical results. Differences only occur in cases where the standards explicitly leave behavior undefined, such as converting out-of-range floating-point values to integers. These are special undefined cases where implementations are permitted to behave differently; this is not a flaw or limitation of either architecture.

Understanding these undefined corner cases will help you correct any non-portable code.

### Undefined behavior in floating-point to integer conversion

The following example demonstrates the undefined behavior that occurs when converting out-of-range floating-point values to integers. Here, out-of-range means the floating-point value lies outside the range that the target integer type can represent, for example a negative value converted to an unsigned type, or a value larger than the type's maximum.

The C++ specification explicitly leaves this conversion undefined, meaning different architectures are permitted to handle these cases differently.

The differences shown below only occur in undefined behavior cases. Normal floating-point operations produce identical results on both architectures.

An example of undefined behavior in floating-point code is provided below. You can run the example application on both an x86 and an Arm Linux system. If you are using AWS, use EC2 instance types `t3.micro` and `t4g.small` with Ubuntu 24.04.

To learn about floating-point conversions, use an editor to copy and paste the C++ code below into a new file named `conversions.cpp`.
```cpp
#include <iostream>
#include <cstdint>
#include <cmath>
#include <iomanip>

void convertFloatToInt(float value) {
    // Convert to unsigned 32-bit integer
    uint32_t u32 = static_cast<uint32_t>(value);

    // Convert to signed 32-bit integer
    int32_t s32 = static_cast<int32_t>(value);

    // Convert to unsigned 16-bit integer (truncation happens)
    uint16_t u16 = static_cast<uint16_t>(u32);
    uint8_t u8 = static_cast<uint8_t>(value);

    // Convert to signed 16-bit integer (truncation happens)
    int16_t s16 = static_cast<int16_t>(s32);

    std::cout << "Floating-Point Value: " << value << "\n";
    std::cout << "  → uint32_t: " << u32 << " (0x" << std::hex << u32 << std::dec << ")\n";
    std::cout << "  → int32_t: " << s32 << " (0x" << std::hex << s32 << std::dec << ")\n";
    std::cout << "  → uint16_t (truncated): " << u16 << " (0x" << std::hex << u16 << std::dec << ")\n";
    std::cout << "  → int16_t (truncated): " << s16 << " (0x" << std::hex << s16 << std::dec << ")\n";
    std::cout << "  → uint8_t (truncated): " << static_cast<int>(u8) << std::endl;

    std::cout << "----------------------------------\n";
}

int main() {
    std::cout << "Demonstrating Floating-Point to Integer Conversion\n\n";

    // Test cases
    convertFloatToInt(42.7f);         // Normal case
    convertFloatToInt(-15.3f);        // Negative value -> wraps on unsigned
    convertFloatToInt(4294967296.0f); // Overflow: 2^32 (UINT32_MAX + 1)
    convertFloatToInt(3.4e+38f);      // Large float exceeding UINT32_MAX
    convertFloatToInt(-3.4e+38f);     // Large negative float
    convertFloatToInt(NAN);           // NaN behavior on different platforms
    return 0;
}
```

If you need to install the `g++` and `clang` compilers, run the commands below:

```bash
sudo apt update
sudo apt install g++ clang -y
```

Compile `conversions.cpp` on an Arm and an x86 Linux machine.

The compile command is the same on both systems.

```bash
g++ conversions.cpp -o converting-float
```

Run the program on both systems:

```bash
./converting-float
```

For easy comparison, the image below shows the x86 output (left) and Arm output (right). The highlighted lines show the difference in output:

![differences](./differences.png)

As you can see, there are several cases where different behavior is observed in these undefined scenarios, for example when converting a negative value to an unsigned type or when dealing with out-of-range values.

## Avoid out-of-range conversions

The above differences demonstrate non-portable code. Undefined behavior, such as converting out-of-range floating-point values to integers, can lead to inconsistent results across platforms. To ensure portability and predictable behavior, it is essential to check for out-of-range values before performing such conversions.

You can check for out-of-range values using the code below. This approach ensures that the conversion is only performed when the value is within the valid range for the target data type. If the value is out of range, a default value is used to handle the situation gracefully. This prevents unexpected results and makes the code portable.

```cpp
// Note: float cannot represent UINT32_MAX exactly; the cast below rounds
// up to 4294967296.0f (2^32), so the range check must use a strict '<'.
constexpr float UINT32_MAX_F = static_cast<float>(UINT32_MAX);

void convertFloatToInt(float value) {
    // Convert to unsigned 32-bit integer with range checking
    uint32_t u32;
    if (!std::isnan(value) && value >= 0.0f && value < UINT32_MAX_F) {
        u32 = static_cast<uint32_t>(value);
        std::cout << "The casted number is: " << u32 << std::endl;
    } else {
        u32 = 0; // Default value for out-of-range
        std::cout << "The float is out of bounds for uint32_t, using 0."
<< std::endl;
    }

    // ...existing code...
}
```

This checking provides a portable solution that identifies out-of-range values before casting and sets the out-of-range values to 0. By incorporating such checks, you can avoid undefined behavior and ensure that your code behaves consistently across different platforms.

### Key takeaways

- Arm and x86 produce identical results for all well-defined floating-point operations; both architectures comply with IEEE 754.
- Differences only occur in special undefined cases where the standards explicitly permit different behaviors.
- An example of an undefined scenario is converting out-of-range floating-point values to integers.
- You should avoid relying on undefined behavior to ensure portability.

By understanding these nuances, you can confidently write code that behaves consistently across platforms.

diff --git a/content/learning-paths/cross-platform/floating-point-behavior/how-to-3.md b/content/learning-paths/cross-platform/floating-point-behavior/how-to-3.md
new file mode 100644
index 0000000000..437fbfdd16
--- /dev/null
+++ b/content/learning-paths/cross-platform/floating-point-behavior/how-to-3.md
@@ -0,0 +1,115 @@
---
title: Single and double precision considerations
weight: 4

### FIXED, DO NOT MODIFY
layout: learningpathall
---

## Understanding numerical precision differences in single vs double precision

This section explores how different levels of floating-point precision can affect numerical results. The differences shown here are not architecture-specific issues, but demonstrate the importance of choosing appropriate precision levels for numerical computations.

### Single precision limitations

Consider two mathematically equivalent functions, `f1()` and `f2()`. While they should theoretically produce the same result, small differences can arise due to the limited precision of floating-point arithmetic.

The differences shown in this example are due to using single-precision (float) arithmetic, not due to architectural differences between Arm and x86. Both architectures handle single-precision arithmetic according to IEEE 754.

Functions `f1()` and `f2()` are mathematically equivalent, so you would expect them to return the same value given the same input.
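To see why `f2()` is the numerically stable form, multiply `sqrt(1 + x) - 1` by its conjugate. The rewrite below is exact algebra; only the rounding behavior of the two forms differs:

``` text
sqrt(1 + x) - 1 = (sqrt(1 + x) - 1) * (sqrt(1 + x) + 1) / (sqrt(1 + x) + 1)
                = x / (sqrt(1 + x) + 1)
```

For a tiny `x`, `sqrt(1 + x)` is extremely close to `1`, so the subtraction in `f1()` cancels almost all of the significant digits (catastrophic cancellation). In contrast, `f2()` divides `x` by a value close to `2`, which preserves precision.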
Use an editor to copy and paste the C++ code below into a file named `single-precision.cpp`.

```cpp
#include <cstdio>
#include <cmath>

// Function 1: Computes sqrt(1 + x) - 1 using the naive approach
float f1(float x) {
    return sqrtf(1 + x) - 1;
}

// Function 2: Computes the same value using an algebraically equivalent transformation
// This version is numerically more stable
float f2(float x) {
    return x / (sqrtf(1 + x) + 1);
}

int main() {
    float x = 1e-8; // A small value that causes floating-point precision issues
    float result1 = f1(x);
    float result2 = f2(x);

    // Theoretically, result1 and result2 should be the same
    float difference = result1 - result2;
    // Multiply by a large number to amplify the error
    float final_result = 100000000.0f * difference + 0.0001f;

    // Print the results
    printf("f1(%e) = %.10f\n", x, result1);
    printf("f2(%e) = %.10f\n", x, result2);
    printf("Difference (f1 - f2) = %.10e\n", difference);
    printf("Final result after magnification: %.10f\n", final_result);

    return 0;
}
```

Compile and run the code on both x86 and Arm with the following commands:

```bash
g++ -g single-precision.cpp -o single-precision
./single-precision
```

Output running on x86:

```output
f1(1.000000e-08) = 0.0000000000
f2(1.000000e-08) = 0.0000000050
Difference (f1 - f2) = -4.9999999696e-09
Final result after magnification: -0.4999000132
```

Output running on Arm:

```output
f1(1.000000e-08) = 0.0000000000
f2(1.000000e-08) = 0.0000000050
Difference (f1 - f2) = -4.9999999696e-09
Final result after magnification: -0.4998999834
```

Depending on your compiler and library versions, you may get the same output on both systems. You can also use the `clang` compiler and see if the output matches:

```bash
clang -g single-precision.cpp -o single-precision -lm
./single-precision
```

In some cases the GNU compiler output differs from the Clang output.

Here's what's happening:

1. Different square root implementations: x86 and Arm can use different hardware instructions and library implementations to evaluate `sqrtf(1 + 1e-8)`.

2. Tiny implementation differences get amplified. The difference between the two computations is minuscule, but multiplying it by 100,000,000 makes it visible in the final result.

3. Both `f1()` and `f2()` use `sqrtf()`. Even though `f2()` is more numerically stable, both functions call `sqrtf()` with the same input, so they both inherit the same architecture-specific square root result.

4. Compiler and library versions may produce different output due to different implementations of library functions such as `sqrtf()`.

The net result is that the x86 and Arm toolchains can evaluate these single-precision expressions with tiny differences in the least significant bits. This is normal and expected behavior; such variations come from library and compiler implementation choices and stay within tight error bounds.

The very small difference you see is within acceptable floating-point precision limits.

### Key takeaways

- The small differences shown are due to library implementations in single-precision mode, not fundamental architectural differences.
- Single-precision arithmetic has inherent limitations that can cause small numerical differences.
- Using numerically stable algorithms, like `f2()`, can minimize error propagation.
- Understanding [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) is important for writing portable code, as is comparing results with a tolerance, shown in the sketch below.
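One practical consequence of these takeaways is that, when validating results across platforms, you should compare floating-point values with a tolerance rather than exact equality. The sketch below is a minimal illustration; the tolerance value is an arbitrary choice for demonstration, not a universal constant.

```cpp
#include <cmath>
#include <cstdio>

// Returns true when a and b agree to within a relative tolerance.
bool nearlyEqual(double a, double b, double relTol = 1e-12) {
    return std::fabs(a - b) <= relTol * std::fmax(std::fabs(a), std::fabs(b));
}

int main() {
    double x = 0.1 + 0.2;
    printf("x == 0.3           : %d\n", x == 0.3);            // prints 0: exact equality fails
    printf("nearlyEqual(x, 0.3): %d\n", nearlyEqual(x, 0.3)); // prints 1: tolerant comparison passes
    return 0;
}
```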
By adopting best practices and appropriate precision levels, developers can ensure consistent results across platforms.

Continue to the next section to see how precision impacts the results.
diff --git a/content/learning-paths/cross-platform/floating-point-behavior/how-to-4.md b/content/learning-paths/cross-platform/floating-point-behavior/how-to-4.md
new file mode 100644
index 0000000000..0bbd869072
--- /dev/null
+++ b/content/learning-paths/cross-platform/floating-point-behavior/how-to-4.md
@@ -0,0 +1,74 @@
---
title: Minimize floating-point variability across platforms
weight: 5

### FIXED, DO NOT MODIFY
layout: learningpathall
---

## How can I ensure consistent floating-point results across x86 and Arm?

The most effective way to ensure consistent floating-point results across platforms is to use double-precision arithmetic. Both Arm and x86 produce identical results when using double precision for the same operations.

### Double precision floating-point eliminates differences

The example below demonstrates how using double precision eliminates the small differences observed in the previous single-precision example. Switching from `float` to `double`, and from `sqrtf()` to its double-precision counterpart `sqrt()`, ensures identical results on both architectures.

Use an editor to copy and paste the C++ file below into a file named `double-precision.cpp`.

```cpp
#include <cstdio>
#include <cmath>

// Function 1: Computes sqrt(1 + x) - 1 using the naive approach
double f1(double x) {
    return sqrt(1 + x) - 1;
}

// Function 2: Computes the same value using an algebraically equivalent transformation
// This version is numerically more stable
double f2(double x) {
    return x / (sqrt(1 + x) + 1);
}

int main() {
    double x = 1e-8;
    double result1 = f1(x);
    double result2 = f2(x);

    // Theoretically, result1 and result2 should be the same
    double difference = result1 - result2;
    // Multiply by a large number to amplify the error
    double final_result = 100000000.0 * difference + 0.0001;

    // Print the results
    printf("f1(%e) = %.10f\n", x, result1);
    printf("f2(%e) = %.10f\n", x, result2);
    printf("Difference (f1 - f2) = %.10e\n", difference);
    printf("Final result after magnification: %.10f\n", final_result);

    return 0;
}
```

Compile on both computers:

```bash
g++ -o double-precision double-precision.cpp
./double-precision
```

Running the new binary on both systems produces identical output: the two functions now agree to within about 2e-17, and the result no longer differs between x86 and Arm.

Here is the output on both systems:

```output
f1(1.000000e-08) = 0.0000000050
f2(1.000000e-08) = 0.0000000050
Difference (f1 - f2) = -1.7887354748e-17
Final result after magnification: 0.0000999982
```

By choosing appropriate precision levels, you can write code that remains consistent and reliable across architectures. Precision, however, involves a trade-off: single precision reduces memory use and often improves performance, while double precision is essential for applications demanding higher accuracy and greater numerical stability, particularly to control rounding errors.

For the vast majority of floating-point application code, you will not notice any differences between x86 and Arm architectures. However, in rare cases where differences do occur, they are usually due to undefined behaviors or non-portable code. These differences should not be a cause for concern, but rather an opportunity to improve the code for better portability and consistency across platforms.
By addressing these issues, you can ensure that your floating-point code runs reliably and produces identical results on both x86 and Arm systems. diff --git a/content/learning-paths/cross-platform/floating-point-rounding-errors/ulp.png b/content/learning-paths/cross-platform/floating-point-behavior/ulp.png similarity index 100% rename from content/learning-paths/cross-platform/floating-point-rounding-errors/ulp.png rename to content/learning-paths/cross-platform/floating-point-behavior/ulp.png diff --git a/content/learning-paths/cross-platform/floating-point-rounding-errors/how-to-2.md b/content/learning-paths/cross-platform/floating-point-rounding-errors/how-to-2.md deleted file mode 100644 index e165e911f6..0000000000 --- a/content/learning-paths/cross-platform/floating-point-rounding-errors/how-to-2.md +++ /dev/null @@ -1,124 +0,0 @@ ---- -title: Differences between x86 and Arm -weight: 3 - -### FIXED, DO NOT MODIFY -layout: learningpathall ---- - -## What are the differences in behavior between x86 and Arm floating point? - -Although both x86 and Arm generally follow the IEEE 754 standard for floating-point representation, their behavior in edge cases — like overflow and truncation — can differ due to implementation details and instruction sets. - -You can see this by comparing an example application on both an x86 and an Arm Linux system. - -Run this example on any Linux system with x86 and Arm architecture; on AWS, use EC2 instance types `t3.micro` and `t4g.small` with Ubuntu 24.04. - -To learn about floating-point differences, use an editor to copy and paste the C++ code below into a new file named `converting-float.cpp`: - -```cpp -#include -#include -#include -#include - -void convertFloatToInt(float value) { - // Convert to unsigned 32-bit integer - uint32_t u32 = static_cast(value); - - // Convert to signed 32-bit integer - int32_t s32 = static_cast(value); - - // Convert to unsigned 16-bit integer (truncation happens) - uint16_t u16 = static_cast(u32); - uint8_t u8 = static_cast(value); - - // Convert to signed 16-bit integer (truncation happens) - int16_t s16 = static_cast(s32); - - std::cout << "Floating-Point Value: " << value << "\n"; - std::cout << " → uint32_t: " << u32 << " (0x" << std::hex << u32 << std::dec << ")\n"; - std::cout << " → int32_t: " << s32 << " (0x" << std::hex << s32 << std::dec << ")\n"; - std::cout << " → uint16_t (truncated): " << u16 << " (0x" << std::hex << u16 << std::dec << ")\n"; - std::cout << " → int16_t (truncated): " << s16 << " (0x" << std::hex << s16 << std::dec << ")\n"; - std::cout << " → uint8_t (truncated): " << static_cast(u8) << std::endl; - - std::cout << "----------------------------------\n"; -} - -int main() { - std::cout << "Demonstrating Floating-Point to Integer Conversion\n\n"; - - // Test cases - convertFloatToInt(42.7f); // Normal case - convertFloatToInt(-15.3f); // Negative value -> wraps on unsigned - convertFloatToInt(4294967296.0f); // Overflow: 2^32 (UINT32_MAX + 1) - convertFloatToInt(3.4e+38f); // Large float exceeding UINT32_MAX - convertFloatToInt(-3.4e+38f); // Large negative float - convertFloatToInt(NAN); // NaN behavior on different platforms - return 0; -} -``` - -If you need to install the `g++` compiler, run the commands below: - -```bash -sudo apt update -sudo apt install g++ -y -``` - -Compile `converting-float.cpp` on an Arm and x86 machine. - -The compile command is the same on both systems. 
- -```bash -g++ converting-float.cpp -o converting-float -``` - -For easy comparison, the image below shows the x86 output (left) and Arm output (right). The highlighted lines show the difference in output: - -![differences](./differences.png) - -As you can see, there are several cases where different behavior is observed. For example when trying to convert a signed number to an unsigned number or dealing with out-of-bounds numbers. - -## Removing hardcoded values with macros - -The above differences show that explicitly checking for specific values will lead to unportable code. - -For example, the function below checks if the casted result is `0`. This can be misleading — on x86, casting an out-of-range floating-point value to `uint32_t` may wrap to `0`, while on Arm it may behave differently. Relying on these results makes the code unportable. - - - -```cpp -void checkFloatToUint32(float num) { - uint32_t castedNum = static_cast(num); - if (castedNum == 0) { - std::cout << "The casted number is 0, indicating that the float is out of bounds for uint32_t." << std::endl; - } else { - std::cout << "The casted number is: " << castedNum << std::endl; - } -} -``` - -This can simply be corrected by using the macro, `UINT32_MAX`. - -{{% notice Note %}} -To find out all the available compiler-defined macros, you can output them using: -```bash -echo "" | g++ -dM -E - -``` -{{% /notice %}} - -A portable version of the code is: - -```cpp -void checkFloatToUint32(float num) { - uint32_t castedNum = static_cast(num); - if (castedNum == UINT32_MAX) { - std::cout << "The casted number is " << UINT32_MAX << " indicating the float was out of bounds for uint32_t." << std::endl; - } else { - std::cout << "The casted number is: " << castedNum << std::endl; - } -} -``` - diff --git a/content/learning-paths/cross-platform/floating-point-rounding-errors/how-to-3.md b/content/learning-paths/cross-platform/floating-point-rounding-errors/how-to-3.md deleted file mode 100644 index 40f4f964ce..0000000000 --- a/content/learning-paths/cross-platform/floating-point-rounding-errors/how-to-3.md +++ /dev/null @@ -1,79 +0,0 @@ ---- -title: Error propagation -weight: 4 - -### FIXED, DO NOT MODIFY -layout: learningpathall ---- - -## What is error propagation in x86 and Arm systems? - -One cause of different outputs between x86 and Arm stems from the order of instructions and how errors are propagated. As a hypothetical example, an Arm system may decide to reorder the instructions that each have a different rounding error so that subtle changes are observed. - -It is possible that two functions that are mathematically equivalent will propagate errors differently on a computer. - - Functions `f1` and `f2` are mathematically equivalent. You would expect them to return the same value given the same input. - - If the input is a very small number, `1e-8`, the error is different due to the loss in precision caused by different operations. Specifically, `f2` avoids subtracting nearly equal numbers for clarity. For a full description look into the topic of [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability). 
-
-Use an editor to copy and paste the C++ code below into a file named `error-propagation.cpp`:
-
-```cpp
-#include <cmath>
-#include <cstdio>
-
-// Function 1: Computes sqrt(1 + x) - 1 using the naive approach
-float f1(float x) {
-    return sqrtf(1 + x) - 1;
-}
-
-// Function 2: Computes the same value using an algebraically equivalent transformation
-// This version is numerically more stable
-float f2(float x) {
-    return x / (sqrtf(1 + x) + 1);
-}
-
-int main() {
-    float x = 1e-8; // A small value that causes floating-point precision issues
-    float result1 = f1(x);
-    float result2 = f2(x);
-
-    // Theoretically, result1 and result2 should be the same
-    float difference = result1 - result2;
-    // Multiply by a large number to amplify the error
-    float final_result = 100000000.0f * difference + 0.0001f;
-
-    // Print the results
-    printf("f1(%e) = %.10f\n", x, result1);
-    printf("f2(%e) = %.10f\n", x, result2);
-    printf("Difference (f1 - f2) = %.10e\n", difference);
-    printf("Final result after magnification: %.10f\n", final_result);
-
-    return 0;
-}
-```
-
-Compile the code on both x86 and Arm with the following command:
-
-```bash
-g++ -g error-propagation.cpp -o error-propagation
-```
-
-Running the two binaries shows that the second function, `f2`, has a small rounding error on both architectures. Additionally, there is a further rounding difference when run on x86 compared to Arm.
-
-Running on x86:
-
-```output
-f1(1.000000e-08) = 0.0000000000
-f2(1.000000e-08) = 0.0000000050
-Difference (f1 - f2) = -4.9999999696e-09
-Final result after magnification: -0.4999000132
-```
-
-Running on Arm:
-```output
-f1(1.000000e-08) = 0.0000000000
-f2(1.000000e-08) = 0.0000000050
-Difference (f1 - f2) = -4.9999999696e-09
-Final result after magnification: -0.4998999834
-```
diff --git a/content/learning-paths/cross-platform/floating-point-rounding-errors/how-to-4.md b/content/learning-paths/cross-platform/floating-point-rounding-errors/how-to-4.md
deleted file mode 100644
index 159360f812..0000000000
--- a/content/learning-paths/cross-platform/floating-point-rounding-errors/how-to-4.md
+++ /dev/null
@@ -1,90 +0,0 @@
----
-title: Minimizing floating-point variability across platforms
-weight: 5
-
-### FIXED, DO NOT MODIFY
-layout: learningpathall
----
-
-## How can I minimize floating-point variability across x86 and Arm?
-
-The line `#pragma STDC FENV_ACCESS ON` is a directive that informs the compiler to enable access to the floating-point environment. The pragma comes from the C99 standard, and the corresponding `<cfenv>` facilities are part of C++11. It ensures that the program can properly handle floating-point exceptions and rounding modes; floating-point exceptions are status flags rather than C++ exceptions, so your program can test them and continue running.
-
-In the context below, enabling floating-point environment access is crucial because the functions in this example involve floating-point arithmetic, which can be prone to precision errors and exceptions such as overflow, underflow, division by zero, and invalid operations. Although not strictly necessary for this example, the directive is included because it may be relevant for your own applications.
-
-This directive is particularly important when performing operations that require high numerical stability and precision, such as the square root calculations in the functions below. It allows the program to manage the floating-point state and handle any anomalies that occur during these calculations, improving the robustness and reliability of your numerical computations.
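As a minimal illustration of the exception-flag machinery this directive exposes (an added sketch, separate from the example that follows):

```cpp
#include <cfenv>
#include <cstdio>

#pragma STDC FENV_ACCESS ON

int main() {
    std::feclearexcept(FE_ALL_EXCEPT);  // start with clean flags
    volatile double zero = 0.0;         // volatile prevents constant folding
    volatile double r = 1.0 / zero;     // sets the FE_DIVBYZERO flag
    if (std::fetestexcept(FE_DIVBYZERO)) {
        printf("FE_DIVBYZERO was raised (r = %f)\n", r);
    }
    return 0;
}
```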
-
-Use an editor to copy and paste the C++ file below into a file named `error-propagation-min.cpp`:
-
-```cpp
-#include <cfenv>
-#include <cmath>
-#include <cstdio>
-
-// Enable access to the floating-point environment
-#pragma STDC FENV_ACCESS ON
-
-// Function 1: Computes sqrt(1 + x) - 1 using the naive approach
-double f1(double x) {
-    return sqrt(1 + x) - 1;
-}
-
-// Function 2: Computes the same value using an algebraically equivalent transformation
-// This version is numerically more stable
-double f2(double x) {
-    return x / (sqrt(1 + x) + 1);
-}
-
-int main() {
-    // Clear pending floating-point exception flags, then raise selected flags
-    std::feclearexcept(FE_ALL_EXCEPT);
-    std::feraiseexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
-
-    double x = 1e-8; // A small value that causes floating-point precision issues
-    double result1 = f1(x);
-    double result2 = f2(x);
-
-    // Theoretically, result1 and result2 should be the same
-    double difference = result1 - result2;
-    // Multiply by a large number to amplify the error
-    double final_result = 100000000.0 * difference + 0.0001;
-
-    // Print the results
-    printf("f1(%e) = %.10f\n", x, result1);
-    printf("f2(%e) = %.10f\n", x, result2);
-    printf("Difference (f1 - f2) = %.10e\n", difference);
-    printf("Final result after magnification: %.10f\n", final_result);
-
-    return 0;
-}
-```
-
-Compile on both computers using the `-frounding-math` flag.
-
-You should use this flag when your program dynamically changes the floating-point rounding mode or needs to run correctly under different rounding modes. In this example, it ensures that `f1` uses a predictable rounding mode across both x86 and Arm.
-
-```bash
-g++ -o error-propagation-min error-propagation-min.cpp -frounding-math
-```
-
-Running the new binary on both systems shows that function `f1` produces a value nearly identical to `f2`, and the difference between them is now identical across both Arm64 and x86.
-
-```bash
-./error-propagation-min
-```
-
-Here is the output on both systems:
-
-```output
-f1(1.000000e-08) = 0.0000000050
-f2(1.000000e-08) = 0.0000000050
-Difference (f1 - f2) = -1.7887354748e-17
-Final result after magnification: 0.0000999982
-```
-
-G++ provides several compiler flags to help balance accuracy and performance. For example, `-ffp-contract` controls whether the compiler can contract expressions into lossy fused operations, such as fused multiply-add (FMA).
-
-Another example is `-ffloat-store`, which prevents floating-point variables from being kept in registers, where they can carry different levels of precision and rounding.
-
-You can refer to compiler documentation for more information on the flags available.
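To see why `-frounding-math` matters, here is a small added sketch (illustrative only) that evaluates the same division under each standard rounding mode; without the flag, the compiler is free to fold the expression under the default round-to-nearest mode:

```cpp
#include <cfenv>
#include <cstdio>

#pragma STDC FENV_ACCESS ON

int main() {
    const int   modes[] = {FE_TONEAREST, FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO};
    const char *names[] = {"to nearest", "upward", "downward", "toward zero"};
    volatile float a = 1.0f, b = 3.0f; // volatile keeps the division at run time
    for (int i = 0; i < 4; i++) {
        std::fesetround(modes[i]);
        printf("%-12s 1/3 = %.10f\n", names[i], (double)(a / b));
    }
    return 0;
}
```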
- diff --git a/content/learning-paths/cross-platform/function-multiversioning/_index.md b/content/learning-paths/cross-platform/function-multiversioning/_index.md index f430235fd5..f0b76e2cde 100644 --- a/content/learning-paths/cross-platform/function-multiversioning/_index.md +++ b/content/learning-paths/cross-platform/function-multiversioning/_index.md @@ -28,7 +28,8 @@ armips: - Cortex-A - Neoverse tools_software_languages: - - C/C++ + - C + - C++ - Runbook operatingsystems: - Linux diff --git a/content/learning-paths/cross-platform/integer-vs-floats/_index.md b/content/learning-paths/cross-platform/integer-vs-floats/_index.md index 20ef23de0a..7bb7d5077a 100644 --- a/content/learning-paths/cross-platform/integer-vs-floats/_index.md +++ b/content/learning-paths/cross-platform/integer-vs-floats/_index.md @@ -23,7 +23,6 @@ armips: tools_software_languages: - GCC - Clang - - Coding - Runbook operatingsystems: diff --git a/content/learning-paths/cross-platform/intrinsics/_index.md b/content/learning-paths/cross-platform/intrinsics/_index.md index 832cbfbc44..9f286a16ca 100644 --- a/content/learning-paths/cross-platform/intrinsics/_index.md +++ b/content/learning-paths/cross-platform/intrinsics/_index.md @@ -35,9 +35,8 @@ armips: operatingsystems: - Linux tools_software_languages: - - Neon + - NEON - SVE - - Coding - Intrinsics - Runbook diff --git a/content/learning-paths/cross-platform/ipexplorer/_index.md b/content/learning-paths/cross-platform/ipexplorer/_index.md index e1df8ecca1..1a43958dba 100644 --- a/content/learning-paths/cross-platform/ipexplorer/_index.md +++ b/content/learning-paths/cross-platform/ipexplorer/_index.md @@ -25,7 +25,6 @@ armips: operatingsystems: - Baremetal tools_software_languages: - - Coding - IP Explorer ### Cross-platform metadata only diff --git a/content/learning-paths/cross-platform/kleidiai-explainer/_index.md b/content/learning-paths/cross-platform/kleidiai-explainer/_index.md index 5e13bfdfd5..f84de2efc8 100644 --- a/content/learning-paths/cross-platform/kleidiai-explainer/_index.md +++ b/content/learning-paths/cross-platform/kleidiai-explainer/_index.md @@ -23,8 +23,7 @@ armips: - Neoverse tools_software_languages: - C++ - - GenAI - - Coding + - Generative AI - NEON - Runbook diff --git a/content/learning-paths/cross-platform/llm-fine-tuning-for-web-applications/_index.md b/content/learning-paths/cross-platform/llm-fine-tuning-for-web-applications/_index.md index 166d5a70fb..ad94a942b5 100644 --- a/content/learning-paths/cross-platform/llm-fine-tuning-for-web-applications/_index.md +++ b/content/learning-paths/cross-platform/llm-fine-tuning-for-web-applications/_index.md @@ -36,7 +36,7 @@ armips: tools_software_languages: - LLM - - GenAI + - Generative AI - Python - PyTorch - ExecuTorch diff --git a/content/learning-paths/cross-platform/loop-reflowing/_index.md b/content/learning-paths/cross-platform/loop-reflowing/_index.md index 291a818b9f..71c40ef573 100644 --- a/content/learning-paths/cross-platform/loop-reflowing/_index.md +++ b/content/learning-paths/cross-platform/loop-reflowing/_index.md @@ -22,7 +22,6 @@ armips: tools_software_languages: - GCC - Clang - - Coding - Runbook operatingsystems: diff --git a/content/learning-paths/cross-platform/matrix/_index.md b/content/learning-paths/cross-platform/matrix/_index.md index 5cefa6eb0a..1a0589ddf5 100644 --- a/content/learning-paths/cross-platform/matrix/_index.md +++ b/content/learning-paths/cross-platform/matrix/_index.md @@ -32,7 +32,7 @@ tools_software_languages: - GCC - Clang - CMake - - 
GoogleTest + - Google Test - Runbook operatingsystems: diff --git a/content/learning-paths/cross-platform/mca-godbolt/_index.md b/content/learning-paths/cross-platform/mca-godbolt/_index.md index 4aa889e061..01f4449176 100644 --- a/content/learning-paths/cross-platform/mca-godbolt/_index.md +++ b/content/learning-paths/cross-platform/mca-godbolt/_index.md @@ -23,7 +23,7 @@ armips: - Neoverse - Cortex-A tools_software_languages: - - assembly + - Assembly - llvm-mca - Runbook diff --git a/content/learning-paths/cross-platform/memory-latency/_index.md b/content/learning-paths/cross-platform/memory-latency/_index.md index af5b1cbb14..ddd715b8ed 100644 --- a/content/learning-paths/cross-platform/memory-latency/_index.md +++ b/content/learning-paths/cross-platform/memory-latency/_index.md @@ -23,7 +23,6 @@ armips: tools_software_languages: - GCC - Clang - - Coding - Runbook operatingsystems: diff --git a/content/learning-paths/cross-platform/psa-tfm/_index.md b/content/learning-paths/cross-platform/psa-tfm/_index.md index daf7cbb4b0..4e16f97ab1 100644 --- a/content/learning-paths/cross-platform/psa-tfm/_index.md +++ b/content/learning-paths/cross-platform/psa-tfm/_index.md @@ -30,7 +30,7 @@ operatingsystems: - Linux tools_software_languages: - Trusted Firmware - - Fixed Virtual Platform + - FVP - GCC diff --git a/content/learning-paths/cross-platform/pytorch-digit-classification-arch-training/_index.md b/content/learning-paths/cross-platform/pytorch-digit-classification-arch-training/_index.md index 9f91195792..633a74f4dc 100644 --- a/content/learning-paths/cross-platform/pytorch-digit-classification-arch-training/_index.md +++ b/content/learning-paths/cross-platform/pytorch-digit-classification-arch-training/_index.md @@ -34,8 +34,7 @@ operatingsystems: - macOS tools_software_languages: - Android Studio - - Coding - - VS Code + - Visual Studio Code shared_path: true shared_between: - servers-and-cloud-computing diff --git a/content/learning-paths/cross-platform/restrict-keyword-c99/_index.md b/content/learning-paths/cross-platform/restrict-keyword-c99/_index.md index 223c436a3b..b4db2cb766 100644 --- a/content/learning-paths/cross-platform/restrict-keyword-c99/_index.md +++ b/content/learning-paths/cross-platform/restrict-keyword-c99/_index.md @@ -24,7 +24,6 @@ tools_software_languages: - GCC - Clang - SVE2 - - Coding - Runbook operatingsystems: diff --git a/content/learning-paths/cross-platform/rust_armds/_index.md b/content/learning-paths/cross-platform/rust_armds/_index.md index aa68cf3b0c..8a38c15fff 100644 --- a/content/learning-paths/cross-platform/rust_armds/_index.md +++ b/content/learning-paths/cross-platform/rust_armds/_index.md @@ -26,7 +26,6 @@ armips: operatingsystems: - Baremetal tools_software_languages: - - Coding - IP Explorer ### Cross-platform metadata only diff --git a/content/learning-paths/cross-platform/simd-info-demo/_index.md b/content/learning-paths/cross-platform/simd-info-demo/_index.md index 9f7d15a985..13e2b879be 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/_index.md +++ b/content/learning-paths/cross-platform/simd-info-demo/_index.md @@ -26,7 +26,6 @@ armips: tools_software_languages: - GCC - Clang - - Coding - Rust - Runbook diff --git a/content/learning-paths/cross-platform/simd-loops/1-about.md b/content/learning-paths/cross-platform/simd-loops/1-about.md new file mode 100644 index 0000000000..6d798ad108 --- /dev/null +++ b/content/learning-paths/cross-platform/simd-loops/1-about.md @@ -0,0 +1,70 @@ +--- +title: About single instruction, 
multiple data (SIMD) loops +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +Writing high-performance software for Arm processors often involves delving into +SIMD technologies. For many developers, that journey started with NEON, a +familiar, fixed-width vector extension that has been around for many years. But as +Arm architectures continue to evolve, so do their SIMD technologies. + +Enter the world of Scalable Vector Extension (SVE) and Scalable Matrix Extension (SME): two powerful, scalable vector extensions designed for modern +workloads. Unlike NEON, they are not just wider; they are fundamentally different. These +extensions introduce new instructions, more flexible programming models, and +support for concepts like predication, scalable vectors, and streaming modes. +However, they also come with a learning curve. + +That is where [SIMD Loops](https://gitlab.arm.com/architecture/simd-loops) becomes a valuable resource, enabling you to quickly and effectively learn how to write high-performance SIMD code. + +SIMD Loops is designed to help +you learn how to write SVE and SME code. It is a collection +of self-contained, real-world loop kernels written in a mix of C, Arm C Language Extensions (ACLE) +intrinsics, and inline assembly. These kernels target tasks ranging from simple arithmetic +to matrix multiplication, sorting, and string processing. You can compile them, +run them, step through them, and use them as a foundation for your own SIMD +work. + +If you are familiar with NEON intrinsics, you can use SIMD Loops to learn and explore SVE and SME. + +## What is SIMD Loops? + +SIMD Loops is an open-source +project, licensed under BSD 3-Clause, built to help you learn how to write SIMD code for modern Arm +architectures, specifically using SVE and SME. +It is designed for programmers who already know +their way around NEON intrinsics but are now facing the more powerful and +complex world of SVE and SME. + +The goal of SIMD Loops is to provide working, readable examples that demonstrate +how to use the full range of features available in SVE, SVE2, and SME2. Each +example is a self-contained loop kernel, a small piece of code that performs +a specific task like matrix multiplication, vector reduction, histogram, or +memory copy. These examples show how that task can be implemented across different +vector instruction sets. + +Unlike a cookbook that tries to provide a recipe for every problem, SIMD Loops +takes the opposite approach. It aims to showcase the architecture rather than +the problem. The loop kernels are chosen to be realistic and meaningful, but the +main goal is to demonstrate how specific features and instructions work in +practice. If you are trying to understand scalability, predication, +gather/scatter, streaming mode, ZA storage, compact instructions, or the +mechanics of matrix tiles, this is where you will see them in action. + +The project includes: +- Dozens of numbered loop kernels, each focused on a specific feature or pattern +- Reference C implementations to establish expected behavior +- Inline assembly and/or intrinsics for scalar, NEON, SVE, SVE2, SVE2.1, SME2, and SME2.1 +- Build support for different instruction sets, with runtime validation +- A simple command-line runner to execute any loop interactively +- Optional standalone binaries for bare-metal and simulator use + +You do not need to worry about auto-vectorization, compiler flags, or tooling +quirks. 
Each loop is hand-written and annotated to make the use of SIMD features
+clear. The intent is that you can study, modify, and run each loop as a learning
+exercise, and use the project as a foundation for your own exploration of
+Arm’s vector extensions.
+
+
diff --git a/content/learning-paths/cross-platform/simd-loops/2-using.md b/content/learning-paths/cross-platform/simd-loops/2-using.md
new file mode 100644
index 0000000000..86328c023d
--- /dev/null
+++ b/content/learning-paths/cross-platform/simd-loops/2-using.md
@@ -0,0 +1,77 @@
+---
+title: Using SIMD Loops
+weight: 4
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+To get started, clone the SIMD Loops project and change into the project directory:
+
+```bash
+git clone https://gitlab.arm.com/architecture/simd-loops simd-loops.git
+cd simd-loops.git
+```
+
+## SIMD Loops structure
+
+In the SIMD Loops project, the source code for the loops is organized under the
+`loops` directory. The complete list of loops is documented in the `loops.inc`
+file, which includes a brief description and the purpose of each loop. Every
+loop is associated with a uniquely named source file following the naming
+pattern `loop_<n>.c`, where `<n>` represents the loop number.
+
+A loop is structured as follows:
+
+```C
+// Includes and loop_<n>_data structure definition
+
+#if defined(HAVE_NATIVE) || defined(HAVE_AUTOVEC)
+
+// C code
+void inner_loop_<n>(struct loop_<n>_data *data) { ... }
+
+#if defined(HAVE_xxx_INTRINSICS)
+
+// Intrinsics versions: xxx = SME, SVE, or SIMD (NEON)
+void inner_loop_<n>(struct loop_<n>_data *data) { ... }
+
+#elif defined(<arch_macro>)
+
+// Hand-written inline assembly:
+// <arch_macro> = __ARM_FEATURE_SME2p1, __ARM_FEATURE_SME2, __ARM_FEATURE_SVE2p1,
+// __ARM_FEATURE_SVE2, __ARM_FEATURE_SVE, or __ARM_NEON
+void inner_loop_<n>(struct loop_<n>_data *data) { ... }
+
+#else
+
+#error "No implementations available for this target."
+
+#endif
+
+// Main function of the loop: buffer allocation, loop function call, functional check of results
+```
+
+Each loop is implemented in several SIMD extension variants, and conditional
+compilation is used to select one of the optimizations for the
+`inner_loop_<n>` function. The native C implementation comes first; it is used
+either when building natively (`HAVE_NATIVE`) or through compiler
+auto-vectorization (`HAVE_AUTOVEC`). When SIMD ACLE is supported (for example,
+SME, SVE, or NEON), the code is compiled using high-level intrinsics. If ACLE
+support is not available, the build process falls back to handwritten inline
+assembly targeting one of the available SIMD extensions, such as SME2.1, SME2,
+SVE2.1, SVE2, and others. The overall code structure also includes setup and
+cleanup code in the main function, where memory buffers are allocated, the
+selected loop kernel is executed, and results are verified for correctness.
+
+At compile time, you can select which loop optimization to compile, whether it
+is based on SME or SVE intrinsics, or one of the available inline assembly
+variants (`make scalar neon sve2 sme2 sve2p1 sme2p1 sve_intrinsics
+sme_intrinsics` ...).
+
+The build generates two types of binaries. The first is a single executable
+named `simd_loops`, which includes all the loop implementations. A specific
+loop can be selected by passing parameters to the program (for example,
+`simd_loops -k <loop_number> -n <iterations>`). The second type consists of
+individual standalone binaries, each corresponding to a specific loop.
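To make the skeleton above concrete, here is a hypothetical miniature of the same pattern (an added sketch, not a file from the repository): a vector-add kernel with a plain C-style reference and a NEON intrinsics variant, selected by the same kind of feature guards.

```cpp
#include <cstddef>

struct loop_data { size_t n; float *a, *b, *c; };

#if defined(HAVE_NATIVE) || defined(HAVE_AUTOVEC)
// Reference implementation; under HAVE_AUTOVEC the compiler may vectorize it.
void inner_loop(loop_data *d) {
    for (size_t i = 0; i < d->n; i++) d->c[i] = d->a[i] + d->b[i];
}
#elif defined(__ARM_NEON)
#include <arm_neon.h>
// NEON intrinsics variant; assumes n is a multiple of 4 for brevity.
void inner_loop(loop_data *d) {
    for (size_t i = 0; i < d->n; i += 4) {
        float32x4_t va = vld1q_f32(&d->a[i]);
        float32x4_t vb = vld1q_f32(&d->b[i]);
        vst1q_f32(&d->c[i], vaddq_f32(va, vb));
    }
}
#else
#error "No implementation available for this target."
#endif
```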
diff --git a/content/learning-paths/cross-platform/simd-loops/3-example.md b/content/learning-paths/cross-platform/simd-loops/3-example.md
new file mode 100644
index 0000000000..fa3b614a40
--- /dev/null
+++ b/content/learning-paths/cross-platform/simd-loops/3-example.md
@@ -0,0 +1,281 @@
+---
+title: Code example
+weight: 5
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+To illustrate the structure and design principles of SIMD Loops, consider loop
+202 as an example. `inner_loop_202` is defined at lines 69-79 in the file
+`loops/loop_202.c` and calls the `matmul_fp32` routine defined in
+`matmul_fp32.c`.
+
+Open `loops/matmul_fp32.c`.
+
+This loop implements a single-precision floating-point matrix multiplication of
+the form:
+
+`C[M x N] = A[M x K] x B[K x N]`
+
+A matrix multiplication can be understood in two equivalent ways:
+- As the dot product between each row of matrix `A` and each column of matrix `B`.
+- As the sum of outer products between the columns of `A` and the rows of `B`.
+
+## Data structure
+
+The loop begins by defining the data structure, which captures the matrix
+dimensions (`M`, `K`, `N`) along with input and output buffers:
+
+```C
+struct loop_202_data {
+  uint64_t m;
+  uint64_t n;
+  uint64_t k;
+  float *restrict a;
+  float *restrict b;
+  float *restrict c;
+};
+```
+
+For this loop:
+- The first input matrix (`A`) is stored in column-major format in memory.
+- The second input matrix (`B`) is stored in row-major format in memory.
+- None of the memory areas designated by `a`, `b`, and `c` alias (that is, none
+  of them overlap), as indicated by the `restrict` keyword.
+
+This layout choice helps optimize memory access patterns for all the targeted
+SIMD architectures.
+
+## Loop attributes
+
+Next, the loop attributes are specified depending on the target architecture:
+- For SME targets, the function `inner_loop_202` must be invoked with the
+  `__arm_streaming` attribute, using a shared `ZA` register context
+  (`__arm_inout("za")`). These attributes are wrapped in the `LOOP_ATTR` macro.
+- For SVE or NEON targets, no additional attributes are required.
+
+This design enables portability across different SIMD extensions.
+
+## Function implementation
+
+The `matmul_fp32` function in the file `loops/matmul_fp32.c` provides several
+optimized implementations of the single-precision floating-point matrix
+multiplication, including ACLE intrinsics-based code and hand-optimized
+assembly code.
+
+### Scalar code
+
+A scalar C implementation is provided at lines 40-52. This version follows the
+dot-product formulation of matrix multiplication, serving both as a functional
+reference and a baseline for auto-vectorization:
+
+```C { line_numbers="true", line_start="40" }
+  for (uint64_t x = 0; x < m; x++) {
+    for (uint64_t y = 0; y < n; y++) {
+      c[x * n + y] = 0.0f;
+    }
+  }
+
+  // Loops ordered for contiguous memory access in inner loop
+  for (uint64_t z = 0; z < k; z++)
+    for (uint64_t x = 0; x < m; x++) {
+      for (uint64_t y = 0; y < n; y++) {
+        c[x * n + y] += a[z * m + x] * b[z * n + y];
+      }
+    }
+```
+
+### SVE optimized code
+
+The SVE implementation uses the indexed floating-point multiply-accumulate
+(`fmla`) instruction to optimize the matrix multiplication operation. In this
+formulation, the outer product is decomposed into multiple indexed
+multiplication steps, with results accumulated directly into `Z` registers.
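To make the indexed decomposition concrete, here is an added scalar model (not code from the repository) of one K-step: each of the 4 floats loaded from `A` is multiplied against a whole vector chunk of `B` and accumulated into its own accumulator row. In the SVE code, each inner loop below corresponds to a single indexed `fmla` on `Z` registers.

```cpp
#include <cstddef>

// acc holds 4 rows of vl partial sums; vl models the vector length in floats.
void indexed_fmla_kstep(const float a_chunk[4], const float *b_vec,
                        size_t vl, float *acc) {
    for (size_t lane = 0; lane < 4; lane++) {   // the index operand of fmla
        for (size_t j = 0; j < vl; j++) {       // vector lanes loaded from B
            acc[lane * vl + j] += a_chunk[lane] * b_vec[j];
        }
    }
}
```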
+
+In the intrinsic version (lines 167-210), the innermost loop is structured as follows:
+
+```C { line_numbers = "true", line_start="167"}
+  for (m_idx = 0; m_idx < m; m_idx += 8) {
+    for (n_idx = 0; n_idx < n; n_idx += svcntw() * 2) {
+      ZERO_PAIR(0);
+      ZERO_PAIR(1);
+      ZERO_PAIR(2);
+      ZERO_PAIR(3);
+      ZERO_PAIR(4);
+      ZERO_PAIR(5);
+      ZERO_PAIR(6);
+      ZERO_PAIR(7);
+
+      ptr_a = &a[m_idx];
+      ptr_b = &b[n_idx];
+      while (ptr_a < cnd_k) {
+        lda_0 = LOADA_PAIR(0);
+        lda_1 = LOADA_PAIR(1);
+        ldb_0 = LOADB_PAIR(0);
+        ldb_1 = LOADB_PAIR(1);
+
+        MLA_GROUP(0);
+        MLA_GROUP(1);
+        MLA_GROUP(2);
+        MLA_GROUP(3);
+        MLA_GROUP(4);
+        MLA_GROUP(5);
+        MLA_GROUP(6);
+        MLA_GROUP(7);
+
+        ptr_a += m * 2;
+        ptr_b += n * 2;
+      }
+
+      ptr_c = &c[n_idx];
+      STORE_PAIR(0);
+      STORE_PAIR(1);
+      STORE_PAIR(2);
+      STORE_PAIR(3);
+      STORE_PAIR(4);
+      STORE_PAIR(5);
+      STORE_PAIR(6);
+      STORE_PAIR(7);
+    }
+    c += n * 8;
+  }
+```
+
+At the beginning of the loop, the accumulators (`Z` registers) are explicitly
+initialized to zero. This is achieved using the `svdup` intrinsic (or its
+equivalent `dup` assembly instruction), encapsulated in the `ZERO_PAIR` macro.
+
+Within each iteration over the `K` dimension:
+- 128 bits (four consecutive floating-point values) are loaded from the matrix
+  `A`, using the load-replicate `svld1rq` intrinsic (or `ld1rqw` in assembly)
+  in the `LOADA_PAIR` macro.
+- Two consecutive vectors are loaded from matrix `B`, using the SVE load
+  instructions called by the `LOADB_PAIR` macro.
+- A sequence of indexed multiply-accumulate operations is performed, computing
+  the product of each element from `A` with the vectors from `B`.
+- The results are accumulated across the 16 `Z` register accumulators,
+  progressively building the partial results of the matrix multiplication.
+
+After completing all iterations across the `K` dimension, the accumulated
+results in the `Z` registers are stored back to memory. The `STORE_PAIR` macro
+writes the values into the corresponding locations of the output matrix `C`.
+
+The equivalent SVE hand-optimized assembly code is at lines 478-598.
+
+This loop showcases how SVE registers and indexed `fmla` instructions enable
+efficient decomposition of the outer-product formulation into parallel,
+vectorized accumulation steps.
+
+For more details on SVE/SVE2 instruction semantics, optimization guidelines,
+and other documentation, refer to the [Scalable Vector Extensions
+resources](https://developer.arm.com/Architectures/Scalable%20Vector%20Extensions).
+
+### SME2 optimized code
+
+The SME2 implementation leverages the outer-product formulation of the matrix
+multiplication, utilizing the `fmopa` SME instruction to perform the outer
+product and accumulate partial results in `ZA` tiles.
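As an added scalar model (not code from the repository) of what a single `fmopa` performs: the outer product of two vectors, accumulated into a square tile of partial sums. In SME, the tile corresponds to a `ZA` tile and `vl` to the streaming vector length in 32-bit elements.

```cpp
#include <cstddef>

// tile holds vl x vl accumulators, updated with the outer product a_col x b_row.
void fmopa_model(const float *a_col, const float *b_row, size_t vl,
                 float *tile) {
    for (size_t i = 0; i < vl; i++) {
        for (size_t j = 0; j < vl; j++) {
            tile[i * vl + j] += a_col[i] * b_row[j];
        }
    }
}
```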
+
+A snippet of the loop is shown below:
+
+```C { line_numbers = "true", line_start="78"}
+#if defined(__ARM_FEATURE_SME2p1)
+  svzero_za();
+#endif
+
+  for (m_idx = 0; m_idx < m; m_idx += svl_s * 2) {
+    for (n_idx = 0; n_idx < n; n_idx += svl_s * 2) {
+#if !defined(__ARM_FEATURE_SME2p1)
+      svzero_za();
+#endif
+
+      ptr_a = &a[m_idx];
+      ptr_b = &b[n_idx];
+      while (ptr_a < cnd_k) {
+        vec_a0 = svld1_x2(c_all, &ptr_a[0]);
+        vec_b0 = svld1_x2(c_all, &ptr_b[0]);
+        vec_a1 = svld1_x2(c_all, &ptr_a[m]);
+        vec_b1 = svld1_x2(c_all, &ptr_b[n]);
+
+        MOPA_TILE(0, 0, 0, 0);
+        MOPA_TILE(1, 0, 0, 1);
+        MOPA_TILE(2, 0, 1, 0);
+        MOPA_TILE(3, 0, 1, 1);
+        MOPA_TILE(0, 1, 0, 0);
+        MOPA_TILE(1, 1, 0, 1);
+        MOPA_TILE(2, 1, 1, 0);
+        MOPA_TILE(3, 1, 1, 1);
+
+        ptr_a += m * 2;
+        ptr_b += n * 2;
+      }
+
+      ptr_c = &c[n_idx];
+      for (l_idx = 0; l_idx < l_cnd; l_idx += 8) {
+#if defined(__ARM_FEATURE_SME2p1)
+        vec_c0 = svreadz_hor_za8_u8_vg4(0, l_idx + 0);
+        vec_c1 = svreadz_hor_za8_u8_vg4(0, l_idx + 4);
+#else
+        vec_c0 = svread_hor_za8_u8_vg4(0, l_idx + 0);
+        vec_c1 = svread_hor_za8_u8_vg4(0, l_idx + 4);
+#endif
+
+        STORE_PAIR(0, 0, 1, 0);
+        STORE_PAIR(1, 0, 1, n);
+        STORE_PAIR(0, 2, 3, c_blk);
+        STORE_PAIR(1, 2, 3, c_off);
+
+        ptr_c += n * 2;
+      }
+    }
+    c += c_blk * 2;
+  }
+```
+
+Within the SME2 intrinsics code (lines 91-106), the innermost loop iterates
+across the `K` dimension, which corresponds to the columns of matrix `A` and
+the rows of matrix `B`.
+
+In each iteration:
+- Two consecutive vectors are loaded from `A` and two consecutive vectors are
+  loaded from `B` (`vec_a0`/`vec_a1` and `vec_b0`/`vec_b1`), using the
+  multi-vector load instructions.
+- The `fmopa` instruction, encapsulated within the `MOPA_TILE` macro, computes
+  the outer product of the input vectors.
+- The results are accumulated into the four 32-bit `ZA` tiles.
+
+After all iterations over the `K` dimension, the accumulated results are stored
+back to memory through a store loop at lines 111-124.
+
+During this phase, four rows of `ZA` tiles are read out into four `Z` vectors
+using the `svread_hor_za8_u8_vg4` intrinsic (or the equivalent `mova` assembly
+instruction). The vectors are then stored into the output buffer with SME
+multi-vector `st1w` store instructions, wrapped in the `STORE_PAIR` macro.
+
+The equivalent SME2 hand-optimized code is at lines 229-340.
+
+For more details on instruction semantics and SME/SME2 optimization guidelines,
+refer to the official [SME Programmer's
+Guide](https://developer.arm.com/documentation/109246/latest/).
+
+## Other optimizations
+
+Beyond the SME2 and SVE implementations shown above, this loop also includes
+several alternative optimized versions, each leveraging architecture-specific
+features.
+
+### NEON
+
+The NEON version (lines 612-710) relies on multiple-structure load/store
+instructions combined with indexed `fmla` instructions to vectorize the matrix
+multiplication operation.
+
+### SVE2.1
+
+The SVE2.1 implementation (lines 355-462) extends the base SVE approach by
+utilizing multi-vector load and store instructions.
+
+### SME2.1
+
+The SME2.1 version leverages the `movaz` instruction / `svreadz_hor_za8_u8_vg4`
+intrinsic to simultaneously reinitialize `ZA` tile accumulators while moving
+data out to registers.
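As an added sketch (not code from the repository) of the read-and-zero semantics that `movaz` / `svreadz_*` provide: the accumulator row is read out and cleared in one step, so the next block of output can start accumulating without a separate zeroing pass.

```cpp
#include <cstddef>
#include <cstring>

// Model of read-and-zero: move one accumulator row out and reset it for reuse.
void readz_row(float *za_row, float *out, size_t vl) {
    std::memcpy(out, za_row, vl * sizeof(float)); // read the row out
    std::memset(za_row, 0, vl * sizeof(float));   // reinitialize in one step
}
```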
diff --git a/content/learning-paths/cross-platform/simd-loops/4-conclusion.md b/content/learning-paths/cross-platform/simd-loops/4-conclusion.md new file mode 100644 index 0000000000..d1e85d10d0 --- /dev/null +++ b/content/learning-paths/cross-platform/simd-loops/4-conclusion.md @@ -0,0 +1,26 @@ +--- +title: Conclusion +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +SIMD Loops is an invaluable +resource for developers looking to learn or master the intricacies of SVE and +SME on modern Arm architectures. By providing practical, hands-on examples, it +bridges the gap between the architecture specification and real-world +application. Whether you're transitioning from NEON or starting fresh with SVE +and SME, SIMD Loops offers a comprehensive toolkit to enhance your understanding +and proficiency. + +With its extensive collection of loop kernels, detailed documentation, and +flexible build options, SIMD Loops empowers you to explore +and leverage the full potential of Arm's advanced vector extensions. Dive into +the project, experiment with the examples, and take your high-performance coding +skills for Arm to the next level. + +For more information and to get started, visit the GitLab project and refer +to the +[README.md](https://gitlab.arm.com/architecture/simd-loops/-/blob/main/README.md) +for instructions on building and running the code. diff --git a/content/learning-paths/cross-platform/simd-loops/_index.md b/content/learning-paths/cross-platform/simd-loops/_index.md new file mode 100644 index 0000000000..d10ffa777f --- /dev/null +++ b/content/learning-paths/cross-platform/simd-loops/_index.md @@ -0,0 +1,95 @@ +--- +title: "Code kata: perfect your SVE and SME skills with SIMD Loops" + +minutes_to_complete: 30 + +draft: true +cascade: + draft: true + +who_is_this_for: This is an advanced topic for software developers who want to learn how to use the full range of features available in SVE, SVE2 and SME2 to improve software performance on Arm processors. + +learning_objectives: + - Improve SIMD code performance using Scalable Vector Extension (SVE) and Scalable Matrix Extension (SME). + +prerequisites: + - An AArch64 computer running Linux or macOS. You can use cloud instances, refer to [Get started with Arm-based cloud instances](/learning-paths/servers-and-cloud-computing/csp/) for a list of cloud service providers. + - Some familiarity with SIMD programming and NEON intrinsics. 
+ +author: + - Alejandro Martinez Vicente + - Mohamad Najem + +### Tags +skilllevels: Advanced +subjects: Performance and Architecture +armips: + - Neoverse +operatingsystems: + - Linux + - macOS +tools_software_languages: + - GCC + - Clang + - FVP + +shared_path: true +shared_between: + - servers-and-cloud-computing + - laptops-and-desktops + - mobile-graphics-and-gaming + - automotive + +further_reading: + - resource: + title: SVE Programming Examples + link: https://developer.arm.com/documentation/dai0548/latest + type: documentation + - resource: + title: Port Code to Arm Scalable Vector Extension (SVE) + link: https://learn.arm.com/learning-paths/servers-and-cloud-computing/sve + type: website + - resource: + title: Introducing the Scalable Matrix Extension for the Armv9-A Architecture + link: https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/scalable-matrix-extension-armv9-a-architecture + type: website + - resource: + title: Arm Scalable Matrix Extension (SME) Introduction (Part 1) + link: https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction + type: blog + - resource: + title: Arm Scalable Matrix Extension (SME) Introduction (Part 2) + link: https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction-p2 + type: blog + - resource: + title: (Part 3) Matrix-matrix multiplication. Neon, SVE, and SME compared + link: https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/matrix-matrix-multiplication-neon-sve-and-sme-compared + type: blog + - resource: + title: Build adaptive libraries with multiversioning + link: https://learn.arm.com/learning-paths/cross-platform/function-multiversioning/ + type: website + - resource: + title: SME Programmer's Guide + link: https://developer.arm.com/documentation/109246/latest + type: documentation + - resource: + title: Compiler Intrinsics + link: https://en.wikipedia.org/wiki/Intrinsic_function + type: website + - resource: + title: ACLE - Arm C Language Extension + link: https://github.com/ARM-software/acle + type: website + - resource: + title: Application Binary Interface for the Arm Architecture + link: https://github.com/ARM-software/abi-aa + type: website + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/cross-platform/simd-loops/_next-steps.md b/content/learning-paths/cross-platform/simd-loops/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/cross-platform/simd-loops/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/content/learning-paths/cross-platform/simd-on-rust/_index.md b/content/learning-paths/cross-platform/simd-on-rust/_index.md index 1c7f5fdf7b..a073acd562 100644 --- a/content/learning-paths/cross-platform/simd-on-rust/_index.md +++ b/content/learning-paths/cross-platform/simd-on-rust/_index.md @@ -22,7 +22,6 @@ armips: tools_software_languages: - GCC - Clang - - Coding - Rust - Runbook diff --git a/content/learning-paths/cross-platform/vectorization-friendly-data-layout/_index.md b/content/learning-paths/cross-platform/vectorization-friendly-data-layout/_index.md index 2533dd7d82..e8f1a0020e 100644 --- a/content/learning-paths/cross-platform/vectorization-friendly-data-layout/_index.md +++ b/content/learning-paths/cross-platform/vectorization-friendly-data-layout/_index.md @@ -22,7 +22,6 @@ armips: tools_software_languages: - GCC - Clang - - Coding - Runbook operatingsystems: diff --git a/content/learning-paths/cross-platform/zenoh-multinode-ros2/_index.md b/content/learning-paths/cross-platform/zenoh-multinode-ros2/_index.md index f51f506190..0752fb1626 100644 --- a/content/learning-paths/cross-platform/zenoh-multinode-ros2/_index.md +++ b/content/learning-paths/cross-platform/zenoh-multinode-ros2/_index.md @@ -26,7 +26,7 @@ armips: - Cortex-A - Neoverse tools_software_languages: - - ROS2 + - ROS 2 - C - Raspberry Pi - Zenoh diff --git a/content/learning-paths/embedded-and-microcontrollers/armds/_index.md b/content/learning-paths/embedded-and-microcontrollers/armds/_index.md index 1a7b785db9..09d6b9bc51 100644 --- a/content/learning-paths/embedded-and-microcontrollers/armds/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/armds/_index.md @@ -30,7 +30,6 @@ tools_software_languages: - Arm Compiler for Embedded - Arm Fast Models - DSTREAM - - Coding further_reading: - resource: diff --git a/content/learning-paths/embedded-and-microcontrollers/asm/_index.md b/content/learning-paths/embedded-and-microcontrollers/asm/_index.md index b763d61bc1..68ae741c0c 100644 --- a/content/learning-paths/embedded-and-microcontrollers/asm/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/asm/_index.md @@ -25,8 +25,7 @@ armips: operatingsystems: - Baremetal tools_software_languages: - - Coding - - Keil + - Keil MDK further_reading: - resource: diff --git a/content/learning-paths/embedded-and-microcontrollers/avh_ppocr/_index.md b/content/learning-paths/embedded-and-microcontrollers/avh_ppocr/_index.md index d3841a0607..c71b2d278a 100644 --- a/content/learning-paths/embedded-and-microcontrollers/avh_ppocr/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/avh_ppocr/_index.md @@ -27,7 +27,6 @@ operatingsystems: - Baremetal tools_software_languages: - Arm Virtual Hardware - - Coding - GCC - Paddle - TVMC diff --git a/content/learning-paths/embedded-and-microcontrollers/bare-metal/_index.md b/content/learning-paths/embedded-and-microcontrollers/bare-metal/_index.md index bddfbc0cb0..cf18a919bd 100644 --- a/content/learning-paths/embedded-and-microcontrollers/bare-metal/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/bare-metal/_index.md @@ -25,7 +25,6 @@ armips: operatingsystems: - Baremetal tools_software_languages: - - Coding - Arm Development Studio - Arm Compiler for Embedded - Arm Fast Models diff --git a/content/learning-paths/embedded-and-microcontrollers/cloud-native-deployment-on-hybrid-edge-systems/_index.md 
b/content/learning-paths/embedded-and-microcontrollers/cloud-native-deployment-on-hybrid-edge-systems/_index.md index b8f6e35257..37cd2e44f7 100644 --- a/content/learning-paths/embedded-and-microcontrollers/cloud-native-deployment-on-hybrid-edge-systems/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/cloud-native-deployment-on-hybrid-edge-systems/_index.md @@ -25,7 +25,7 @@ armips: - Cortex-A tools_software_languages: - Docker - - AVH + - Arm Virtual Hardware - K3s - Containerd operatingsystems: diff --git a/content/learning-paths/embedded-and-microcontrollers/cmsis_rtx/_index.md b/content/learning-paths/embedded-and-microcontrollers/cmsis_rtx/_index.md index ddeb8735f9..cfbb5e6f2e 100644 --- a/content/learning-paths/embedded-and-microcontrollers/cmsis_rtx/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/cmsis_rtx/_index.md @@ -23,8 +23,7 @@ armips: operatingsystems: - RTOS tools_software_languages: - - Coding - - RTX + - Keil RTX RTOS - Keil MDK - Arm Development Studio diff --git a/content/learning-paths/embedded-and-microcontrollers/cmsis_rtx_vs/_index.md b/content/learning-paths/embedded-and-microcontrollers/cmsis_rtx_vs/_index.md index 7d2f9ace21..4344354a84 100644 --- a/content/learning-paths/embedded-and-microcontrollers/cmsis_rtx_vs/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/cmsis_rtx_vs/_index.md @@ -25,8 +25,7 @@ armips: operatingsystems: - RTOS tools_software_languages: - - Coding - - RTX + - Keil RTX RTOS - Keil MDK - Arm Development Studio diff --git a/content/learning-paths/embedded-and-microcontrollers/context-switch-cortex-m/_index.md b/content/learning-paths/embedded-and-microcontrollers/context-switch-cortex-m/_index.md index f66b9f23bd..87877583e8 100644 --- a/content/learning-paths/embedded-and-microcontrollers/context-switch-cortex-m/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/context-switch-cortex-m/_index.md @@ -25,7 +25,6 @@ operatingsystems: - Baremetal tools_software_languages: - CMSIS - - Coding - Arm Development Studio further_reading: diff --git a/content/learning-paths/embedded-and-microcontrollers/coverage_mdk/_index.md b/content/learning-paths/embedded-and-microcontrollers/coverage_mdk/_index.md index 9b7e69857f..a26ce9b9e1 100644 --- a/content/learning-paths/embedded-and-microcontrollers/coverage_mdk/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/coverage_mdk/_index.md @@ -23,10 +23,7 @@ operatingsystems: - Baremetal - RTOS tools_software_languages: - - Coding - - Keil - - MDK - - Fixed Virtual Platform + - Keil MDK - FVP further_reading: diff --git a/content/learning-paths/embedded-and-microcontrollers/docker/_index.md b/content/learning-paths/embedded-and-microcontrollers/docker/_index.md index 1c74a88d27..f182e0040d 100644 --- a/content/learning-paths/embedded-and-microcontrollers/docker/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/docker/_index.md @@ -29,7 +29,6 @@ tools_software_languages: - Arm Development Studio - Arm Compiler for Embedded - Arm Fast Models - - Coding further_reading: diff --git a/content/learning-paths/embedded-and-microcontrollers/img_nn_stcube/_index.md b/content/learning-paths/embedded-and-microcontrollers/img_nn_stcube/_index.md index 1088ee125b..bdd760c3a0 100644 --- a/content/learning-paths/embedded-and-microcontrollers/img_nn_stcube/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/img_nn_stcube/_index.md @@ -26,7 +26,6 @@ armips: operatingsystems: - Baremetal 
tools_software_languages: - - Coding - TensorFlow - STM32 diff --git a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/_index.md b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/_index.md index 4ec73b992b..e67b323d70 100644 --- a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/_index.md @@ -31,7 +31,7 @@ operatingsystems: tools_software_languages: - Arm Virtual Hardware - - Fixed Virtual Platform + - FVP - Python - PyTorch - ExecuTorch diff --git a/content/learning-paths/embedded-and-microcontrollers/keilstudiocloud/_index.md b/content/learning-paths/embedded-and-microcontrollers/keilstudiocloud/_index.md index 711275db01..cd1eed83dd 100644 --- a/content/learning-paths/embedded-and-microcontrollers/keilstudiocloud/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/keilstudiocloud/_index.md @@ -27,11 +27,10 @@ operatingsystems: - Baremetal - RTOS tools_software_languages: - - Keil + - Keil Studio Cloud - Arm Compiler for Embedded - Arm Virtual Hardware - CMSIS - - Coding further_reading: - resource: diff --git a/content/learning-paths/embedded-and-microcontrollers/llama-python-cpu/_index.md b/content/learning-paths/embedded-and-microcontrollers/llama-python-cpu/_index.md index 7f43f5a1dc..635925889d 100644 --- a/content/learning-paths/embedded-and-microcontrollers/llama-python-cpu/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/llama-python-cpu/_index.md @@ -25,7 +25,7 @@ operatingsystems: - Linux tools_software_languages: - LLM - - GenAI + - Generative AI - Raspberry Pi - Python - Hugging Face diff --git a/content/learning-paths/embedded-and-microcontrollers/migration/_index.md b/content/learning-paths/embedded-and-microcontrollers/migration/_index.md index a0860e1c4d..97dd27e923 100644 --- a/content/learning-paths/embedded-and-microcontrollers/migration/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/migration/_index.md @@ -29,7 +29,7 @@ tools_software_languages: - GCC - Arm Compiler for Linux - Docker - - Neon + - NEON further_reading: - resource: diff --git a/content/learning-paths/embedded-and-microcontrollers/mlek/_index.md b/content/learning-paths/embedded-and-microcontrollers/mlek/_index.md index dd94259436..8fecbbb7a5 100644 --- a/content/learning-paths/embedded-and-microcontrollers/mlek/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/mlek/_index.md @@ -31,9 +31,8 @@ armips: operatingsystems: - Baremetal tools_software_languages: - - Coding - Arm Virtual Hardware - - Fixed Virtual Platform + - FVP - GCC - Arm Compiler for Embedded diff --git a/content/learning-paths/embedded-and-microcontrollers/nav-mlek/_index.md b/content/learning-paths/embedded-and-microcontrollers/nav-mlek/_index.md index ba9dc78732..9e0251572b 100644 --- a/content/learning-paths/embedded-and-microcontrollers/nav-mlek/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/nav-mlek/_index.md @@ -30,7 +30,7 @@ subjects: ML test_maintenance: false tools_software_languages: - - Fixed Virtual Platform + - FVP - Arm Virtual Hardware - GCC - Arm Compiler for Embedded diff --git a/content/learning-paths/embedded-and-microcontrollers/rpi-llama3/_index.md b/content/learning-paths/embedded-and-microcontrollers/rpi-llama3/_index.md index 83920389c6..1e47bb2d34 100644 --- 
a/content/learning-paths/embedded-and-microcontrollers/rpi-llama3/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/rpi-llama3/_index.md @@ -29,7 +29,7 @@ operatingsystems: - Linux tools_software_languages: - LLM - - GenAI + - Generative AI - Raspberry Pi - Hugging Face diff --git a/content/learning-paths/embedded-and-microcontrollers/rpi_pico/_index.md b/content/learning-paths/embedded-and-microcontrollers/rpi_pico/_index.md index a023cae134..71497432b9 100644 --- a/content/learning-paths/embedded-and-microcontrollers/rpi_pico/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/rpi_pico/_index.md @@ -28,7 +28,6 @@ operatingsystems: - Baremetal tools_software_languages: - Raspberry Pi - - Coding further_reading: - resource: diff --git a/content/learning-paths/embedded-and-microcontrollers/tflow_nn_stcube/_index.md b/content/learning-paths/embedded-and-microcontrollers/tflow_nn_stcube/_index.md index e9cdffc7bc..6a78c8682d 100644 --- a/content/learning-paths/embedded-and-microcontrollers/tflow_nn_stcube/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/tflow_nn_stcube/_index.md @@ -28,7 +28,6 @@ operatingsystems: tools_software_languages: - TensorFlow - STM32 - - Coding further_reading: - resource: diff --git a/content/learning-paths/embedded-and-microcontrollers/tfm/_index.md b/content/learning-paths/embedded-and-microcontrollers/tfm/_index.md index 5c286ae95a..e23ea09452 100644 --- a/content/learning-paths/embedded-and-microcontrollers/tfm/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/tfm/_index.md @@ -31,10 +31,9 @@ operatingsystems: - Baremetal tools_software_languages: - Arm Virtual Hardware -- Fixed Virtual Platform +- FVP - TrustZone - Trusted Firmware -- Coding further_reading: - resource: diff --git a/content/learning-paths/embedded-and-microcontrollers/trustzone_nxp_lpc/_index.md b/content/learning-paths/embedded-and-microcontrollers/trustzone_nxp_lpc/_index.md index 732e4f791b..78645c6614 100644 --- a/content/learning-paths/embedded-and-microcontrollers/trustzone_nxp_lpc/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/trustzone_nxp_lpc/_index.md @@ -28,8 +28,7 @@ operatingsystems: tools_software_languages: - TrustZone - Arm Compiler for Embedded - - Keil - - Coding + - Keil MDK further_reading: - resource: diff --git a/content/learning-paths/embedded-and-microcontrollers/universal-sbc-chassis/_index.md b/content/learning-paths/embedded-and-microcontrollers/universal-sbc-chassis/_index.md index a4e4cce137..e9777b7106 100644 --- a/content/learning-paths/embedded-and-microcontrollers/universal-sbc-chassis/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/universal-sbc-chassis/_index.md @@ -37,7 +37,6 @@ operatingsystems: - Linux tools_software_languages: - - Slicing software - Fusion 360 further_reading: diff --git a/content/learning-paths/embedded-and-microcontrollers/uv_debug/_index.md b/content/learning-paths/embedded-and-microcontrollers/uv_debug/_index.md index e507a100ab..a2f823da95 100644 --- a/content/learning-paths/embedded-and-microcontrollers/uv_debug/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/uv_debug/_index.md @@ -40,9 +40,8 @@ operatingsystems: - RTOS - Baremetal tools_software_languages: - - Coding - - Keil - - Fixed Virtual Platform + - Keil MDK + - FVP diff --git a/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/_index.md 
b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/_index.md index 0127cde363..0a37a894c2 100644 --- a/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/_index.md @@ -32,7 +32,7 @@ operatingsystems: tools_software_languages: - Arm Virtual Hardware - - Fixed Virtual Platform + - FVP - Python - PyTorch - ExecuTorch diff --git a/content/learning-paths/embedded-and-microcontrollers/yocto_qemu/_index.md b/content/learning-paths/embedded-and-microcontrollers/yocto_qemu/_index.md index 77222a254a..9345b95d4a 100644 --- a/content/learning-paths/embedded-and-microcontrollers/yocto_qemu/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/yocto_qemu/_index.md @@ -25,7 +25,7 @@ armips: operatingsystems: - Linux tools_software_languages: - - Yocto Linux + - Yocto Project - QEMU further_reading: diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr/_index.md b/content/learning-paths/embedded-and-microcontrollers/zephyr/_index.md index 1dc3bb5b73..d1b21d9810 100644 --- a/content/learning-paths/embedded-and-microcontrollers/zephyr/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/zephyr/_index.md @@ -31,8 +31,7 @@ operatingsystems: tools_software_languages: - Zephyr - Arm Virtual Hardware -- Fixed Virtual Platform -- Coding +- FVP further_reading: - resource: diff --git a/content/learning-paths/iot/avh_balena/_index.md b/content/learning-paths/iot/avh_balena/_index.md index 9d21a6d279..bdc5f362bf 100644 --- a/content/learning-paths/iot/avh_balena/_index.md +++ b/content/learning-paths/iot/avh_balena/_index.md @@ -31,9 +31,9 @@ operatingsystems: tools_software_languages: - Arm Virtual Hardware - - Balena Cloud + - balenaCloud - Raspberry Pi - - Balena OS + - BalenaOS further_reading: - resource: diff --git a/content/learning-paths/iot/azure-iot/_index.md b/content/learning-paths/iot/azure-iot/_index.md index 77e137d122..63666f8c2d 100644 --- a/content/learning-paths/iot/azure-iot/_index.md +++ b/content/learning-paths/iot/azure-iot/_index.md @@ -32,7 +32,7 @@ operatingsystems: tools_software_languages: - Python - Azure - - VS Code + - Visual Studio Code further_reading: - resource: diff --git a/content/learning-paths/iot/iot-sdk/_index.md b/content/learning-paths/iot/iot-sdk/_index.md index b4628bfe5f..bac7725e1e 100644 --- a/content/learning-paths/iot/iot-sdk/_index.md +++ b/content/learning-paths/iot/iot-sdk/_index.md @@ -27,8 +27,7 @@ operatingsystems: - RTOS tools_software_languages: - Arm Virtual Hardware - - Coding - - Fixed Virtual Platform + - FVP - Arm Compiler for Embedded diff --git a/content/learning-paths/laptops-and-desktops/llvm_putty/_index.md b/content/learning-paths/laptops-and-desktops/llvm_putty/_index.md index 04818725a4..ab8c90aff1 100644 --- a/content/learning-paths/laptops-and-desktops/llvm_putty/_index.md +++ b/content/learning-paths/laptops-and-desktops/llvm_putty/_index.md @@ -23,8 +23,7 @@ operatingsystems: - Windows tools_software_languages: - LLVM - - VS Code - - Coding + - Visual Studio Code further_reading: - resource: diff --git a/content/learning-paths/laptops-and-desktops/memory-tagged-dynamic-memory-allocator/_index.md b/content/learning-paths/laptops-and-desktops/memory-tagged-dynamic-memory-allocator/_index.md index fe2610d6ab..637e4058bf 100644 --- a/content/learning-paths/laptops-and-desktops/memory-tagged-dynamic-memory-allocator/_index.md +++ 
b/content/learning-paths/laptops-and-desktops/memory-tagged-dynamic-memory-allocator/_index.md @@ -25,7 +25,6 @@ tools_software_languages: - MTE - Linux - C -- Coding operatingsystems: - Linux diff --git a/content/learning-paths/laptops-and-desktops/win_arm64ec/_index.md b/content/learning-paths/laptops-and-desktops/win_arm64ec/_index.md index c7cd6d13cb..9f22e91fe0 100644 --- a/content/learning-paths/laptops-and-desktops/win_arm64ec/_index.md +++ b/content/learning-paths/laptops-and-desktops/win_arm64ec/_index.md @@ -22,7 +22,6 @@ armips: operatingsystems: - Windows tools_software_languages: - - Coding - Arm64EC - Visual Studio diff --git a/content/learning-paths/laptops-and-desktops/win_arm64ec_porting/_index.md b/content/learning-paths/laptops-and-desktops/win_arm64ec_porting/_index.md index 1e3d28c738..9e2f0205df 100644 --- a/content/learning-paths/laptops-and-desktops/win_arm64ec_porting/_index.md +++ b/content/learning-paths/laptops-and-desktops/win_arm64ec_porting/_index.md @@ -25,7 +25,8 @@ armips: operatingsystems: - Windows tools_software_languages: - - C/C++ + - C + - C++ - Qt further_reading: diff --git a/content/learning-paths/laptops-and-desktops/win_arm_qt/_index.md b/content/learning-paths/laptops-and-desktops/win_arm_qt/_index.md index e3bbe4a3af..3a0e1e40e0 100644 --- a/content/learning-paths/laptops-and-desktops/win_arm_qt/_index.md +++ b/content/learning-paths/laptops-and-desktops/win_arm_qt/_index.md @@ -23,7 +23,8 @@ armips: operatingsystems: - Windows tools_software_languages: - - C/C++ + - C + - C++ - Qt further_reading: diff --git a/content/learning-paths/laptops-and-desktops/win_on_arm_build_onnxruntime/_index.md b/content/learning-paths/laptops-and-desktops/win_on_arm_build_onnxruntime/_index.md index 0626a7d5d9..4373053083 100644 --- a/content/learning-paths/laptops-and-desktops/win_on_arm_build_onnxruntime/_index.md +++ b/content/learning-paths/laptops-and-desktops/win_on_arm_build_onnxruntime/_index.md @@ -23,7 +23,7 @@ tools_software_languages: - C++ - Python - Git - - cmake + - CMake - ONNX Runtime operatingsystems: - Windows diff --git a/content/learning-paths/laptops-and-desktops/win_win32_dll_porting/_index.md b/content/learning-paths/laptops-and-desktops/win_win32_dll_porting/_index.md index 034316ab77..e76230eef5 100644 --- a/content/learning-paths/laptops-and-desktops/win_win32_dll_porting/_index.md +++ b/content/learning-paths/laptops-and-desktops/win_win32_dll_porting/_index.md @@ -24,7 +24,8 @@ armips: operatingsystems: - Windows tools_software_languages: - - C/C++ + - C + - C++ further_reading: - resource: diff --git a/content/learning-paths/laptops-and-desktops/windows_cicd_github/_index.md b/content/learning-paths/laptops-and-desktops/windows_cicd_github/_index.md index eeccfdab6e..458231ea0a 100644 --- a/content/learning-paths/laptops-and-desktops/windows_cicd_github/_index.md +++ b/content/learning-paths/laptops-and-desktops/windows_cicd_github/_index.md @@ -26,7 +26,6 @@ armips: operatingsystems: - Windows tools_software_languages: - - Coding - GitHub further_reading: diff --git a/content/learning-paths/laptops-and-desktops/wsl2/_index.md b/content/learning-paths/laptops-and-desktops/wsl2/_index.md index 2dcdac804f..419e65854d 100644 --- a/content/learning-paths/laptops-and-desktops/wsl2/_index.md +++ b/content/learning-paths/laptops-and-desktops/wsl2/_index.md @@ -29,7 +29,7 @@ operatingsystems: - Linux tools_software_languages: - WSL - - VS Code + - Visual Studio Code further_reading: - resource: diff --git 
a/content/learning-paths/mobile-graphics-and-gaming/afrc/_index.md b/content/learning-paths/mobile-graphics-and-gaming/afrc/_index.md index b76fa5e727..77c991ba0e 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/afrc/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/afrc/_index.md @@ -27,7 +27,6 @@ operatingsystems: - Android tools_software_languages: - Vulkan - - Coding further_reading: diff --git a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/1-prerequisites.md b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/1-prerequisites.md index 470f7d0cfa..4892557858 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/1-prerequisites.md +++ b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/1-prerequisites.md @@ -8,39 +8,44 @@ layout: learningpathall ## Host machine requirements -This Learning Path demonstrates how to improve the performance of camera pipelines using KleidiAI and KleidiCV in applications running on Arm. You will need an Arm64 machine, preferably running an Ubuntu-based distribution. The instructions have been tested on Ubuntu 24.04. +This Learning Path demonstrates how to improve the performance of camera pipelines using KleidiAI and KleidiCV on Arm. You’ll need an Arm64 machine, preferably running an Ubuntu-based distribution. The instructions have been tested on Ubuntu 24.04. ## Install required software Make sure the following tools are installed: -- `git` - a version control system, for cloning the AI camera pipelines codebase. -- `git lfs` - an extension to `git` for managing large files by storing lightweight references instead of the files themselves. -- `docker` - an open-source containerization platform for running applications in isolated environments. -- `libomp` - LLVM's OpenMP runtime library, required for enabling parallel execution during application performance optimization. +- **Git** – version control, for cloning the AI camera pipelines codebase +- **Git LFS** – extension to Git for managing large files using lightweight pointers +- **Docker** – an open-source container platform for running applications in isolated environments +- **OpenMP runtime (`libomp`)** – LLVM’s OpenMP runtime library, required for enabling parallel execution during application performance optimization -### git and git lfs +### Git and Git LFS -These tools can be installed by running the following command, depending on your OS: +Install with the commands for your OS: {{< tabpane code=true >}} {{< tab header="Linux/Ubuntu" language="bash">}} -sudo apt install git git-lfs -y +sudo apt update +sudo apt install -y git git-lfs +# one-time LFS setup on this machine: +git lfs install {{< /tab >}} {{< tab header="macOS" language="bash">}} brew install git git-lfs +# one-time LFS setup on this machine: +git lfs install {{< /tab >}} {{< /tabpane >}} ### Docker -Start by checking that `docker` is installed on your machine by typing the following command line in a terminal: +Check that Docker is installed: ```bash { output_lines="2" } docker --version Docker version 27.3.1, build ce12230 ``` -If you see an error like "`docker: command not found`," then follow the steps from the [Docker Install Guide](https://learn.arm.com/install-guides/docker/). +If you see "`docker: command not found`," follow the [Docker Install Guide](https://learn.arm.com/install-guides/docker/). {{% notice Note %}} You might need to log in again or restart your machine for the changes to take effect. 
diff --git a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/2-overview.md b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/2-overview.md index 4287f39064..1ee5968d74 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/2-overview.md +++ b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/2-overview.md @@ -1,53 +1,71 @@ --- - title: Overview weight: 4 ### FIXED, DO NOT MODIFY layout: learningpathall - --- ## KleidiAI -[KleidiAI](https://gitlab.arm.com/kleidi/kleidiai) is an open-source library that provides optimized, performance-critical routines - also known as micro-kernels - for artificial intelligence (AI) workloads on Arm CPUs. +[KleidiAI](https://gitlab.arm.com/kleidi/kleidiai) is an open-source library of optimized, performance-critical routines (micro-kernels) for AI workloads on Arm CPUs. These routines are tuned for specific Arm microarchitectures to maximize performance and are designed for straightforward integration into C/C++ ML and AI frameworks. -These routines are tuned to take full advantage of specific Arm hardware architectures to maximize performance. The [KleidiAI](https://gitlab.arm.com/kleidi/kleidiai) library is designed for easy integration into C or C++ machine learning (ML) and AI frameworks. - -Several popular AI frameworks already take advantage of [KleidiAI](https://gitlab.arm.com/kleidi/kleidiai) to improve performance on Arm platforms. +Several popular AI frameworks already take advantage of KleidiAI to improve performance on Arm platforms. ## KleidiCV -[KleidiCV](https://gitlab.arm.com/kleidi/kleidicv) is an open-source library that provides high-performance image processing functions for AArch64. - -It is designed to be lightweight and simple to integrate into a wide variety of projects. Some computer vision frameworks, such as OpenCV, leverage [KleidiCV](https://gitlab.arm.com/kleidi/kleidicv) to accelerate image processing on Arm devices. +[KleidiCV](https://gitlab.arm.com/kleidi/kleidicv) is an open-source library that provides high-performance image-processing functions for AArch64. It is lightweight and simple to integrate, and computer-vision frameworks such as OpenCV can leverage KleidiCV to accelerate image processing on Arm devices. ## AI camera pipelines -This Learning Path provides two example applications that combine AI and computer vision (CV) techniques: -- Background Blur. -- Low-Light Enhancement. +This Learning Path provides three example applications that combine AI and computer vision (CV) techniques: -Both applications: -- Use input and output images that are stored in `ppm` (Portable Pixmap format), with three RGB channels (Red, Green, and Blue). Each channel supports 256 intensity levels (0-255) commonly referred to as `RGB8`. -- Convert the images to the `YUV420` color space for processing. -- Apply the relevant effect (background blur or low-light enhancement). -- Convert the processed images back to `RGB8` and save them as `ppm` files. 
+- Background blur
+- Low-light enhancement (LLE)
+- Neural denoising

-### Background Blur
+## Background blur and low-light enhancement
+
+The applications:
+
+- Use input and output images in **PNG** format with three **RGB** channels (8-bit per channel, often written as **RGB8**)
+- Convert images to **YUV 4:2:0** for processing
+- Apply the relevant effect (background blur or low-light enhancement)
+- Convert the processed images back to **RGB8** and save as **.png**
+
+## Background blur

The background blur pipeline is implemented as follows:

-![example image alt-text#center](blur_pipeline.png "Background Blur Pipeline Diagram")
+![Background blur pipeline diagram showing RGB8 input, conversion to YUV 4:2:0, blur applied to background mask, and reconversion to RGB8 alt-text#center](blur_pipeline.png "Background blur pipeline")
+
+## Low-light enhancement
+
+The low-light enhancement pipeline is adapted from the LiveHDR+ method proposed by Google Research (2017):
+
+![Low-light enhancement pipeline diagram with burst capture, alignment/merge, coefficient prediction network (LiteRT), tone mapping, and RGB output alt-text#center](lle_pipeline.png "Low-light enhancement pipeline")
+
+The low-resolution coefficient-prediction network (implemented with LiteRT) performs operations such as:
+
+- Strided convolutions
+- Local feature extraction using convolutional layers
+- Global feature extraction using convolutional and fully connected layers
+- Add, convolve, and reshape ops
+
+## Neural denoising
+
+Every smartphone photographer has experienced it: images that look sharp in daylight but degrade in dim lighting. This is because **signal-to-noise ratio (SNR)** drops sharply when sensors capture fewer photons. At 1000 lux, the signal dominates and images look clean; at 1 lux, readout noise becomes visible as grain, color speckling, and loss of fine detail.
+
+That’s why **neural camera denoising** is a critical, computationally demanding stage in modern camera pipelines. Done well, it can transform noisy frames into sharp, vibrant captures; done poorly, it leaves smudges and artifacts.

-### Low Light Enhancement
+As shown below, the neural-denoising pipeline can run in one of three modes:

-The low-light enhancement pipeline is adapted from the LiveHDR+ method originally proposed by Google Research in 2017:
+- **Temporal** denoising, `ultralite` in the repository (uses a history of previous frames)
+- **Spatial** denoising, `collapsenet` in the repository
+- A combination of both

-![example image alt-text#center](lle_pipeline.png "Low-Light Enhancement Pipeline Diagram")
+![Neural denoising pipeline diagram showing temporal path (with frame history) and spatial path, followed by fusion and output alt-text#center](denoising_pipeline.png "Neural denoising pipeline")

-The Low-Resolution Coefficient Prediction Network (implemented with LiteRT) performs computations such as:
-- Strided convolutions.
-- Local feature extraction using convolutional layers.
-- Global feature extraction using convolutional and fully connected layers.
-- Add, convolve, and reshape operations.
\ No newline at end of file
+The neural denoising application works on frames as emitted by a camera sensor in Bayer format:
+- The input frames are in RGGB 1080x1920x4 format
+- The output frames are in YGGV 4x1080x1920 format
diff --git a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/3-build.md b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/3-build.md
index 6257f4cba2..8fa4f9aa9b 100644
--- a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/3-build.md
+++ b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/3-build.md
@@ -6,7 +6,7 @@ weight: 5
layout: learningpathall
---
-## Download the AI Camera Pipelines Project
+## Download the AI camera pipelines project
Clone the project repository:
@@ -29,10 +29,11 @@ Build the Docker container used to compile the pipelines:
```bash
docker build -t ai-camera-pipelines -f docker/Dockerfile \
  --build-arg DOCKERHUB_MIRROR=docker.io \
-  --build-arg CI_UID=$(id -u) .
+  --build-arg CI_UID=$(id -u) \
+  docker/
```
-## Build the AI Camera Pipelines
+## Build the AI camera pipelines
Start a shell in the container you just built:
@@ -44,8 +45,7 @@ Inside the container, run the following commands:
```bash
ENABLE_SME2=0
-TENSORFLOW_GIT_TAG=ddceb963c1599f803b5c4beca42b802de5134b44
-
+TENSORFLOW_GIT_TAG="v2.19.0"
# Build flatbuffers
git clone https://github.com/google/flatbuffers.git
cd flatbuffers
@@ -71,11 +71,11 @@ tar cfz example/install.tar.gz install
Leave the container by pressing `Ctrl+D`.
-## Notes on the cmake configuration options
+## Notes on the CMake configuration options
-The `cmake` command line options relevant to this learning path are:
+The `cmake` command-line options relevant to this Learning Path are:
-| Command line option | Description |
+| Command-line option | Description |
|-------------------------------------|----------------------------------------------------------------------------------------------|
| `ENABLE_SME2=$ENABLE_SME2` | SME2 (Scalable Matrix Extension 2) is disabled in this build with `ENABLE_SME2=0`. |
| `ARMNN_TFLITE_PARSER=0` | Configures the `ai-camera-pipelines` repository to use LiteRT with XNNPack instead of ArmNN. |
@@ -90,7 +90,7 @@ tar xfz ai-camera-pipelines.git/install.tar.gz
mv install ai-camera-pipelines
```
-## Diving further in the AI camera pipelines
+## Dive deeper into the AI camera pipelines
The AI camera pipelines [repository](https://git.gitlab.arm.com/kleidi/kleidi-examples/ai-camera-pipelines)
diff --git a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/4-run.md b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/4-run.md
index 828852f459..277a8a2f26 100644
--- a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/4-run.md
+++ b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/4-run.md
@@ -9,26 +9,55 @@ layout: learningpathall
## Apply transformations
-In the previous section, you built the AI Camera Pipelines. In this section, you'll run them to apply transformations to an input image.
+In the previous section, you built the AI Camera Pipelines. In this section, you'll run them to apply transformations to an input image or input frames.
-### Background Blur
-Run the background blur pipeline:
+First, set up a Python virtual environment and install the required Python packages:
+
+```bash
+cd $HOME/ai-camera-pipelines
+python3 -m venv venv
+. venv/bin/activate
+pip install -r ai-camera-pipelines.git/docker/python-requirements.txt
+```
+
+## Background blur
+
+Run the background blur pipeline, using `resources/test_input.png` as the input image and writing the transformed image to `test_output.png`:
```bash
cd $HOME/ai-camera-pipelines
-bin/cinematic_mode resources/test_input2.ppm test_output2.ppm resources/depth_and_saliency_v3_2_assortedv2_w_augment_mobilenetv2_int8_only_ptq.tflite
+bin/cinematic_mode resources/test_input.png test_output.png resources/depth_and_saliency_v3_2_assortedv2_w_augment_mobilenetv2_int8_only_ptq.tflite
```
-![example image alt-text#center](test_input2.png "Original picture")
-![example image alt-text#center](test_output2.png "Picture with blur applied")
+![example image alt-text#center](test_input2.png "Input image")
+![example image alt-text#center](test_output2.png "Image with blur applied")
+
+## Low-light enhancement
-### Low-Light Enhancement
+Run the low-light enhancement pipeline, using `resources/test_input.png` as the input image and writing the transformed image to `test_output2_lime.png`:
```bash
cd $HOME/ai-camera-pipelines
-bin/low_light_image_enhancement resources/test_input2.ppm test_output2_lime.ppm resources/HDRNetLIME_lr_coeffs_v1_1_0_mixed_low_light_perceptual_l2_loss_int8_only_ptq.tflite
+bin/low_light_image_enhancement resources/test_input.png test_output2_lime.png resources/HDRNetLIME_lr_coeffs_v1_1_0_mixed_low_light_perceptual_l1_loss_float32.tflite
+```
+
+![example image alt-text#center](test_input2.png "Input image")
+![example image alt-text#center](test_output2_lime.png "Image with low-light enhancement applied")
+
+
+## Neural denoising
+
+When the SME extension is not available, only temporal neural denoising is
+available, so that is what you will run for now; SME support is coming soon:
+
+```bash
+./scripts/run_neural_denoiser_temporal.sh
```
-![example image alt-text#center](test_input2.png "Original picture")
-![example image alt-text#center](test_output2_lime.png "Picture with low-light enhancement applied")
\ No newline at end of file
+The script processes the frames in three steps:
+ - The input frames are converted from `.png` files in the `resources/test-lab-sequence/` directory to the sensor format (RGGB Bayer) and written to `neural_denoiser_io/input_noisy*`
+ - These frames are processed by the neural denoiser and written to `neural_denoiser_io/output_denoised*`
+ - The denoised frames are converted back to `.png` in the `test-lab-sequence-out` directory for easy visualization
+
+![example image alt-text#center](denoising_input_0010.png "Original frame")
+![example image alt-text#center](denoising_output_0010.png "Frame with temporal denoising applied")
\ No newline at end of file
diff --git a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/5-performances.md b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/5-performances.md
index 1372ae1b5e..d75ebf3e77 100644
--- a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/5-performances.md
+++ b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/5-performances.md
@@ -12,19 +12,19 @@ The application you built earlier includes a *benchmark mode* that runs the core
- `ai-camera-pipelines/bin/cinematic_mode_benchmark`
- `ai-camera-pipelines/bin/low_light_image_enhancement_benchmark`
+- `ai-camera-pipelines/bin/neural_denoiser_temporal_benchmark_4K`
These benchmarks demonstrate the performance improvements enabled by KleidiCV and KleidiAI:
- KleidiCV enhances OpenCV performance with computation kernels optimized for Arm processors.
+- KleidiAI accelerates LiteRT+XNNPack inference using AI-optimized micro-kernels tailored for Arm CPUs.
-- KleidiAI accelerates LiteRT + XNNPack inference using AI-optimized micro-kernels tailored for Arm CPUs.
+## Performance with KleidiCV and KleidiAI
-## Performances with KleidiCV and KleidiAI
-
-By default, the OpenCV library is built with KleidiCV support, and LiteRT+xnnpack is built with KleidiAI support.
+By default, the OpenCV library is built with KleidiCV support, and LiteRT+XNNPack is built with KleidiAI support.
You can run the benchmarks using the applications you built earlier.
-Run the first benchmark:
+Run the background blur benchmark:
```bash
bin/cinematic_mode_benchmark 20 resources/depth_and_saliency_v3_2_assortedv2_w_augment_mobilenetv2_int8_only_ptq.tflite
@@ -34,25 +34,38 @@ The output is similar to:
```output
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
-Total run time over 20 iterations: 2023.39 ms
+Total run time over 20 iterations: 2028.745 ms
```
-Run the second benchmark:
+Run the low-light enhancement benchmark:
```bash
-bin/low_light_image_enhancement_benchmark 20 resources/HDRNetLIME_lr_coeffs_v1_1_0_mixed_low_light_perceptual_l2_loss_int8_only_ptq.tflite
+bin/low_light_image_enhancement_benchmark 20 resources/HDRNetLIME_lr_coeffs_v1_1_0_mixed_low_light_perceptual_l1_loss_float32.tflite
```
The output is similar to:
```output
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
-Total run time over 20 iterations: 54.3546 ms
+Total run time over 20 iterations: 58.2126 ms
+```
+
+Finally, run the neural denoising benchmark:
+
+```bash
+bin/neural_denoiser_temporal_benchmark_4K 20
+```
+
+The output is similar to:
+
+```output
+Total run time over 20 iterations: 37.6839 ms
```
From these results, you can see that:
-- `cinematic_mode_benchmark` performed 20 iterations in 1985.99 ms.
-- `low_light_image_enhancement_benchmark` performed 20 iterations in 52.3448 ms.
+- `cinematic_mode_benchmark` performed 20 iterations in 2028.745 ms
+- `low_light_image_enhancement_benchmark` performed 20 iterations in 58.2126 ms
+- `neural_denoiser_temporal_benchmark_4K` performed 20 iterations in 37.6839 ms
## Benchmark results without KleidiCV and KleidiAI
@@ -61,7 +74,7 @@ To measure the performance without these optimizations, recompile the pipelines
-DENABLE_KLEIDICV:BOOL=OFF -DXNNPACK_ENABLE_KLEIDIAI:BOOL=OFF
```
-Re-run the first benchmark:
+Re-run the background blur benchmark:
```bash
bin/cinematic_mode_benchmark 20 resources/depth_and_saliency_v3_2_assortedv2_w_augment_mobilenetv2_int8_only_ptq.tflite
@@ -71,35 +84,52 @@ The new output is similar to:
```output
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
-Total run time over 20 iterations: 2029.25 ms
+Total run time over 20 iterations: 2030.5525 ms
```
-Re-run the second benchmark:
+Re-run the low-light enhancement benchmark:
```bash
-bin/low_light_image_enhancement_benchmark 20 resources/HDRNetLIME_lr_coeffs_v1_1_0_mixed_low_light_perceptual_l2_loss_int8_only_ptq.tflite
+bin/low_light_image_enhancement_benchmark 20 resources/HDRNetLIME_lr_coeffs_v1_1_0_mixed_low_light_perceptual_l1_loss_float32.tflite
```
The new output is similar to:
```output
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
-Total run time over 20 iterations: 79.431 ms
+Total run time over 20 iterations: 58.0613 ms
```
-### Comparison table
+Re-run the neural denoising benchmark:
-| Benchmark | Without KleidiCV+KleidiAI | With KleidiCV+KleidiAI |
-|-------------------------------------------|---------------------------|------------------------|
-| `cinematic_mode_benchmark` | 2029.25 ms | 2023.39 ms |
-| `low_light_image_enhancement_benchmark` | 79.431 ms | 54.3546 ms |
-
-As shown, the background blur pipeline (`cinematic_mode_benchmark`) gains only a small improvement, while the low-light enhancement pipeline sees a significant ~30% performance uplift when KleidiCV and KleidiAI are enabled.
-
-## Future performance uplift with SME2
+```bash
+bin/neural_denoiser_temporal_benchmark_4K 20
+```
-A major benefit of using KleidiCV and KleidiAI is that they can automatically leverage new Arm architecture features - such as SME2 (Scalable Matrix Extension v2) - without requiring changes to your application code.
+The new output is similar to:
-As KleidiCV and KleidiAI operate as performance abstraction layers, any future hardware instruction support can be utilized by simply rebuilding the application. This enables better performance on newer processors without additional engineering effort.
+```output
+Total run time over 20 iterations: 38.0813 ms
+```
+
+## Comparison table and future performance uplift with SME2
+
+| Benchmark | Without KleidiCV+KleidiAI | With KleidiCV+KleidiAI |
+|-------------------------------------------|---------------------------|------------------------|
+| `cinematic_mode_benchmark` | 2030.5525 ms | 2028.745 ms (-0.09%) |
+| `low_light_image_enhancement_benchmark` | 58.0613 ms | 58.2126 ms (+0.26%) |
+| `neural_denoiser_temporal_benchmark_4K` | 38.0813 ms | 37.6839 ms (-1.04%) |
+
+As shown, the background blur (`cinematic_mode_benchmark`) and neural denoising pipelines gain only a minor improvement, while the low-light enhancement pipeline sees a minor performance degradation (+0.26%) when KleidiCV and KleidiAI are enabled. All three differences are small for these workloads.
+
+However, a major benefit of using KleidiCV and KleidiAI is that they can automatically leverage new Arm architecture features, such as SME2 (Scalable Matrix Extension 2), without requiring changes to your application code.
+
+As KleidiCV and KleidiAI operate as performance abstraction layers, any future hardware instruction support can be utilized by simply rebuilding the application. This enables better performance on newer processors without additional engineering effort.
diff --git a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/_index.md b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/_index.md
index 1b39ea65e5..b3d992d968 100644
--- a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/_index.md
+++ b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/_index.md
@@ -1,16 +1,16 @@
---
-title: Accelerate Background Blur and Low-Light Camera Effects
+title: Accelerate Denoising, Background Blur, and Low-Light Camera Effects with KleidiAI and KleidiCV
minutes_to_complete: 30
-who_is_this_for: This Learning Path introduces developers to the benefits of optimizing the performance of camera pipelines using KleidiAI and KleidiCV.
+who_is_this_for: This introductory topic is for mobile and computer-vision developers, camera pipeline engineers, and performance-minded practitioners who want to optimize real-time camera effects on Arm using KleidiAI and KleidiCV.
learning_objectives: - - Compile and run AI-powered camera pipeline applications. - - Use KleidiCV and KleidiAI to improve the performance of real-time camera pipelines. + - Build and run AI-powered camera pipeline applications + - Use KleidiCV and KleidiAI to improve the performance of real-time camera pipelines prerequisites: - - A computer running Arm Linux or macOS with Docker installed. + - A computer running Arm Linux or macOS with Docker installed author: Arnaud de Grandmaison @@ -25,6 +25,7 @@ armips: - Cortex-A tools_software_languages: - C++ + - Docker operatingsystems: - Linux - macOS diff --git a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/denoising_input_0010.png b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/denoising_input_0010.png new file mode 100644 index 0000000000..c4484ac60f Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/denoising_input_0010.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/denoising_output_0010.png b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/denoising_output_0010.png new file mode 100644 index 0000000000..30bea1096c Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/denoising_output_0010.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/denoising_pipeline.png b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/denoising_pipeline.png new file mode 100644 index 0000000000..906ae9b625 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/ai-camera-pipelines/denoising_pipeline.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/ams/_index.md b/content/learning-paths/mobile-graphics-and-gaming/ams/_index.md index 8ce7ec174e..ccc2bc5a4f 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/ams/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/ams/_index.md @@ -34,7 +34,6 @@ operatingsystems: tools_software_languages: - Arm Performance Studio - Arm Mobile Studio - - Coding further_reading: - resource: diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_neon/_index.md b/content/learning-paths/mobile-graphics-and-gaming/android_neon/_index.md index b1243c0750..234ac62fa3 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/android_neon/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/android_neon/_index.md @@ -29,7 +29,6 @@ operatingsystems: - Android tools_software_languages: - Android Studio - - Coding further_reading: - resource: diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_sve2/_index.md b/content/learning-paths/mobile-graphics-and-gaming/android_sve2/_index.md index fedeb23e08..0dbe5be9d5 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/android_sve2/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/android_sve2/_index.md @@ -27,7 +27,6 @@ operatingsystems: - Android tools_software_languages: - Android Studio - - Coding further_reading: - resource: diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_webgpu_dawn/_index.md b/content/learning-paths/mobile-graphics-and-gaming/android_webgpu_dawn/_index.md index ec01534552..6c0cc7a8bb 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/android_webgpu_dawn/_index.md +++ 
b/content/learning-paths/mobile-graphics-and-gaming/android_webgpu_dawn/_index.md @@ -34,7 +34,6 @@ subjects: Graphics armips: - Cortex-A tools_software_languages: - - Mobile - Java - Kotlin - C++ diff --git a/content/learning-paths/mobile-graphics-and-gaming/build-android-chat-app-using-onnxruntime/_index.md b/content/learning-paths/mobile-graphics-and-gaming/build-android-chat-app-using-onnxruntime/_index.md index b8ac59ad38..37c394d2bd 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/build-android-chat-app-using-onnxruntime/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/build-android-chat-app-using-onnxruntime/_index.md @@ -25,7 +25,6 @@ tools_software_languages: - C++ - ONNX Runtime - Android - - Mobile - Hugging Face operatingsystems: diff --git a/content/learning-paths/mobile-graphics-and-gaming/build-android-selfie-app-using-mediapipe-multimodality/_index.md b/content/learning-paths/mobile-graphics-and-gaming/build-android-selfie-app-using-mediapipe-multimodality/_index.md index d5a9bc06a8..3a98dbaa2a 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/build-android-selfie-app-using-mediapipe-multimodality/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/build-android-selfie-app-using-mediapipe-multimodality/_index.md @@ -28,7 +28,6 @@ armips: - Cortex-A - Mali GPU tools_software_languages: - - mobile - Android Studio - Kotlin - MediaPipe diff --git a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/_index.md b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/_index.md index 878514a0c7..dc2d80493f 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/_index.md @@ -31,7 +31,6 @@ subjects: ML armips: - Cortex-A tools_software_languages: - - Mobile - Java - C++ - Python diff --git a/content/learning-paths/mobile-graphics-and-gaming/debugging_with_mte_on_pixel8/_index.md b/content/learning-paths/mobile-graphics-and-gaming/debugging_with_mte_on_pixel8/_index.md index 3fc0767436..439008c584 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/debugging_with_mte_on_pixel8/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/debugging_with_mte_on_pixel8/_index.md @@ -27,7 +27,7 @@ armips: - Cortex-A tools_software_languages: - Android Studio - - Memory Tagging Extension + - MTE operatingsystems: - Android diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/_index.md b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/_index.md index 4734b8f8e7..77a5f176c3 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/_index.md @@ -24,7 +24,6 @@ armips: - Immortalis tools_software_languages: - Unreal Engine - - Mobile operatingsystems: - Android diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-unity-on-android/_index.md b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-unity-on-android/_index.md index 8020be13bd..80d333f768 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-unity-on-android/_index.md +++ 
b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-unity-on-android/_index.md @@ -18,13 +18,12 @@ prerequisites: author: visualSilicon ### Tags -skilllevels: Beginner +skilllevels: Introductory subjects: Gaming armips: - Cortex tools_software_languages: - Unity - - Mobile - C# operatingsystems: - Android diff --git a/content/learning-paths/mobile-graphics-and-gaming/libgpuinfo/_index.md b/content/learning-paths/mobile-graphics-and-gaming/libgpuinfo/_index.md index 318d3ffb32..37e36ff894 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/libgpuinfo/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/libgpuinfo/_index.md @@ -26,7 +26,7 @@ armips: operatingsystems: - Android tools_software_languages: - - NDK + - Android NDK - adb further_reading: diff --git a/content/learning-paths/mobile-graphics-and-gaming/mte_on_pixel8/_index.md b/content/learning-paths/mobile-graphics-and-gaming/mte_on_pixel8/_index.md index ec25ccfad3..f0cabdd6e6 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/mte_on_pixel8/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/mte_on_pixel8/_index.md @@ -24,7 +24,9 @@ subjects: Performance and Architecture armips: - Cortex-A tools_software_languages: - - Memory Bug Report + - MTE + - adb + - Google Pixel 8 operatingsystems: - Android diff --git a/content/learning-paths/mobile-graphics-and-gaming/profiling-unity-apps-on-android/_index.md b/content/learning-paths/mobile-graphics-and-gaming/profiling-unity-apps-on-android/_index.md index 03c7d7b29f..4be9196bd2 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/profiling-unity-apps-on-android/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/profiling-unity-apps-on-android/_index.md @@ -29,7 +29,6 @@ armips: - arm architecture tools_software_languages: - Unity - - Mobile - C# operatingsystems: - Android diff --git a/content/learning-paths/mobile-graphics-and-gaming/ray_tracing/_index.md b/content/learning-paths/mobile-graphics-and-gaming/ray_tracing/_index.md index d6ccb5c6ef..28c0f1a219 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/ray_tracing/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/ray_tracing/_index.md @@ -27,7 +27,6 @@ operatingsystems: - Android tools_software_languages: - Vulkan - - Coding further_reading: diff --git a/content/learning-paths/mobile-graphics-and-gaming/using-neon-intrinsics-to-optimize-unity-on-android/_index.md b/content/learning-paths/mobile-graphics-and-gaming/using-neon-intrinsics-to-optimize-unity-on-android/_index.md index c445131929..5caa77dcdb 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/using-neon-intrinsics-to-optimize-unity-on-android/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/using-neon-intrinsics-to-optimize-unity-on-android/_index.md @@ -26,10 +26,9 @@ armips: - aarch64 - arm64 - arm architecture - - Neon + - NEON tools_software_languages: - Unity - - Mobile - C# operatingsystems: - Android diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md index 9988ed3cb3..c488c35b9e 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md @@ -25,7 +25,7 @@ subjects: ML arm_ips: - Neoverse tools_software_languages: - - Amazon Web Services + 
- AWS - Hugging Face - Python - Llama.cpp diff --git a/content/learning-paths/servers-and-cloud-computing/cca-trustee/_index.md b/content/learning-paths/servers-and-cloud-computing/cca-trustee/_index.md index d0afe5fdc3..2228ac69df 100644 --- a/content/learning-paths/servers-and-cloud-computing/cca-trustee/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/cca-trustee/_index.md @@ -1,62 +1,57 @@ --- -title: Run an end-to-end Attestation Flow with Arm CCA and Trustee - -draft: true -cascade: - draft: true - +title: Run an end-to-end attestation flow with Arm CCA and Trustee + minutes_to_complete: 60 -who_is_this_for: This Learning Path is for software developers who want to learn how Trustee services can be used to run an end-to-end attestation flow with Arm's Confidential Computing Architecture (CCA). +who_is_this_for: This Learning Path is for software developers who want to run an end-to-end attestation flow using Arm Confidential Compute Architecture (CCA) and Trustee services. learning_objectives: - - Describe how you can use attestation with Arm's Confidential Computing Architecture (CCA) and Trustee services. - - Deploy a simple workload in a CCA realm on an Armv9-A AEM Base Fixed Virtual Platform (FVP) that has support for RME extensions. - - Connect the workload with Trustee services to create an end-to-end example that uses attestation to unlock the confidential processing of data. + - Describe how you can use attestation with Arm's Confidential Computing Architecture (CCA) and Trustee services + - Deploy a simple workload in a CCA realm on an Armv9-A AEM Base Fixed Virtual Platform (FVP) that has support for RME extensions + - Connect the workload with Trustee services to create an end-to-end example that uses attestation to unlock the confidential processing of data prerequisites: - - An AArch64 or x86_64 computer running Linux or MacOS. You can use cloud instances, see this list of [Arm cloud service providers](/learning-paths/servers-and-cloud-computing/csp/). - - Completion of the [Get Started with CCA Attestation and Veraison](/learning-paths/servers-and-cloud-computing/cca-veraison) Learning Path. - - Completion of the [Run an end-to-end Attestation Flow with Arm CCA](/learning-paths/servers-and-cloud-computing/cca-essentials/) Learning Path. 
+ - An AArch64 or x86_64 computer running Linux or macOS; you can use cloud instances - see the [Arm cloud service providers](/learning-paths/servers-and-cloud-computing/csp/) + - Completion of the [Get started with CCA attestation and Veraison](/learning-paths/servers-and-cloud-computing/cca-veraison) Learning Path + - Completion of the [Run an end-to-end attestation flow with Arm CCA](/learning-paths/servers-and-cloud-computing/cca-essentials/) Learning Path author: - - Anton Antonov + - Anton Antonov ### Tags skilllevels: Advanced subjects: Performance and Architecture armips: - - Neoverse - - Cortex-A + - Neoverse + - Cortex-A operatingsystems: - - Linux - - MacOS + - Linux + - macOS tools_software_languages: - - FVP - - RME - - CCA - - Docker - - Veraison - - Trustee + - FVP + - RME + - CCA + - Docker + - Veraison + - Trustee further_reading: - - resource: - title: Arm Confidential Compute Architecture - link: https://www.arm.com/architecture/security-features/arm-confidential-compute-architecture - type: website - - resource: - title: Arm Confidential Compute Architecture open source enablement - link: https://www.youtube.com/watch?v=JXrNkYysuXw - type: video - - resource: - title: Learn the architecture - Realm Management Extension - link: https://developer.arm.com/documentation/den0126 - type: documentation - - resource: - title: Realm Management Monitor specification - link: https://developer.arm.com/documentation/den0137/latest/ - type: documentation - + - resource: + title: Arm Confidential Compute Architecture + link: https://www.arm.com/architecture/security-features/arm-confidential-compute-architecture + type: website + - resource: + title: Arm Confidential Compute Architecture open-source enablement + link: https://www.youtube.com/watch?v=JXrNkYysuXw + type: video + - resource: + title: Learn the architecture - Realm Management Extension + link: https://developer.arm.com/documentation/den0126 + type: documentation + - resource: + title: Realm Management Monitor specification + link: https://developer.arm.com/documentation/den0137/latest/ + type: documentation ### FIXED, DO NOT MODIFY # ================================================================================ diff --git a/content/learning-paths/servers-and-cloud-computing/cca-trustee/cca-trustee.md b/content/learning-paths/servers-and-cloud-computing/cca-trustee/cca-trustee.md index 0eded26191..d217544637 100644 --- a/content/learning-paths/servers-and-cloud-computing/cca-trustee/cca-trustee.md +++ b/content/learning-paths/servers-and-cloud-computing/cca-trustee/cca-trustee.md @@ -1,6 +1,6 @@ --- # User change -title: "Overview of the Software Architecture" +title: "Architecture overview for Arm CCA Attestation with Trustee" weight: 2 # 1 is first, 2 is second, etc. @@ -8,145 +8,97 @@ weight: 2 # 1 is first, 2 is second, etc. layout: "learningpathall" --- -## The role of Attestation -In this Learning Path, you will learn how attestation can control the release -of confidential data into a confidential Linux realm for processing. +## The role of attestation -The role of attestation is to assess whether the target compute environment -offers a provable level of confidential isolation. In this Learning Path, -the target compute environment is a Linux realm. The assessment of a provable -level of confidential isolation needs to occur before the realm can be trusted -to receive confidential data or algorithms. 
This use of attestation to judge
-the trustworthiness of a compute environment, before allowing it to do any
-processing, is a common practice in confidential computing.
+In this Learning Path, you will learn how attestation controls the release of confidential data into a confidential Linux realm for processing. The role of attestation is to assess whether the target compute environment, in this case a Linux realm, offers a provable level of confidential isolation. This assessment must occur before the realm can be trusted to receive confidential data or algorithms. Using attestation to judge the trustworthiness of a compute environment before allowing it to do any processing is a common practice in confidential computing.
-## Understanding the key software components
+## Key software components
This Learning Path is similar to
-[Run an end-to-end Attestation Flow with Arm CCA](/learning-paths/servers-and-cloud-computing/cca-essentials/).
-
-The main difference is that instead of KBS from the [Veraison](https://github.com/veraison) project you will use
-the components implemented in the [confidential containers (CoCo)](https://github.com/confidential-containers)
-to support the [IETF RATS model](https://datatracker.ietf.org/doc/rfc9334/)
-(Remote ATtestation procedureS Architecture). The components include the Attestation Service (AS),
-Key Broker Service (KBS), Reference Value Provider Service (RVPS), Attestation Agent (AA), and Confidential Data Hub (CDH).
+[Run an end-to-end Attestation Flow with Arm CCA](/learning-paths/servers-and-cloud-computing/cca-essentials/). The main difference is that, instead of the KBS from the [Veraison](https://github.com/veraison) project, you will use components implemented in the [Confidential Containers (CoCo) Project](https://github.com/confidential-containers) to support the [IETF RATS model](https://datatracker.ietf.org/doc/rfc9334/) (Remote ATtestation procedureS). These components include the Attestation Service (AS), Key Broker Service (KBS), Reference Value Provider Service (RVPS), Attestation Agent (AA), and Confidential Data Hub (CDH).
The AS, KBS, and RVPS components are part of the [Trustee project](https://github.com/confidential-containers/trustee), whereas the AA and CDH are part of the [Guest Components](https://github.com/confidential-containers/guest-components) project in CoCo.
-### RATS key components
+## RATS roles
-This is a list of components used in this Learning Path:
+This Learning Path focuses on the following key concepts:
-- `Attester` - provides Evidence, which is evaluated and appraised to decide its
- trustworthiness (for instance, a test to see whether it’s authorized to perform some action).
- Evidence may include configuration data, measurements, telemetry, or inferences.
-- `Verifier` - evaluates the validity of the evidence received from the attester
- and produces attestation results, which are sent to the Relying party.
- Attestation results typically include information regarding the Attester,
- while the Verifier vouches for the validity of the results.
-- `Relying party` - depends on the validity of information originating from
- the attester for reliably applying an action. This information can come
- from the verifier or directly through the attester.
+- **Attester** – provides evidence that is evaluated to decide its trustworthiness (for example, whether it is authorized to perform an action).
+- **Evidence** can include configuration data, measurements, telemetry, or inferences. +- **Verifier** – evaluates the evidence from the Attester and produces attestation results that are sent to the Relying party. The Verifier vouches for the validity of those results. +- **Relying party** – depends on the validity of information from the Attester (either directly or through the Verifier) to make an access or policy decision. -### Trustee components +## Trustee components -The Trustee project includes components deployed on a trusted side and used to verify -whether the remote workload is running in a trusted execution environment (TEE). -It also verifies that the remote environment uses the expected software and hardware versions. +Trustee components run on the trusted side and verify whether a remote workload is executing in a trusted execution environment (TEE) and using the expected software and hardware versions. -#### Key Broker Service (KBS) +## Key Broker Service (KBS) -The Key Broker Service (KBS) facilitates remote attestation and managing -and delivering secrets. Equating this to the RATS model, the KBS is the -`relying party` entity. The KBS, however, doesn’t validate the attestation evidence. -Instead, it uses the attestation service (AS) to verify the TEE evidence. +The KBS facilitates remote attestation and manages and delivers secrets. In RATS terms, the KBS is the **Relying party**. The KBS does not validate attestation evidence itself; it relies on the Attestation Service (AS) to verify the TEE evidence. -#### Attestation Service (AS) +## Attestation Service (AS) -The Attestation Service (AS) is responsible for validating the TEE evidence. -When mapped to the RATS model, the AS is the equivalent of the `verifier`. -The AS receives attestation evidence and returns an attestation token -containing the results of a two-step verification process. +The Attestation Service (AS) validates TEE evidence. In RATS terms, the AS is the **Verifier**. The AS receives attestation evidence and returns an attestation token containing the results of a two-step verification process. The following diagram shows the AS components: -![attestation-services](attestation-services.png "Attestation Service components") - -The AS runs the following verification process: +![Attestation Service components alt-text#center](attestation-services.png "Attestation Service components") -1. Verify the formatting and the origin of the evidence - for example, checking the signature of the evidence. - This is accomplished by one of the platform-specific Verifier Drivers. -2. Evaluate the claims provided in the evidence - for example, validating that the measurements are what the - client expects. This is done by a Policy Engine with help from the RVPS. +The AS performs this verification flow: -##### Verifier driver +- **Verify format and origin of evidence** – for example, verify the evidence signature. This is handled by a platform-specific Verifier driver. +- **Evaluate claims** – for example, validate that measurements match expected values. This is handled by the Policy engine, with RVPS support. -A verifier driver parses the attestation evidence provided by the hardware TEE. It performs the following tasks: +## Verifier driver -1. Verifies the hardware TEE signature of the TEE quote and report provided in the evidence -2. 
Receives the evidence and organizes the status into a JSON format to be returned +A Verifier driver parses the attestation evidence provided by the hardware TEE and: -In this Learning Path, the AS is configured to use an external CCA verifier. +- Verifies the hardware TEE signature of the quote and report included in the evidence. +- Normalizes the verified evidence into a JSON structure to be returned. -[Linaro](https://www.linaro.org) provides such an attestation verifier for use with pre-silicon Arm CCA platforms. -This verifier is built from the Open-Source [Veraison project](https://github.com/veraison). -You can learn more about Veraison and Linaro attestation verifier service in -[Get Started with CCA Attestation and Veraison](https://learn.arm.com/learning-paths/servers-and-cloud-computing/cca-veraison/) +In this Learning Path, the AS is configured to use an external CCA Verifier. -##### Policy Engine +[Linaro](https://www.linaro.org) provides an attestation Verifier for pre-silicon Arm CCA platforms. It is built from the open-source [Veraison](https://github.com/veraison) project. Learn more in +[Get Started with CCA Attestation and Veraison](https://learn.arm.com/learning-paths/servers-and-cloud-computing/cca-veraison/). -The AS allows users to upload their own policies when performing evidence verification. -When an attestation request is received by the AS, it uses a policy ID in the request -to decide which policies should be evaluated. -The results of all policies evaluated are included in the attestation response. +## Policy engine -In this Learning Path the AS attestation policy includes specific Arm CCA rules. +The AS lets you upload custom policies used during evidence verification. When the AS receives an attestation request, it uses the policy ID in the request to decide which policies to evaluate. The attestation response includes the results of all evaluated policies. -#### Reference Value Provider Service (RVPS) +In this Learning Path, the AS policy includes Arm CCA–specific rules. -The reference value provider service (RVPS) is a component in the AS responsible for verifying, -storing, and providing reference values. RVPS receives and verifies inputs from the software -supply chain, stores the measurement values, and generates reference value claims for the AS. -This operation is performed based on the evidence verified by the AS. +## Reference Value Provider Service (RVPS) +RVPS verifies, stores, and provides reference values. It receives inputs from the software supply chain, stores measurement values, and generates reference value claims for the AS, based on evidence verified by the AS. -### Guest components +## Guest components -The guest components are the services/tools that run inside the realm (TEE). -When mapped to the RATS model, these components are the equivalent of the `Attester`. +Guest components run inside the realm (TEE). In RATS terms, these components act as the **Attester**. -For simplicity instead of Attestation Agent (AA) and Confidential Data Hub (CDH) -you will use [KBS Client Tool](https://github.com/confidential-containers/trustee/tree/main/tools/kbs-client) +For simplicity, instead of Attestation Agent (AA) and Confidential Data Hub (CDH), you will use the [KBS Client Tool](https://github.com/confidential-containers/trustee/tree/main/tools/kbs-client). This is a simple client for the KBS that facilitates basic attestation flows. -You will run this tool inside of a realm to make requests for an attestation result token (EAR) and a secret. 
+You will run this tool in a realm to make requests for an attestation result token (EAR) and a secret.
The client tool can also be used to provision the KBS/AS with resources and policies.
-KBS Client connects to the KBS in order to perform attestation. To prove the trustworthiness of the environment
-KBS Client sends the evidence (claims) from the TEE in the form of a CCA attestation token.
-You can learn more about CCA attestation tokens in
-[Get Started with CCA Attestation and Veraison](https://learn.arm.com/learning-paths/servers-and-cloud-computing/cca-veraison/)
+To prove the environment’s trustworthiness, the KBS Client sends CCA attestation evidence (a CCA attestation token) to the KBS. Learn more about CCA attestation tokens in
+[Get Started with CCA Attestation and Veraison](https://learn.arm.com/learning-paths/servers-and-cloud-computing/cca-veraison/).
+
+For convenience, Trustee services and the client software are packaged in Docker containers, which you can run on any suitable AArch64 or x86_64 development host. Because the client runs in a realm, it uses the Fixed Virtual Platform (FVP) and the reference software stack for Arm CCA. If you are new to running applications in realms with FVP, see
+[Run an application in a Realm using the Arm Confidential Computing Architecture (CCA)](/learning-paths/servers-and-cloud-computing/cca-container).
-For convenience, Trustee services and the client software are packaged in
-docker containers, which you can execute on any suitable AArch64 or x86_64
-development machine. Since the client software runs in a realm, it makes use
-of the Fixed Virtual Platform (FVP) and the reference software stack for Arm CCA.
-If you have not yet familiarized yourself with running applications in realms using
-FVP and the reference software stack, see the
-[Run an application in a Realm using the Arm Confidential Computing Architecture (CCA)](/learning-paths/servers-and-cloud-computing/cca-container)
-Learning Path.
+When the AS receives an attestation token from the realm through the KBS, it:
-When the AS receives an attestation token from the realm via KBS:
-- it calls an external CCA verifier (the Linaro attestation verifier service) to obtain an attestation result.
-- the external CCA verifier checks the token's cryptographic signature,
- verifies that it denotes a confidential computing platform and provides an attestation result.
-- it also checks the token evidences against its own attestation policies and updates attestation result status and trustworthiness vectors.
+- Calls an external CCA Verifier (the Linaro attestation Verifier service) to obtain an attestation result.
+- Checks the token’s cryptographic signature and confirms that it represents a confidential computing platform.
+- Evaluates evidence in the token against its policies and updates the attestation result and trustworthiness vectors.
When a resource is requested, the KBS uses the attestation result to decide whether to release secrets to the realm for processing.
-Figure 1 demonstrates the software architecture that you will construct to run the attestation example.
+This diagram shows the software architecture you will construct to run the attestation example:
-![cca-trustee](trustee.png "Figure 1: Software architecture for running attestation.")
+![Software architecture for running attestation alt-text#center](trustee.png "Software architecture for running attestation")
-You can now proceed to the next section to run the end-to-end attestation example with the software components and architecture as described here.
+Proceed to the next section to run the end-to-end attestation example using the components and architecture described here.
diff --git a/content/learning-paths/servers-and-cloud-computing/cca-trustee/flow.md b/content/learning-paths/servers-and-cloud-computing/cca-trustee/flow.md
index 63c5b033a7..0e314dfefc 100644
--- a/content/learning-paths/servers-and-cloud-computing/cca-trustee/flow.md
+++ b/content/learning-paths/servers-and-cloud-computing/cca-trustee/flow.md
@@ -7,21 +7,22 @@ weight: 3 # 1 is first, 2 is second, etc.
# Do not modify these elements
layout: "learningpathall"
---
+## Overview
+In this section, you’ll run the **Trustee services** (AS, KBS, RVPS), launch a **CCA realm** on **Arm FVP**, generate attestation evidence, and request a secret. You’ll intentionally fail the first request to see how **attestation policy** gates secret release, then **endorse the realm initial measurement (RIM)**, re-attest, and successfully retrieve the secret.
-### Run Trustee Services
+## Install dependencies
-#### Prerequisites
+Start by installing Docker. On Ubuntu 24.04 LTS, set up Docker’s APT repository:
-Install docker. For example, on your Ubuntu 24.04 LTS host machine, first set up Docker's apt repository:
-``` bash
+```bash
# Add Docker's official GPG key:
sudo apt-get update
-sudo apt-get install ca-certificates curl
+sudo apt-get install -y ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
-# Add the repository to Apt sources:
+# Add the repository to APT sources:
echo \
  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
  $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" | \
@@ -29,35 +30,39 @@ echo \
sudo apt-get update
```
-Install git and docker packages:
-``` bash
-sudo apt-get install git docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+Install Git and Docker packages:
+```bash
+sudo apt-get install -y git docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
```
-Add your user name to the docker group:
+Add your user name to the Docker group (open a new shell after this so the change takes effect):
``` bash
sudo usermod -aG docker $USER
newgrp docker
```
-#### Start Trustee Services docker containers
+## Start Trustee services containers
Clone the `cca-trustee` repository:
``` bash
git clone https://github.com/ArmDeveloperEcosystem/cca-trustee.git
```
-This repository contains configuration files used for running Trustee services docker containers with CCA attestation support as a simple cluster.
-The config files are based on the recommended configurations from [KBS Cluster](https://github.com/confidential-containers/trustee/blob/main/kbs/docs/cluster.md)
+This repository contains configuration to run Trustee services (KBS, AS, RVPS) with CCA attestation support as a simple cluster. The configuration is based on the recommended settings from [KBS Cluster](https://github.com/confidential-containers/trustee/blob/main/kbs/docs/cluster.md).
+
+Additional Learning Path–specific changes include:
+
+- External Linaro CCA verifier in the AS configuration
+- Attestation policy with CCA rules
+- An *affirming* resource policy
+- A demo secret message
-In addition to the recommended configuration, the following changes were also made for this Learning Path:
-- Included the external Linaro CCA verifier into AS configuration
-- Included an attestation policy with CCA rules
-- Defined an "affirming" resource policy
-- Created a secret demo message.
-- Defined a docker network shared by all containers in this demo.
+- A shared Docker network for all containers in this demo
-Go into the `cca-trustee` directory and start the Trustee services docker containers (as detached services):
+Go into the `cca-trustee` directory and start the Trustee services Docker containers (as detached services):
``` bash { output_lines = "3-9" }
cd cca-trustee
docker compose up -d
@@ -70,19 +75,17 @@ docker compose up -d
 ✔ Container cca-trustee-kbs-client-1 Started
```
-While running the demo you can also check logs of the Trustee services in this termimal:
+While running the demo you can also check logs of the Trustee services in this terminal:
``` bash
docker compose logs <service>
```
where `<service>` is `as`, `kbs`, or `rvps`.
-### Launch a CCA Realm with FVP
+## Launch a CCA Realm with FVP
-With the Trustee Services running in one terminal,
-open up a new terminal in which you will run CCA attestations.
+With the Trustee Services running in one terminal, open up a new terminal in which you will run CCA attestations.
-Pull the docker image with the pre-built FVP,
-and then run the container connected to the same docker network:
+Pull the Docker image with the pre-built FVP, and then run the container connected to the same Docker network:
```bash
docker pull armswdev/cca-learning-path:cca-simulation-v2
@@ -91,16 +94,15 @@
docker run --rm -it --network cca-trustee armswdev/cca-learning-path:cca-simulation-v2
```
-Within your running container,
-launch the `run-cca-fvp.sh` script to run the Arm CCA pre-built binaries on the FVP:
+Within your running container, launch the `run-cca-fvp.sh` script to run the Arm CCA pre-built binaries on the FVP:
```bash
./run-cca-fvp.sh
```
-The `run-cca-fvp.sh` script uses the screen command to connect to the different UARTs in the FVP.
+The `run-cca-fvp.sh` script uses the `screen` command to connect to the different UARTs in the FVP.
-You should see the host Linux kernel boot on your terminal. You will be prompted to log in to the host.
+When the host Linux boots, you will be prompted to log in.
Enter root as the username:
```output
@@ -120,10 +122,7 @@ cd /cca
You should see the realm boot.
-The `realm` will take some time to boot, please be patient.
-After boot up, you will be prompted to log in at the guest Linux prompt.
-
-Use root again as the username:
+After the realm boots, log in, again using root as the username:
```output
@@ -132,12 +131,9 @@ realm login: root
(realm) #
```
-### Try to use attestation to request a secret
+## Request a secret using attestation
-In this step, you will go through the process of using attestation to request
-a secret from the KBS. This will not work on the first attempt.
-But don't worry. You will learn why that is the case, and how to rectify the problem.
-You will have a better understanding of the attestation process as a result. +This first attempt intentionally fails so you can see why and how attestation policy gates secret release. Change directory to `/cca` and use `openssl` to create a realm RSA key: ```bash @@ -147,15 +143,17 @@ openssl genrsa -traditional -out realm.key Run the attestation command and save the EAT Attestation Result (EAR) message in JWT (JSON Web Token) format in a file named `ear.jwt`: ```bash -./kbs-client --url http://kbs:8080 attest --tee-key-file realm.key >ear.jwt +./kbs-client --url http://kbs:8080 attest --tee-key-file realm.key > ear.jwt ``` -Now try to request a secret demo message using the attestation result: +Request the demo secret with that EAR: + ```bash -./kbs-client --url http://kbs:8080 get-resource \ + ./kbs-client --url http://kbs:8080 get-resource \ --tee-key-file realm.key --attestation-token ear.jwt \ --path "cca-trustee/demo-message/message.txt" -``` +``` + The request will fail with `Access denied by policy` and `Token Verifier` errors: ```output @@ -174,14 +172,9 @@ The request will fail with `Access denied by policy` and `Token Verifier` errors Error: request unauthorized ``` -Proceed to the next step to understand why the KBS did not grant access -to the requested secret, and how to resolve the problem. - -#### Evaluate the Attestation Result +## Evaluate the Attestation result -In the previous step, the KBS failed to provide the requested secret. -To understand why this happened, you need to learn more about how -the attestation result is used to evaluate the trustworthiness of a CCA realm. +In the previous step, the KBS failed to provide the requested secret. To understand why this happened, you need to learn more about how the attestation result is used to evaluate the trustworthiness of a CCA realm. In this step, you will examine the attestation result more closely. The following command will use the `arc` tool to verify the cryptographic signature on the attestation result and display the result in a human-readable format: @@ -191,23 +184,22 @@ The following command will use the `arc` tool to verify the cryptographic signat ``` {{% notice EAR expiry note %}} -The EAR message produced by Trustee AS in this Learning Path demo is valid for 30 minutes. +The EAR is valid for 30 minutes. If it expires, re-run the attestation command to generate a fresh token. If you spend more time on analyzing the message you will start seeing errors from `arc verify` command: ``` output Using JWK key from JWT header Error: verifying signed EAR from "ear.jwt" using "JWK header" key: failed verifying JWT message: jwt.Parse: failed to parse token: jwt.Validate: validation failed: "exp" not satisfied: token is expired ``` - -Please obtain a new EAR message by re-running the attestation command. {{% /notice %}} The `arc verify` command produces quite a lot of output. + However, the main part is the CCA attestation token that is similar to the one you inspected in [Get Started with CCA Attestation and Veraison](/learning-paths/servers-and-cloud-computing/cca-veraison) Learning Path. 
-The most interesting part of the output is towards the bottom, and should look like this:
+Check the trustworthiness vectors near the end of the output:

```output
[trustworthiness vectors]
@@ -222,11 +214,9 @@ Storage Opaque [none]: no claim being made
Sourced Data [none]: no claim being made
```

-This part of the output shows how the attestation service has compared the attestation token against its expectations of a trustworthy system.
-These comparisons are known as "trustworthiness vectors".
-It also shows the conclusions that were drawn from that comparison.
+This part of the output shows how the attestation service has compared the attestation token against its expectations of a trustworthy system. These comparisons are known as *trustworthiness vectors*. It also shows the conclusions that were drawn from that comparison.

-Please notice these two trustworthiness vectors in the result:
+Note these two trustworthiness vectors in the result:

- __Hardware [affirming]__. Evidence in the attestation token shows a good match against the expectations of the CCA platform.
- __Executables [warning]__. The attestation token does not show a good match against the expectation that a recognized, approved set of executables was loaded during the boot process.

@@ -236,13 +226,13 @@ You can also check the status of the EAR:
"ear.status": "warning",
```

-The warning status is the reason why the KBS chose not to grant access
+The warning status is the reason why the KBS does not grant access
to the secret that you requested in the earlier step.
It has not concluded that the realm is trustworthy.
But this is simply because you have not supplied an expected
reference measurement for the realm. You will do this in the next step.

-### Endorse Realm Initial Measurement (RIM)
+## Endorse Realm Initial Measurement (RIM)

For a successful attestation of your CCA realm you need to provide the Trustee Reference Values Provider Service (RVPS) with a known good reference value.
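
The `arc` tool verifies the EAR's signature for you. If you also want to inspect the raw claims (such as `ear.status`) without `arc`, remember that a JWT is three base64url-encoded segments separated by dots. The following is a minimal sketch, assuming the standard `cut`, `tr`, and `base64` utilities are available in the realm:

```bash
# Extract the payload (second) segment of the EAR JWT
payload=$(cut -d '.' -f2 ear.jwt | tr '_-' '/+')

# base64url omits padding, so restore it before decoding
while [ $(( ${#payload} % 4 )) -ne 0 ]; do payload="${payload}="; done

# Decode and print the raw JSON claims
echo "$payload" | base64 -d
```

The decoded JSON contains the same `ear.status` and trustworthiness vector claims that `arc verify` renders in human-readable form.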
@@ -271,7 +261,7 @@ In the terminal where you started Trustee services, run `endorse-rim.sh` script Reference Values Updated ``` -### Re-run attestation and request a secret +## Re-run attestation and request a secret In the realm terminal re-run the attestation command: ```bash @@ -285,7 +275,7 @@ Verify that the new EAR now contains `affirming` status: "ear.status": "affirming", ``` -and `affirming` result for the `Executables` trustworthness vector: +and `affirming` result for the `Executables` trustworthiness vector: ```bash { output_lines = "2-11" } ./arc verify ear.jwt |grep -A10 "trustworthiness vectors" [trustworthiness vectors] diff --git a/content/learning-paths/servers-and-cloud-computing/deepseek-cpu/_index.md b/content/learning-paths/servers-and-cloud-computing/deepseek-cpu/_index.md index 7bba539482..3fa5dea73e 100644 --- a/content/learning-paths/servers-and-cloud-computing/deepseek-cpu/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/deepseek-cpu/_index.md @@ -25,7 +25,7 @@ operatingsystems: - Linux tools_software_languages: - LLM - - GenAI + - Generative AI - Python diff --git a/content/learning-paths/servers-and-cloud-computing/distributed-inference-with-llama-cpp/_index.md b/content/learning-paths/servers-and-cloud-computing/distributed-inference-with-llama-cpp/_index.md index f14c87fac8..c66f54ae2f 100644 --- a/content/learning-paths/servers-and-cloud-computing/distributed-inference-with-llama-cpp/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/distributed-inference-with-llama-cpp/_index.md @@ -27,7 +27,7 @@ armips: - Neoverse tools_software_languages: - LLM - - GenAI + - Generative AI - AWS operatingsystems: - Linux diff --git a/content/learning-paths/servers-and-cloud-computing/django/_index.md b/content/learning-paths/servers-and-cloud-computing/django/_index.md index e022248793..9aa84d0447 100644 --- a/content/learning-paths/servers-and-cloud-computing/django/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/django/_index.md @@ -26,7 +26,7 @@ armips: tools_software_languages: - Django - Python - - Nginx + - NGINX - PostgreSQL operatingsystems: - Linux diff --git a/content/learning-paths/servers-and-cloud-computing/dotnet-migration/_index.md b/content/learning-paths/servers-and-cloud-computing/dotnet-migration/_index.md index 24f910e5ec..8da5c61ffd 100644 --- a/content/learning-paths/servers-and-cloud-computing/dotnet-migration/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/dotnet-migration/_index.md @@ -28,7 +28,7 @@ armips: - Neoverse tools_software_languages: - .NET - - OrchardCore + - Orchard Core - C operatingsystems: - Linux @@ -36,7 +36,7 @@ operatingsystems: further_reading: - resource: - title: OrchardCore documentation + title: Orchard Core documentation link: https://docs.orchardcore.net/ type: documentation - resource: diff --git a/content/learning-paths/servers-and-cloud-computing/envoy-gcp/_index.md b/content/learning-paths/servers-and-cloud-computing/envoy-gcp/_index.md index 75351eaac2..4e66d05f1b 100644 --- a/content/learning-paths/servers-and-cloud-computing/envoy-gcp/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/envoy-gcp/_index.md @@ -1,28 +1,25 @@ --- -title: Deploy Envoy on Google Axion processors +title: Deploy Envoy Proxy on Google Cloud C4A (Arm-based Axion VMs) -draft: true -cascade: - draft: true - minutes_to_complete: 30 -who_is_this_for: This is an introductory topic is for software developers interested in migrating their Envoy workloads from 
x86_64 servers to Arm-based servers, specifically on Google Axion–based C4A virtual machines.
+who_is_this_for: This introductory topic is for software developers migrating Envoy Proxy workloads from x86_64 to Arm-based servers, specifically on Google Cloud C4A virtual machines built on Axion processors.
+
learning_objectives:
- - Start an Arm virtual machine on Google Cloud Platform (GCP) using the C4A Google Axion instance
- - Install and configure Envoy on Arm-based GCP C4A instances
- - Validate Envoy functionality through baseline testing
- - Benchmark Envoy performance on Arm
+ - Provision an Arm-based C4A VM on Google Cloud Platform (GCP)
+ - Install and configure Envoy Proxy on a C4A instance
+ - Validate Envoy functionality with baseline tests
+ - Benchmark Envoy performance on both Arm64 (AArch64) and x86_64 architectures

prerequisites:
 - A [Google Cloud Platform (GCP)](https://cloud.google.com/free?utm_source=google&hl=en) account with billing enabled
- - Familiarity with networking concepts and the [Envoy architecture](https://www.envoyproxy.io/docs/envoy/latest/).
+ - Familiarity with networking concepts and the [Envoy architecture](https://www.envoyproxy.io/docs/envoy/latest/)

author: Pareena Verma

##### Tags
-skilllevels: Advanced
+skilllevels: Introductory
subjects: Web
cloud_service_providers: Google Cloud
@@ -31,7 +28,9 @@ armips:

tools_software_languages:
 - Envoy
- - Siege
+ - Siege
+ - Networking
+ - Service Mesh
operatingsystems:
 - Linux
@@ -41,7 +40,7 @@ operatingsystems:
# ================================================================================
further_reading:
 - resource:
- title: Google Cloud official documentation
+ title: Google Cloud documentation
 link: https://cloud.google.com/docs
 type: documentation
@@ -51,11 +50,11 @@ operatingsystems:
 type: documentation

 - resource:
- title: The official documentation for Siege
+ title: Siege documentation
 link: https://www.joedog.org/siege-manual/
 type: documentation

-weight: 1 # _index.md always has weight of 1 to order correctly
-layout: "learningpathall" # All files under learning paths have this same wrapper
-learning_path_main_page: "yes" # Indicates this should be surfaced when looking for related content. Only set for _index.md of learning path content.
+weight: 1
+layout: "learningpathall"
+learning_path_main_page: "yes"
---
diff --git a/content/learning-paths/servers-and-cloud-computing/envoy-gcp/background.md b/content/learning-paths/servers-and-cloud-computing/envoy-gcp/background.md
index 633ea96aa0..9952aa2028 100644
--- a/content/learning-paths/servers-and-cloud-computing/envoy-gcp/background.md
+++ b/content/learning-paths/servers-and-cloud-computing/envoy-gcp/background.md
@@ -1,23 +1,20 @@
---
-title: Getting started with Envoy on Google Axion C4A (Arm Neoverse-V2)
+title: Get started with Envoy Proxy on Google Axion C4A (Arm Neoverse V2)
weight: 2
-
layout: "learningpathall"
---

-## Google Axion C4A Arm instances in Google Cloud
-
-Google Axion C4A is a family of Arm-based virtual machines built on Google’s custom Axion CPU, which is based on Arm Neoverse-V2 cores. Designed for high-performance and energy-efficient computing, these virtual machines offer strong performance for modern cloud workloads such as CI/CD pipelines, microservices, media processing, and general-purpose applications.
+## Google Axion C4A instances on Google Cloud

-The C4A series provides a cost-effective alternative to x86 virtual machines while leveraging the scalability and performance benefits of the Arm architecture in Google Cloud.
+Google Axion C4A is a family of Arm-based virtual machines powered by Google’s custom Axion CPU, which is built on Arm Neoverse V2 cores. Designed for high performance and energy efficiency, these VMs are well-suited to modern cloud workloads such as CI/CD pipelines, microservices, media processing, and general-purpose applications.

-To learn more about Google Axion, refer to the [Introducing Google Axion Processors, our new Arm-based CPUs](https://cloud.google.com/blog/products/compute/introducing-googles-new-arm-based-cpu) blog.
+The C4A series can provide a cost-efficient alternative to x86 VMs while leveraging the scalability and performance characteristics of the Arm64 (AArch64) architecture on Google Cloud.

-## Envoy for service proxy and traffic management on Arm
+To learn more about Google Axion, see the blog [Introducing Google Axion Processors, our new Arm-based CPUs](https://cloud.google.com/blog/products/compute/introducing-googles-new-arm-based-cpu).

-Envoy is an open-source, high-performance edge and service proxy designed for cloud-native applications.
+## Envoy Proxy for service proxying and traffic management on Arm

-It handles service-to-service communication, traffic routing, load balancing, and observability, making microservices more reliable and secure.
+Envoy Proxy is an open-source, high-performance edge and service proxy designed for cloud-native applications. It handles service-to-service communication, traffic routing, load balancing, and observability, improving the reliability and security of microservices.

-Envoy is widely used in service meshes, API gateways, and modern cloud environments. Learn more from the [Envoy official website](https://www.envoyproxy.io/) and its [official documentation](https://www.envoyproxy.io/docs/envoy/latest/).
+Envoy is widely used in service meshes, API gateways, and modern cloud environments. Learn more on the [Envoy website](https://www.envoyproxy.io/) and in the [Envoy documentation](https://www.envoyproxy.io/docs/envoy/latest/).
diff --git a/content/learning-paths/servers-and-cloud-computing/envoy-gcp/baseline-testing.md b/content/learning-paths/servers-and-cloud-computing/envoy-gcp/baseline-testing.md
index ec16a99ec4..1df3f27757 100644
--- a/content/learning-paths/servers-and-cloud-computing/envoy-gcp/baseline-testing.md
+++ b/content/learning-paths/servers-and-cloud-computing/envoy-gcp/baseline-testing.md
@@ -1,24 +1,32 @@
---
-title: Envoy baseline testing on Google Axion C4A Arm Virtual machine
+title: Run baseline Envoy testing on a Google Axion C4A Arm VM
weight: 5

### FIXED, DO NOT MODIFY
layout: learningpathall
---
+## Validate Envoy installation with a baseline test

-With Envoy installed successfully on your GCP C4A Arm virtual machine, you will proceed to validate that the Envoy is running as expected.
+With Envoy installed successfully on your GCP C4A Arm virtual machine, you can now validate that Envoy is running as expected.

-## Validate Envoy installation with a baseline test
+In this section, you will do the following:
+
+- Create a minimal Envoy config
+- Start Envoy with that config
+- Verify functionality using `curl`
+
+The test confirms the following:

-In this section, you will learn how to create a minimal Envoy config, start Envoy with it, and verify functionality using `curl`.
-The test will confirm that Envoy listens on port **10000**, forwards requests to `httpbin.org`, and returns a successful **200 OK** response. +- Envoy listens on port **10000** +- Forwards requests to `httpbin.org` +- Returns a **200 OK** response -### Create a Minimal Configuration File +## Create a minimal configuration file -Using a file editor of your choice, create a file named `envoy_config.yaml`, and add the below content to it. This file configures Envoy to listen on port **10000** and forward all traffic to `http://httpbin.org`. The `host_rewrite_literal` is essential to prevent 404 Not Found errors from the upstream server. +Using a text editor, create a file named `envoy_config.yaml` and add the following content as shown below. -```YAML +```yaml static_resources: listeners: - name: listener_0 @@ -64,18 +72,23 @@ static_resources: address: httpbin.org port_value: 80 ``` -- **Listeners:** Envoy is configured to accept incoming HTTP requests on port **10000** of your VM. -- **HTTP Connection Manager:** A filter processes the incoming requests, directing them to the appropriate backend. + +## Explanatory notes on the configuration + +This configures Envoy to listen on port **10000** and forward all traffic to `http://httpbin.org`. The `host_rewrite_literal` is required to prevent `404 Not Found` from the upstream server. + +- **Listeners:** Envoy accepts incoming HTTP requests on port **10000** of your VM. +- **HTTP Connection Manager:** Processes incoming requests, and applies routing. - **Routing:** All traffic is routed to the `service_httpbin` cluster, with the `Host` header rewritten to `httpbin.org`. -- **Clusters:** The `service_httpbin` cluster defines the upstream service as `httpbin.org` on port **80**, which is where requests are ultimately forwarded. +- **Clusters:** The `service_httpbin` cluster defines the upstream as `httpbin.org:80`. -### Run and Test Envoy +## Run and test Envoy This is the final phase of functional validation, confirming that the proxy is operational. -Start the Envoy proxy using your configuration file as shown on your current terminal: +Start the Envoy proxy using your configuration file: ```console - envoy -c envoy_config.yaml --base-id 1 +envoy -c envoy_config.yaml --base-id 1 ``` The output should look similar to: @@ -90,14 +103,14 @@ The output should look similar to: [2025-08-21 11:53:51.599][67137][info][config] [source/common/listener_manager/listener_manager_impl.cc:930] all dependencies initialized. starting workers ``` -Now, open a new terminal and send a test request to the Envoy listener using `curl`. +Leave this terminal running. In a new terminal, send a test request to the Envoy listener using `curl`: ```console curl -v http://localhost:10000/get ``` The `-v` flag provides verbose output, showing the full request and response headers. A successful test will show a **HTTP/1.1 200 OK** response with a JSON body from `httpbin.org`. -The output should look similar to: +A successful test shows HTTP/1.1 200 OK with a JSON body from httpbin.org, for example: ```output * Trying 127.0.0.1:10000... @@ -131,11 +144,11 @@ The output should look similar to: } * Connection #0 to host 127.0.0.1 left intact ``` -#### Summary of the curl Output +## Summary of the curl output -- **Successful Connection:** The `curl` command successfully connected to the Envoy proxy on `localhost:10000`. -- **Correct Status Code:** Envoy successfully forwarded the request and received a successful `200 OK` response from the upstream server. 
-**Host Header Rewrite:** The Host header was correctly modified from `localhost:10000` to `httpbin.org` as defined in the configuration.
-- **End-to-End Success:** The proxy is fully operational, proving that requests are correctly received, processed, and forwarded to the intended backend.
+- **Successful connection:** The `curl` command successfully connected to the Envoy proxy on `localhost:10000`.
+- **Correct status code:** Envoy forwards the request and receives a successful `200 OK` response from the upstream.
+- **Host header rewrite:** Envoy rewrites `Host` to `httpbin.org` as configured.
+- **End-to-end success:** The proxy is operational; requests are received, processed, and forwarded to the backend.

-This confirms the end-to-end flow with Envoy server is working correctly.
+This confirms that the end-to-end flow through the Envoy server is working correctly. To stop Envoy in the first terminal, press **Ctrl+C**.
diff --git a/content/learning-paths/servers-and-cloud-computing/envoy-gcp/benchmarking.md b/content/learning-paths/servers-and-cloud-computing/envoy-gcp/benchmarking.md
index bb9675b17d..9c58032cfd 100644
--- a/content/learning-paths/servers-and-cloud-computing/envoy-gcp/benchmarking.md
+++ b/content/learning-paths/servers-and-cloud-computing/envoy-gcp/benchmarking.md
@@ -1,5 +1,5 @@
---
-title: Envoy performance benchmarks on Arm64 and x86_64 in Google Cloud
+title: Benchmark Envoy on Google Cloud for Arm64 and x86_64 with Siege
weight: 6

### FIXED, DO NOT MODIFY
@@ -8,19 +8,20 @@ layout: learningpathall

## How to run Envoy benchmarks with Siege on Arm64 in GCP

-**Siege** is a lightweight HTTP load testing and benchmarking tool that simulates concurrent users making requests to a target service. It is useful for Envoy benchmarking because it measures availability, throughput, response time, and failure rates under load, thus helping evaluate Envoy’s performance as a proxy under real-world traffic conditions.
+Siege is a lightweight HTTP load testing and benchmarking tool that simulates concurrent users making requests to a target service. It is useful for Envoy benchmarking because it measures availability, throughput, response time, and failure rates under load, thus helping evaluate Envoy’s performance as a proxy under real-world traffic conditions.

Follow the steps outlined to run Envoy benchmarks using Siege.

-### Install Siege(Build from Source)
+## Install Siege (build from source)

-1. Install required build tools
+
+Install the required build tools:

```console
sudo dnf groupinstall -y "Development Tools"
sudo dnf install -y wget make gcc
```

-2. Download, extract and build Siege source
+Download, extract, and build the Siege source:

```console
wget http://download.joedog.org/siege/siege-4.1.6.tar.gz
@@ -30,14 +31,14 @@ cd siege-4.1.6
make
sudo make install
```
-You have now successfully built and installed Seige on your Arm-based machine.
+You have now successfully built and installed Siege on your Arm-based machine.

-3. Verify installation
+Verify the installation:

```console
siege --version
```
-This checks if Siege is installed properly and shows the version number.
+This checks that Siege is installed properly and shows the version number:

```output
SIEGE 4.1.6
@@ -46,9 +47,10 @@ This is free software; see the source for copying conditions.
There is NO warranty; not even for MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.
```

-### Envoy Benchmarking
+## Envoy benchmarking
+

-1.
To start, make sure Envoy is up and running with your config file (listening on port 10000): +To start, make sure Envoy is up and running with your config file (listening on port 10000): ```console @@ -56,16 +58,18 @@ envoy -c envoy_config.yaml --base-id 1 ``` This runs the Envoy proxy with your configuration file (envoy_config.yaml) so it can start listening for requests. -2. On another terminal, verify that envoy is running as expected with curl: +On another terminal, verify that Envoy is running as expected with curl: + ``` curl -v http://127.0.0.1:10000/get ``` Running from another terminal returns a **200 OK** status, confirming that Envoy is running and successfully processing requests. -3. Run a Time-based Load Test +## Run a time-based load test + +There are different ways you can set up your benchmark tests. Here, you will run a benchmark for a fixed time instead of using a request count: -There are different ways you can setup your benchmark tests. Here you will run a Benchmark for a fixed time instead of using request count: ```console siege -c30 -t10S http://127.0.0.1:10000/get @@ -101,7 +105,7 @@ Longest transaction: 2.89 Shortest transaction: 0.02 ``` -### Understanding Envoy benchmark metrics and results with Siege +## Understanding Envoy benchmark metrics and results with Siege - **Transactions**: Total number of completed requests during the benchmark. - **Availability**: Percentage of requests that returned a successful response. @@ -116,8 +120,9 @@ Shortest transaction: 0.02 - **Longest Transaction**: Maximum response time observed for a single request. - **Shortest Transaction**: Minimum response time observed for a single request. -### Benchmark summary on x86_64: -To compare the benchmark results, the following results were collected by running the same benchmark on a `c3-standard-4` (4 vCPU, 2 core, 16 GB Memory) x86_64 virtual machine in GCP, running RHEL 9. +## Benchmark summary on x86_64 +To compare the benchmark results, the following were collected by running the same benchmark on a `c3-standard-4` (4 vCPU, 16 GB memory) x86_64 VM in GCP, running RHEL 9: + | Metric | Value | Metric | Value | |-------------------------|--------------|---------------------------|-----------------| @@ -128,8 +133,9 @@ To compare the benchmark results, the following results were collected by runnin | Successful transactions | 720 | Failed transactions | 8 | | Longest transaction | 4.63 secs | Shortest transaction | 0.02 secs | -### Benchmark summary on Arm64: -Results from the earlier run on the c4a-standard-4 (4 vCPU, 16 GB memory) Arm64 VM in GCP (RHEL 9): +## Benchmark summary on Arm64 +Results from the earlier run on the `c4a-standard-4` (4 vCPU, 16 GB memory) Arm64 VM in GCP (RHEL 9): + | Metric | Value | Metric | Value | |-------------------------|---------------|---------------------------|-----------------| @@ -143,7 +149,7 @@ Results from the earlier run on the c4a-standard-4 (4 vCPU, 16 GB memory) Arm64 ### Envoy performance benchmarking comparison on Arm64 and x86_64 When you compare the benchmarking performance results between the two instance types with the same vCPUs, you will notice that on the Google Axion C4A Arm-based instances: -- You have more successful transactions, fewer failures. -- Lower response times, higher transaction rate, better throughput. 
+- You have more successful transactions, fewer failures
+- Lower response times, higher transaction rate, better throughput

-You have successfully learned how to use Siege to benchmark Envoy on your Arm-based Axion Google cloud instance, validating both performance and reliability against similar x86 instances.
+You have successfully learned how to use Siege to benchmark Envoy on your Arm-based Axion Google Cloud instance, validating both performance and reliability against similar x86_64 instances.
diff --git a/content/learning-paths/servers-and-cloud-computing/envoy-gcp/deploy.md b/content/learning-paths/servers-and-cloud-computing/envoy-gcp/deploy.md
index b932b53b36..649ccd63f0 100644
--- a/content/learning-paths/servers-and-cloud-computing/envoy-gcp/deploy.md
+++ b/content/learning-paths/servers-and-cloud-computing/envoy-gcp/deploy.md
@@ -1,5 +1,5 @@
---
-title: How to deploy Envoy on Google Axion C4A Arm virtual machines
+title: Deploy Envoy on Google Axion C4A Arm virtual machines
weight: 4

### FIXED, DO NOT MODIFY
@@ -7,10 +7,10 @@ layout: learningpathall
---


-## How to deploy Envoy on a Google Axion C4A Arm virtual machine
-In this section you will learn how to install Envoy Proxy v1.30.0 on a Google Cloud Axion C4A virtual machine running RHEL 9. You will install the dependencies, download the official static Arm64 Envoy binary and check the installed version.
+## Install Envoy Proxy v1.30.0 on a Google Axion C4A Arm VM
+In this section you'll install Envoy Proxy v1.30.0 on a Google Cloud Axion C4A virtual machine running RHEL 9. You'll install the dependencies, download the official static Arm64 Envoy binary, and verify the installation.

-1. Install Dependencies
+## Install dependencies

```console
sudo dnf install -y \
@@ -25,10 +25,9 @@ sudo dnf install -y \
 pip3 install virtualenv
```

-2. Install Envoy (Static Arm64 Binary)
+## Install Envoy (static Arm64 binary)

-You will now download and install the Envoy binary on your Arm-based instance.
-Download the binary directly to **/usr/local/bin/envoy**. The `-L` flag is crucial as it follows any redirects from the download URL.
+Download the Envoy binary directly to **/usr/local/bin/envoy**. The `-L` flag follows redirects:

```console
sudo curl -L \
@@ -40,17 +39,16 @@ sudo curl -L \
```
Change the permissions on the downloaded binary to make it an executable:
```console
sudo chmod +x /usr/local/bin/envoy
```
-Verify the installation by checking its version.
+Verify the installation by checking its version:
```console
envoy --version
```
This confirms the binary is correctly placed and executable.
-The output should look like:
+Expected output:
```output
envoy version: 50ea83e602d5da162df89fd5798301e22f5540cf/1.30.0/Clean/RELEASE/BoringSSL
```
-This confirms the installation of Envoy.
-You can now proceed with the baseline testing in the next section.
+Envoy is now installed. Continue to baseline testing in the next section.
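
As an optional sanity check, you can also confirm that the binary you downloaded is an Arm64 executable. This assumes the `file` utility is present, which it usually is on RHEL 9:

```console
file /usr/local/bin/envoy
```

The output should mention `ELF 64-bit` and `aarch64`, confirming you did not accidentally download an x86_64 build.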
diff --git a/content/learning-paths/servers-and-cloud-computing/envoy-gcp/instance.md b/content/learning-paths/servers-and-cloud-computing/envoy-gcp/instance.md index 87db47d57e..04bba3a19e 100644 --- a/content/learning-paths/servers-and-cloud-computing/envoy-gcp/instance.md +++ b/content/learning-paths/servers-and-cloud-computing/envoy-gcp/instance.md @@ -1,30 +1,31 @@ --- -title: How to create a Google Axion C4A Arm virtual machine on GCP +title: Create a Google Axion C4A Arm virtual machine on GCP weight: 3 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## How to create a Google Axion C4A Arm VM on Google Cloud +## Overview -In this section, you will learn how to provision a Google Axion C4A Arm virtual machine on Google Cloud Platform (GCP) using the **c4a-standard-4 (4 vCPUs, 16 GB memory)** machine type in the Google Cloud Console. +In this section, you will learn how to provision a Google Axion C4A Arm virtual machine on Google Cloud Platform (GCP) using the `c4a-standard-4` (4 vCPUs, 16 GB memory) machine type in the Google Cloud Console. -For details on GCP setup, refer to the [Getting started with Google Cloud Platform](https://learn.arm.com/learning-paths/servers-and-cloud-computing/csp/google/) Learning Path. +{{% notice Note %}} +For support on GCP setup, see the Learning Path [Getting started with Google Cloud Platform](https://learn.arm.com/learning-paths/servers-and-cloud-computing/csp/google/). +{{% /notice %}} -### Create a Google Axion C4A Arm VM in Google Cloud Console +## Provision a Google Axion C4A Arm VM in Google Cloud Console To create a virtual machine based on the C4A instance type: -1. Navigate to the [Google Cloud Console](https://console.cloud.google.com/). -2. Go to **Compute Engine > VM Instances** and select **Create Instance**. -3. Under **Machine configuration**: - - Enter details such as **Instance name**, **Region**, and **Zone**. +- Navigate to the [Google Cloud Console](https://console.cloud.google.com/). +- Go to **Compute Engine > VM Instances** and select **Create Instance**. +- Under **Machine configuration**: + - Populate fields such as **Instance name**, **Region**, and **Zone**. - Set **Series** to `C4A`. - - Select a machine type such as `c4a-standard-4`. + - Select `c4a-standard-4` for machine type. - ![Create a Google Axion C4A Arm virtual machine in the Google Cloud Console with c4a-standard-4 selected alt-text#center](./image1.png "Google Cloud Console – creating a Google Axion C4A Arm virtual machine") + ![Create a Google Axion C4A Arm virtual machine in the Google Cloud Console with c4a-standard-4 selected alt-text#center](./image1.png "Creating a Google Axion C4A Arm virtual machine in Google Cloud Console") -4. Under **OS and Storage**, select **Change**, then choose an Arm64-based OS image. - For this Learning Path, use **Red Hat Enterprise Linux 9**. Ensure you select the **Arm image** variant. Click **Select**. -5. Under **Networking**, enable **Allow HTTP traffic**. -6. Click **Create** to launch the instance. +- Under **OS and Storage**, select **Change**, then choose an Arm64-based OS image. For this Learning Path, use **Red Hat Enterprise Linux 9**. Ensure you select the **Arm image** variant. Click **Select**. +- Under **Networking**, enable **Allow HTTP traffic**. +- Click **Create** to launch the instance. 
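
If you prefer the command line to the console, a roughly equivalent `gcloud` invocation is sketched below. The instance name, zone, and image family are illustrative assumptions; adjust them for your project and confirm that C4A machine types are available in the zone you pick:

```console
gcloud compute instances create my-c4a-vm \
  --zone=us-central1-a \
  --machine-type=c4a-standard-4 \
  --image-family=rhel-9-arm64 \
  --image-project=rhel-cloud \
  --tags=http-server
```

On a default network, the `http-server` tag applies the standard allow-HTTP firewall rule, mirroring the **Allow HTTP traffic** checkbox in the console.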
diff --git a/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/_index.md b/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/_index.md
index f56a19089e..7e3bcbd4b9 100644
--- a/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/_index.md
+++ b/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/_index.md
@@ -23,7 +23,7 @@ subjects: Performance and Architecture
armips:
 - Neoverse
tools_software_languages:
- - Perf
+ - perf
 - Runbook
operatingsystems:
 - Linux
diff --git a/content/learning-paths/servers-and-cloud-computing/funasr/_index.md b/content/learning-paths/servers-and-cloud-computing/funasr/_index.md
index edfe947480..ee20ec87bb 100644
--- a/content/learning-paths/servers-and-cloud-computing/funasr/_index.md
+++ b/content/learning-paths/servers-and-cloud-computing/funasr/_index.md
@@ -26,7 +26,7 @@ tools_software_languages:
 - ModelScope
 - FunASR
 - LLM
- - GenAI
+ - Generative AI
 - Python
diff --git a/content/learning-paths/servers-and-cloud-computing/github-actions-runner/_index.md b/content/learning-paths/servers-and-cloud-computing/github-actions-runner/_index.md
index bceac5fad9..42121a6e12 100644
--- a/content/learning-paths/servers-and-cloud-computing/github-actions-runner/_index.md
+++ b/content/learning-paths/servers-and-cloud-computing/github-actions-runner/_index.md
@@ -24,7 +24,7 @@ armips:
 - Neoverse

tools_software_languages:
- - CloudFormation
+ - AWS CloudFormation
 - GitHub
 - AWS EC2
diff --git a/content/learning-paths/servers-and-cloud-computing/glibc-with-lse/_index.md b/content/learning-paths/servers-and-cloud-computing/glibc-with-lse/_index.md
index f7e72bddab..eea49bbb39 100644
--- a/content/learning-paths/servers-and-cloud-computing/glibc-with-lse/_index.md
+++ b/content/learning-paths/servers-and-cloud-computing/glibc-with-lse/_index.md
@@ -31,7 +31,7 @@ operatingsystems:
- Linux

tools_software_languages:
-- Glibc
+- glibc
- LSE
- MongoDB
- Runbook
diff --git a/content/learning-paths/servers-and-cloud-computing/irq-tuning-guide/_index.md b/content/learning-paths/servers-and-cloud-computing/irq-tuning-guide/_index.md
new file mode 100644
index 0000000000..bc5f687039
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/irq-tuning-guide/_index.md
@@ -0,0 +1,45 @@
+---
+title: Learn about the impact of NIC IRQ patterns on cloud performance
+
+draft: true
+cascade:
+  draft: true
+
+minutes_to_complete: 20
+
+who_is_this_for: This is for anyone interested in understanding how IRQ patterns can enhance networking workload performance in the cloud.
+
+
+learning_objectives:
+  - Analyze the current IRQ layout on the machine.
+  - Test different options and patterns to improve performance.
+
+prerequisites:
+  - An Arm computer running Linux.
+  - Some familiarity with the Linux command line.
+
+author: Kiel Friedt
+
+### Tags
+skilllevels: Introductory
+subjects: Performance and Architecture
+armips:
+  - AArch64
+tools_software_languages:
+
+operatingsystems:
+  - Linux
+
+
+further_reading:
+  - resource:
+      title: Perf for Linux on Arm (LinuxPerf)
+      link: https://learn.arm.com/install-guides/perf/
+      type: website
+
+### FIXED, DO NOT MODIFY
+# ================================================================================
+weight: 1 # _index.md always has weight of 1 to order correctly
+layout: "learningpathall" # All files under learning paths have this same wrapper
+learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content.
+---
diff --git a/content/learning-paths/servers-and-cloud-computing/irq-tuning-guide/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/irq-tuning-guide/_next-steps.md
new file mode 100644
index 0000000000..c3db0de5a2
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/irq-tuning-guide/_next-steps.md
@@ -0,0 +1,8 @@
+---
+# ================================================================================
+# FIXED, DO NOT MODIFY THIS FILE
+# ================================================================================
+weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation.
+title: "Next Steps" # Always the same, html page title.
+layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing.
+---
diff --git a/content/learning-paths/servers-and-cloud-computing/irq-tuning-guide/checking.md b/content/learning-paths/servers-and-cloud-computing/irq-tuning-guide/checking.md
new file mode 100644
index 0000000000..3eee3c2d17
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/irq-tuning-guide/checking.md
@@ -0,0 +1,72 @@
+---
+title: Checking IRQs
+weight: 2
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+First, run the following command to list every IRQ on the system, the CPUs it is allowed to run on, and the device it serves. Identify the NIC IRQs, then experiment with different assignments and measure how performance changes.
+
+```
+grep '' /proc/irq/*/smp_affinity_list | while IFS=: read path cpus; do
+  irq=$(basename $(dirname $path))
+  device=$(grep -E "^ *$irq:" /proc/interrupts | awk '{print $NF}')
+  printf "IRQ %s -> CPUs %s -> Device %s\n" "$irq" "$cpus" "$device"
+done
+```
+
+
+{{% notice Note %}}
+The output should look similar to this:
+```
+IRQ 104 -> CPUs 12 -> Device ens34-Tx-Rx-5
+IRQ 105 -> CPUs 5 -> Device ens34-Tx-Rx-6
+IRQ 106 -> CPUs 10 -> Device ens34-Tx-Rx-7
+IRQ 11 -> CPUs 0-15 -> Device
+IRQ 14 -> CPUs 0-15 -> Device ttyS0
+IRQ 17 -> CPUs 0-15 -> Device ACPI:Ged
+IRQ 19 -> CPUs 0-15 -> Device ACPI:Ged
+IRQ 2 -> CPUs 0-15 -> Device
+IRQ 20 -> CPUs 0-15 -> Device ACPI:Ged
+IRQ 21 -> CPUs 0-15 -> Device ACPI:Ged
+...
+IRQ 26 -> CPUs 0-15 -> Device ACPI:Ged
+```
+{{% /notice %}}
+
+You might notice that, by default, several NIC IRQs are assigned to the same CPU, as in this example:
+```
+IRQ 100 -> CPUs 2 -> Device ens34-Tx-Rx-1
+IRQ 101 -> CPUs 12 -> Device ens34-Tx-Rx-2
+IRQ 102 -> CPUs 14 -> Device ens34-Tx-Rx-3
+IRQ 103 -> CPUs 9 -> Device ens34-Tx-Rx-4
+IRQ 104 -> CPUs 12 -> Device ens34-Tx-Rx-5
+IRQ 105 -> CPUs 5 -> Device ens34-Tx-Rx-6
+IRQ 106 -> CPUs 10 -> Device ens34-Tx-Rx-7
+```
+This can potentially hurt performance. Suggestions and patterns to experiment with are covered in the next step.
+
+### Reset
+
+If performance degrades, you can return the IRQs to their defaults using the following commands:
+
+```
+sudo systemctl unmask irqbalance
+sudo systemctl enable --now irqbalance
+```
+
+Or run the following:
+
+```
+DEF=$(cat /proc/irq/default_smp_affinity)
+for f in /proc/irq/*/smp_affinity; do
+  echo "$DEF" | sudo tee "$f" >/dev/null || true
+done
+```
+
+### Saving these changes
+
+Any changes you make to IRQs are reset at reboot. You will need to change your system's settings to make them permanent.
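
One common approach is a small systemd oneshot unit that reapplies your affinity settings at boot. The unit below is a minimal sketch; `/usr/local/sbin/set-irq-affinity.sh` is a hypothetical path standing in for wherever you save your chosen pattern script, and the approach assumes `irqbalance` stays disabled so it cannot overwrite your layout:

```
# /etc/systemd/system/irq-affinity.service (illustrative example)
[Unit]
Description=Apply custom NIC IRQ affinity at boot
After=network-online.target

[Service]
Type=oneshot
# Hypothetical path: point this at your own affinity script
ExecStart=/usr/local/sbin/set-irq-affinity.sh

[Install]
WantedBy=multi-user.target
```

Enable it with `sudo systemctl daemon-reload && sudo systemctl enable irq-affinity.service`.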
diff --git a/content/learning-paths/servers-and-cloud-computing/irq-tuning-guide/conclusion.md b/content/learning-paths/servers-and-cloud-computing/irq-tuning-guide/conclusion.md
new file mode 100644
index 0000000000..c25a4ceb1c
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/irq-tuning-guide/conclusion.md
@@ -0,0 +1,17 @@
+---
+title: Conclusion
+weight: 4
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+No single pattern works for all workloads. Our testing found that, under heavy network load, different patterns performed better depending on instance size.
+
+### 16 vCPUs and under
+For best performance, concentrate the NIC IRQs on one or two cores. Otherwise, the random and default patterns performed second best.
+
+If the number of NIC IRQs is greater than the number of vCPUs, concentrating them on fewer cores improved performance significantly.
+
+### Over 16 vCPUs
+No pattern showed significant improvement over the default, as long as no two NIC IRQs shared a core.
diff --git a/content/learning-paths/servers-and-cloud-computing/irq-tuning-guide/patterns.md b/content/learning-paths/servers-and-cloud-computing/irq-tuning-guide/patterns.md
new file mode 100644
index 0000000000..285819a065
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/irq-tuning-guide/patterns.md
@@ -0,0 +1,60 @@
+---
+title: Patterns
+weight: 3
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+The following patterns were run on multiple clouds and on a variety of instance sizes. A recommended IRQ pattern is suggested at the end. Based on your workload, a different pattern may result in higher performance.
+
+### Patterns
+1. Default: the IRQ pattern provided at boot.
+2. Random: all IRQs are assigned a core and do not overlap with network IRQs.
+3. Housekeeping: all IRQs other than the network IRQs are assigned to specific core(s).
+4. Ranges: NIC IRQs are set to single cores or ranges of cores, including pairs. For example: 1, 1-2, 0-3, 0-7, [0-1, 2-3, ...], and so on.
+
+
+### Scripts to change IRQs
+
+To change the NIC IRQs, or IRQs in general, you can use the following scripts.
+
+Here is a housekeeping pattern example; extend it to cover the other IRQs on your system:
+
+```
+HOUSEKEEP=  # housekeeping core range, for example 0-1
+
+# ACPI:Ged
+for irq in $(awk '/ACPI:Ged/ {sub(":","",$1); print $1}' /proc/interrupts); do
+  echo $HOUSEKEEP | sudo tee /proc/irq/$irq/smp_affinity_list >/dev/null
+done
+```
+
+The next script assigns pairs of cores on a 16 vCPU machine; you will need the interface name.
+
+```
+IFACE=  # interface name, for example ens34
+
+PAIRS=("0,1" "2,3" "4,5" "6,7" "8,9" "10,11" "12,13" "14,15")
+
+# Match IRQs for the NIC
+mapfile -t irqs < <(grep "$IFACE-Tx-Rx" /proc/interrupts | awk '{gsub(":","",$1); print $1}')
+
+i=0
+for irq in "${irqs[@]}"; do
+  pair=${PAIRS[$((i % ${#PAIRS[@]}))]}
+  echo "$pair" | sudo tee /proc/irq/$irq/smp_affinity_list >/dev/null
+  echo "Set IRQ $irq -> CPUs $pair"
+  ((i++))
+done
+```
+
+This assigns a specific core range to the NIC IRQs only:
+
+```
+IFACE=  # interface name, for example ens34
+CORES=0-15  # core range to assign
+
+# Note: $IFACE does not expand inside a single-quoted awk program,
+# so match the interface with grep instead.
+for irq in $(grep "$IFACE" /proc/interrupts | awk '{gsub(":","",$1); print $1}'); do
+  echo "$CORES" | sudo tee /proc/irq/$irq/smp_affinity_list > /dev/null
+done
+```
\ No newline at end of file
diff --git a/content/learning-paths/servers-and-cloud-computing/java-on-azure/_index.md b/content/learning-paths/servers-and-cloud-computing/java-on-azure/_index.md
new file mode 100644
index 0000000000..ff29c8cfcf
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/java-on-azure/_index.md
@@ -0,0 +1,64 @@
+---
+title: Deploy Java applications on Microsoft Azure Cobalt 100 processors
+
+draft: true
+cascade:
+  draft: true
+
+minutes_to_complete: 30
+
+who_is_this_for: This Learning Path introduces Java deployment on Microsoft Azure Cobalt 100 (Arm-based) virtual machines. It is designed for developers migrating Java applications from x86_64 to Arm with minimal or no changes.
+
+learning_objectives:
+  - Provision an Azure Arm64 virtual machine using the Azure console, with Ubuntu Pro 24.04 LTS as the base image.
+  - Deploy Java on the Ubuntu Pro virtual machine.
+  - Perform Java baseline testing and benchmarking on both x86_64 and Arm64 virtual machines.
+
+prerequisites:
+  - A [Microsoft Azure](https://azure.microsoft.com/) account with access to Cobalt 100 based instances (Dpsv6).
+  - Basic understanding of the Linux command line.
+  - Familiarity with the [Java platform](https://openjdk.org/) and deployment practices on Arm64 platforms.
+
+
+author: Jason Andrews
+
+### Tags
+skilllevels: Advanced
+subjects: Performance and Architecture
+cloud_service_providers: Microsoft Azure
+
+armips:
+  - Neoverse
+
+tools_software_languages:
+  - Java
+  - JMH
+
+operatingsystems:
+  - Linux
+
+further_reading:
+  - resource:
+      title: Azure Virtual Machines documentation
+      link: https://learn.microsoft.com/en-us/azure/virtual-machines/
+      type: documentation
+  - resource:
+      title: Azure Container Instances documentation
+      link: https://learn.microsoft.com/en-us/azure/container-instances/
+      type: documentation
+  - resource:
+      title: Java on Azure
+      link: https://learn.microsoft.com/en-us/java/azure/
+      type: documentation
+  - resource:
+      title: JMH (Java Microbenchmark Harness) documentation
+      link: https://openjdk.org/projects/code-tools/jmh/
+      type: documentation
+
+
+### FIXED, DO NOT MODIFY
+# ================================================================================
+weight: 1 # _index.md always has weight of 1 to order correctly
+layout: "learningpathall" # All files under learning paths have this same wrapper
+learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content.
+---
diff --git a/content/learning-paths/servers-and-cloud-computing/java-on-azure/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/java-on-azure/_next-steps.md
new file mode 100644
index 0000000000..c3db0de5a2
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/java-on-azure/_next-steps.md
@@ -0,0 +1,8 @@
+---
+# ================================================================================
+# FIXED, DO NOT MODIFY THIS FILE
+# ================================================================================
+weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation.
+title: "Next Steps" # Always the same, html page title.
+layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing.
+---
diff --git a/content/learning-paths/servers-and-cloud-computing/java-on-azure/background.md b/content/learning-paths/servers-and-cloud-computing/java-on-azure/background.md
new file mode 100644
index 0000000000..32b9847d65
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/java-on-azure/background.md
@@ -0,0 +1,20 @@
+---
+title: "Overview"
+
+weight: 2
+
+layout: "learningpathall"
+---
+
+## Cobalt 100 Arm-based processor
+
+Azure’s Cobalt 100 virtual machines are built on Microsoft's first-generation, in-house Arm-based processor: the Cobalt 100. Designed entirely by Microsoft and based on Arm’s Neoverse N2 architecture, this 64-bit CPU delivers improved performance and energy efficiency across a broad spectrum of cloud-native, scale-out Linux workloads. These include web and application servers, data analytics, open-source databases, caching systems, and more. Running at 3.4 GHz, the Cobalt 100 processor allocates a dedicated physical core for each vCPU, ensuring consistent and predictable performance.
+
+To learn more about Cobalt 100, refer to the blog [Announcing the preview of new Azure virtual machine based on the Azure Cobalt 100 processor](https://techcommunity.microsoft.com/blog/azurecompute/announcing-the-preview-of-new-azure-vms-based-on-the-azure-cobalt-100-processor/4146353).
+
+## Java
+Java is a high-performance, open-source, object-oriented programming language and runtime environment widely used for building scalable, reliable, and secure applications.
+
+It enables developers to write code once and run it anywhere, thanks to the Java Virtual Machine (JVM), which abstracts away hardware and operating system differences. Java applications are compiled into bytecode, which the JVM executes, providing portability and performance across platforms.
+
+Java is extensively used in enterprise systems, cloud-native applications, Android development, big data processing, and high-performance computing. Learn more from the [OpenJDK official website](https://openjdk.org/) and its [official documentation](https://docs.oracle.com/en/java/).
diff --git a/content/learning-paths/servers-and-cloud-computing/java-on-azure/baseline.md b/content/learning-paths/servers-and-cloud-computing/java-on-azure/baseline.md
new file mode 100644
index 0000000000..8625a0c451
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/java-on-azure/baseline.md
@@ -0,0 +1,63 @@
+---
+title: Java Baseline Testing
+weight: 5
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+
+### Deploy a Java application with a Tomcat-like operation
+Apache Tomcat is a Java-based web application server (technically, a Servlet container) that executes Java web applications.
It's widely used to host Java servlets, JSP (JavaServer Pages),
+and RESTful APIs written in Java.
+
+The Java class below simulates the generation of a basic HTTP response and measures the time taken to construct it, mimicking a lightweight Tomcat-like operation. Measuring how long it
+takes to build the response string helps evaluate raw Java execution efficiency before deploying heavier frameworks like Tomcat.
+
+Create a file named `HttpSingleRequestTest.java` and add the following content to it:
+
+```java
+public class HttpSingleRequestTest {
+    public static void main(String[] args) {
+        long startTime = System.nanoTime();
+        String response = generateHttpResponse("Tomcat baseline test on Arm64");
+        long endTime = System.nanoTime();
+        double durationInMicros = (endTime - startTime) / 1_000.0;
+        System.out.println("Response Generated:\n" + response);
+        System.out.printf("Response generation took %.2f microseconds.%n", durationInMicros);
+    }
+    private static String generateHttpResponse(String body) {
+        return "HTTP/1.1 200 OK\r\n" +
+               "Content-Type: text/plain\r\n" +
+               "Content-Length: " + body.length() + "\r\n\r\n" +
+               body;
+    }
+}
+```
+Compile and run the Java program:
+
+```console
+javac HttpSingleRequestTest.java
+java -Xms128m -Xmx256m -XX:+UseG1GC HttpSingleRequestTest
+```
+
+- `-Xms128m` sets the initial heap size for the Java Virtual Machine to 128 MB.
+- `-Xmx256m` sets the maximum heap size for the JVM to 256 MB.
+- `-XX:+UseG1GC` enables the G1 Garbage Collector (Garbage First GC), designed for low pause times and better performance with large heaps.
+
+You should see an output similar to:
+```output
+java -Xms128m -Xmx256m -XX:+UseG1GC HttpSingleRequestTest
+Response Generated:
+HTTP/1.1 200 OK
+Content-Type: text/plain
+Content-Length: 29
+
+Tomcat baseline test on Arm64
+Response generation took 12901.53 microseconds.
+```
+Output summary:
+
+- The program generated a mock HTTP 200 OK response with a custom message.
+- It then measured and printed the time taken to generate that response (12901.53 microseconds in the example above; your timing will vary).
+- This serves as a basic baseline performance test of string formatting and memory handling on the JVM running on an Azure Arm64 instance.
diff --git a/content/learning-paths/servers-and-cloud-computing/java-on-azure/benchmarking.md b/content/learning-paths/servers-and-cloud-computing/java-on-azure/benchmarking.md
new file mode 100644
index 0000000000..cf4105f0cc
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/java-on-azure/benchmarking.md
@@ -0,0 +1,223 @@
+---
+title: Benchmarking via JMH
+weight: 6
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+Now that you've built and run the Tomcat-like baseline test, you can use it to measure JVM performance with JMH, and to compare Cobalt 100 instances against similar D-series x86_64 instances.
+## Run the performance tests using JMH
+
+JMH (Java Microbenchmark Harness) is a Java benchmarking framework developed by the JVM team at Oracle to measure the performance of small code snippets with high precision. It accounts for JVM optimizations like JIT and warm-up to ensure accurate and reproducible results. It measures throughput, average latency, or execution time.
Below steps help benchmark the Tomcat-like operation: + + +Install Maven: + +```console +sudo apt install maven -y +``` +Create Benchmark Project: + +```console +mvn archetype:generate \ + -DinteractiveMode=false \ + -DarchetypeGroupId=org.openjdk.jmh \ + -DarchetypeArtifactId=jmh-java-benchmark-archetype \ + -DarchetypeVersion=1.37 \ + -DgroupId=com.example \ + -DartifactId=jmh-benchmark \ + -Dversion=1.0 +cd jmh-benchmark +``` + +Edit the `src/main/java/com/example/MyBenchmark.java` file and add the below code on it: + +```java +package com.example; + +import org.openjdk.jmh.annotations.Benchmark; + +public class MyBenchmark { + + @Benchmark + public void benchmarkHttpResponse() { + String body = "Benchmarking a Tomcat-like operation"; + StringBuilder sb = new StringBuilder(); + sb.append("HTTP/1.1 200 OK\r\n"); + sb.append("Content-Type: text/plain\r\n"); + sb.append("Content-Length: ").append(body.length()).append("\r\n\r\n"); + sb.append(body); + + // Prevent dead-code elimination + if (sb.length() == 0) { + throw new RuntimeException(); + } + } +} +``` +This simulates HTTP response generation similar to Tomcat. + +Build the Benchmark: + +```console +mvn clean install +``` + +After the build is complete, the JMH benchmark jar will be in the target/ directory. + +Run the Benchmark: + +```console +java -jar target/benchmarks.jar +``` + +You should see an output similar to: +```output +# JMH version: 1.37 +# VM version: JDK 21.0.8, OpenJDK 64-Bit Server VM, 21.0.8+9-Ubuntu-0ubuntu124.04.1 +# VM invoker: /usr/lib/jvm/java-21-openjdk-arm64/bin/java +# VM options: +# Blackhole mode: compiler (auto-detected, use -Djmh.blackhole.autoDetect=false to disable) +# Warmup: 5 iterations, 10 s each +# Measurement: 5 iterations, 10 s each +# Timeout: 10 min per iteration +# Threads: 1 thread, will synchronize iterations +# Benchmark mode: Throughput, ops/time +# Benchmark: com.example.MyBenchmark.benchmarkHttpResponse + +# Run progress: 0.00% complete, ETA 00:08:20 +# Fork: 1 of 5 +# Warmup Iteration 1: 33509694.060 ops/s +# Warmup Iteration 2: 36783933.354 ops/s +# Warmup Iteration 3: 35202103.615 ops/s +# Warmup Iteration 4: 36493073.361 ops/s +# Warmup Iteration 5: 36470050.153 ops/s +Iteration 1: 35188405.658 ops/s +Iteration 2: 35011856.616 ops/s +Iteration 3: 36282916.441 ops/s +Iteration 4: 34558682.952 ops/s +Iteration 5: 34878375.325 ops/s + +# Run progress: 20.00% complete, ETA 00:06:41 +# Fork: 2 of 5 +# Warmup Iteration 1: 33055148.091 ops/s +# Warmup Iteration 2: 36374390.556 ops/s +# Warmup Iteration 3: 35020852.850 ops/s +# Warmup Iteration 4: 36463924.398 ops/s +# Warmup Iteration 5: 35116009.523 ops/s +Iteration 1: 36604427.854 ops/s +Iteration 2: 35151064.855 ops/s +Iteration 3: 35171529.012 ops/s +Iteration 4: 35092144.416 ops/s +Iteration 5: 36670199.634 ops/s + +# Run progress: 40.00% complete, ETA 00:05:00 +# Fork: 3 of 5 +# Warmup Iteration 1: 34021525.130 ops/s +# Warmup Iteration 2: 35796028.914 ops/s +# Warmup Iteration 3: 36813541.649 ops/s +# Warmup Iteration 4: 34424554.094 ops/s +# Warmup Iteration 5: 35100074.155 ops/s +Iteration 1: 33533209.090 ops/s +Iteration 2: 34755031.947 ops/s +Iteration 3: 36463135.748 ops/s +Iteration 4: 34961009.997 ops/s +Iteration 5: 36496001.612 ops/s + +# Run progress: 60.00% complete, ETA 00:03:20 +# Fork: 4 of 5 +# Warmup Iteration 1: 33393091.940 ops/s +# Warmup Iteration 2: 35235407.288 ops/s +# Warmup Iteration 3: 36203077.665 ops/s +# Warmup Iteration 4: 34580888.238 ops/s +# Warmup Iteration 5: 35984836.776 ops/s +Iteration 1: 
34896194.779 ops/s +Iteration 2: 36479405.215 ops/s +Iteration 3: 35010049.135 ops/s +Iteration 4: 36277296.075 ops/s +Iteration 5: 36340953.266 ops/s + +# Run progress: 80.00% complete, ETA 00:01:40 +# Fork: 5 of 5 +# Warmup Iteration 1: 35482444.435 ops/s +# Warmup Iteration 2: 37116032.766 ops/s +# Warmup Iteration 3: 35389871.716 ops/s +# Warmup Iteration 4: 36814888.849 ops/s +# Warmup Iteration 5: 35462220.484 ops/s +Iteration 1: 36896452.473 ops/s +Iteration 2: 35362724.405 ops/s +Iteration 3: 36992383.389 ops/s +Iteration 4: 35535471.437 ops/s +Iteration 5: 36881529.760 ops/s + + +Result "com.example.MyBenchmark.benchmarkHttpResponse": + 35659618.044 ±(99.9%) 686946.011 ops/s [Average] + (min, avg, max) = (33533209.090, 35659618.044, 36992383.389), stdev = 917053.272 + CI (99.9%): [34972672.032, 36346564.055] (assumes normal distribution) + + +# Run complete. Total time: 00:08:21 + +REMEMBER: The numbers below are just data. To gain reusable insights, you need to follow up on +why the numbers are the way they are. Use profilers (see -prof, -lprof), design factorial +experiments, perform baseline and negative tests that provide experimental control, make sure +the benchmarking environment is safe on JVM/OS/HW level, ask for reviews from the domain experts. +Do not assume the numbers tell you what you want them to tell. + +NOTE: Current JVM experimentally supports Compiler Blackholes, and they are in use. Please exercise +extra caution when trusting the results, look into the generated code to check the benchmark still +works, and factor in a small probability of new VM bugs. Additionally, while comparisons between +different JVMs are already problematic, the performance difference caused by different Blackhole +modes can be very significant. Please make sure you use the consistent Blackhole mode for comparisons. + +Benchmark Mode Cnt Score Error Units +MyBenchmark.benchmarkHttpResponse thrpt 25 35659618.044 ± 686946.011 ops/s +``` + +### Benchmark Metrics Explained + +- **Run Count**: The total number of benchmark iterations executed. A higher run count increases statistical reliability and reduces the effect of outliers. +- **Average Throughput**: The mean number of operations executed per second across all iterations. This metric represents the overall sustained performance of the benchmarked workload. +- **Standard Deviation**: Indicates the amount of variation or dispersion from the average throughput. A smaller standard deviation means more consistent performance. +- **Confidence Interval (99.9%)**: The statistical range within which the true average throughput is expected to fall, with 99.9% certainty. Narrow intervals imply more reliable results. +- **Min Throughput**: The lowest throughput observed across all iterations, reflecting the worst-case performance scenario. +- **Max Throughput**: The highest throughput observed across all iterations, reflecting the best-case performance scenario. + +### Benchmark summary on Arm64 + +Here is a summary of benchmark results collected on an Arm64 **D4ps_v6 Ubuntu Pro 24.04 LTS virtual machine**. 
+| Metric                          | Value                     |
+|---------------------------------|---------------------------|
+| **Java Version**                | OpenJDK 21.0.8            |
+| **Run Count**                   | 25 iterations             |
+| **Average Throughput**          | 35.66M ops/sec            |
+| **Standard Deviation**          | ±0.92M ops/sec            |
+| **Confidence Interval (99.9%)** | [34.97M, 36.34M] ops/sec  |
+| **Min Throughput**              | 33.53M ops/sec            |
+| **Max Throughput**              | 36.99M ops/sec            |
+
+### Benchmark summary on x86
+
+Here is a summary of the benchmark results collected on an x86 **D4s_v6 Ubuntu Pro 24.04 LTS virtual machine**.
+
+| Metric                          | Value                     |
+|---------------------------------|---------------------------|
+| **Java Version**                | OpenJDK 21.0.8            |
+| **Run Count**                   | 25 iterations             |
+| **Average Throughput**          | 16.78M ops/sec            |
+| **Standard Deviation**          | ±0.06M ops/sec            |
+| **Confidence Interval (99.9%)** | [16.74M, 16.83M] ops/sec  |
+| **Min Throughput**              | 16.64M ops/sec            |
+| **Max Throughput**              | 16.88M ops/sec            |
+
+
+### Benchmark comparison insights
+When comparing the results on the Arm64 and x86_64 virtual machines:
+
+- **Higher throughput on Arm64:** The Arm64 VM achieved an average of **35.66M ops/sec**, with peak performance reaching **36.99M ops/sec**, roughly 2.1x the **16.78M ops/sec** average measured on the x86 VM.
+- **Stable performance:** The Arm64 results show a standard deviation of **±0.92M ops/sec** and are tightly bounded within the 99.9% confidence interval **[34.97M, 36.34M]**.
+- **Consistent efficiency:** The results demonstrate the reliability of the Arm64 architecture for sustaining high-throughput Java workloads on Azure Ubuntu Pro environments.
+
+You have now benchmarked Java on an Azure Cobalt 100 Arm64 virtual machine and compared results with x86_64.
diff --git a/content/learning-paths/servers-and-cloud-computing/java-on-azure/create-instance.md b/content/learning-paths/servers-and-cloud-computing/java-on-azure/create-instance.md
new file mode 100644
index 0000000000..9571395aa2
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/java-on-azure/create-instance.md
@@ -0,0 +1,50 @@
+---
+title: Create an Arm-based cloud virtual machine using the Microsoft Cobalt 100 CPU
+weight: 3
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## Introduction
+
+There are several ways to create an Arm-based Cobalt 100 virtual machine: the Microsoft Azure console, the Azure CLI tool, or your choice of IaC (Infrastructure as Code) tooling. This guide uses the Azure console to create a virtual machine with the Arm-based Cobalt 100 processor.
+
+This learning path focuses on the general-purpose D-series virtual machines. For details, read the Microsoft Azure guide on the [Dpsv6 size series](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes/general-purpose/dpsv6-series).
+
+If you have never used Microsoft Azure before, review the Microsoft [guide to create a Linux virtual machine in the Azure portal](https://learn.microsoft.com/en-us/azure/virtual-machines/linux/quick-create-portal?tabs=ubuntu).
+
+#### Create an Arm-based Azure Virtual Machine
+
+Creating a virtual machine based on Azure Cobalt 100 is no different from creating any other virtual machine in Azure. To create an Azure virtual machine, launch the Azure portal and navigate to "Virtual Machines".
+1. Select "Create", and click on "Virtual Machine" from the drop-down list.
+2. Inside the "Basics" tab, fill in the Instance details such as "Virtual machine name" and "Region".
+3. Choose the image for your virtual machine (for example, Ubuntu Pro 24.04 LTS) and select “Arm64” as the VM architecture.
+4. In the “Size” field, click on “See all sizes” and select the D-Series v6 family of virtual machines. Select “D4ps_v6” from the list.
+
+![Azure portal VM creation — Azure Cobalt 100 Arm64 virtual machine (D4ps_v6) alt-text#center](images/instance.png "Figure 1: Select the D-Series v6 family of virtual machines")
+
+5. Select "SSH public key" as the Authentication type. Azure will automatically generate an SSH key pair for you and allow you to store it for future use. It is a fast, simple, and secure way to connect to your virtual machine.
+6. Fill in the Administrator username for your VM.
+7. Select "Generate new key pair", and select "RSA SSH Format" as the SSH Key Type. RSA offers better security with keys longer than 3072 bits. Give your SSH key a name in the "Key pair name" field.
+8. Under "Inbound port rules", select HTTP (80) and SSH (22) as the inbound ports.
+
+![Azure portal VM creation — Azure Cobalt 100 Arm64 virtual machine (D4ps_v6) alt-text#center](images/instance1.png "Figure 2: Allow inbound port rules")
+
+9. Click on the "Review + Create" tab and review the configuration for your virtual machine. It should look like the following:
+
+![Azure portal VM creation — Azure Cobalt 100 Arm64 virtual machine (D4ps_v6) alt-text#center](images/ubuntu-pro.png "Figure 3: Review and Create an Azure Cobalt 100 Arm64 VM")
+
+10. Finally, when you are confident about your selection, click on the "Create" button, and then click on the "Download Private key and Create Resources" button.
+
+![Azure portal VM creation — Azure Cobalt 100 Arm64 virtual machine (D4ps_v6) alt-text#center](images/instance4.png "Figure 4: Download Private key and Create Resources")
+
+11. Your virtual machine should be ready and running within a few minutes. You can SSH into the virtual machine using the downloaded private key and the VM's public IP address.
+
+![Azure portal VM creation — Azure Cobalt 100 Arm64 virtual machine (D4ps_v6) alt-text#center](images/final-vm.png "Figure 5: VM deployment confirmation in Azure portal")
+
+{{% notice Note %}}
+
+To learn more about Arm-based virtual machines in Azure, refer to “Getting Started with Microsoft Azure” in [Get started with Arm-based cloud instances](/learning-paths/servers-and-cloud-computing/csp/azure).
+
+{{% /notice %}}
diff --git a/content/learning-paths/servers-and-cloud-computing/java-on-azure/deploy.md b/content/learning-paths/servers-and-cloud-computing/java-on-azure/deploy.md
new file mode 100644
index 0000000000..0a0096f224
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/java-on-azure/deploy.md
@@ -0,0 +1,64 @@
+---
+title: Install Java
+weight: 4
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+
+
+## Java Installation on the Azure Ubuntu Pro virtual machine
+Install Java on the Ubuntu Pro virtual machine by updating the system and installing `default-jdk`, which includes both the JRE and the JDK. Verify the installation using `java -version` and `javac -version`, then set the `JAVA_HOME` environment variable for Arm-based systems.
+
+
+### Install Java
+
+```console
+sudo apt update
+sudo apt install -y default-jdk
+```
+
+`default-jdk` installs both the default JRE and JDK provided by the Azure Ubuntu Pro machine.
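+
+If your workload needs a specific Java version rather than the distribution default, you can install a versioned OpenJDK package instead. A minimal sketch, assuming the `openjdk-21-jdk` package name used by Ubuntu 24.04:
+
+```console
+sudo apt install -y openjdk-21-jdk
+update-java-alternatives -l
+```
+
+The second command lists the JDK installations registered on the system, which is a quick way to confirm what was installed.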
+
+Check to ensure that the JRE is properly installed:
+
+```console
+java -version
+```
+
+You should see output similar to:
+
+```output
+openjdk version "21.0.8" 2025-07-15
+OpenJDK Runtime Environment (build 21.0.8+9-Ubuntu-0ubuntu124.04.1)
+OpenJDK 64-Bit Server VM (build 21.0.8+9-Ubuntu-0ubuntu124.04.1, mixed mode, sharing)
+```
+
+Check to ensure that the JDK is properly installed:
+
+```console
+javac -version
+```
+You should see output similar to:
+
+```output
+javac 21.0.8
+```
+
+Set the Java environment variables for Arm and persist them across sessions:
+
+```console
+echo 'export JAVA_HOME=/usr/lib/jvm/java-21-openjdk-arm64' >> ~/.bashrc
+echo 'export PATH=$JAVA_HOME/bin:$PATH' >> ~/.bashrc
+source ~/.bashrc
+```
+
+{{% notice Note %}}
+Ubuntu Pro 24.04 LTS provides JDK version 21.0.8 by default. It is important to ensure that your version of OpenJDK for Arm is 11.0.9 or later: there is a large performance gap between OpenJDK 11.0.8 and OpenJDK 11.0.9, because a patch added in 11.0.9 reduces false-sharing cache contention.
+For more information, you can view this [Arm community blog](https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/java-performance-on-neoverse-n1).
+
+The [Arm Ecosystem Dashboard](https://developer.arm.com/ecosystem-dashboard/) also lists OpenJDK 11.0.9 as the minimum recommended version on Arm platforms.
+{{% /notice %}}
+
+Java installation is complete. You can now proceed with the baseline testing.
diff --git a/content/learning-paths/servers-and-cloud-computing/java-on-azure/images/final-vm.png b/content/learning-paths/servers-and-cloud-computing/java-on-azure/images/final-vm.png
new file mode 100644
index 0000000000..5207abfb41
Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/java-on-azure/images/final-vm.png differ
diff --git a/content/learning-paths/servers-and-cloud-computing/java-on-azure/images/instance.png b/content/learning-paths/servers-and-cloud-computing/java-on-azure/images/instance.png
new file mode 100644
index 0000000000..285cd764a5
Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/java-on-azure/images/instance.png differ
diff --git a/content/learning-paths/servers-and-cloud-computing/java-on-azure/images/instance1.png b/content/learning-paths/servers-and-cloud-computing/java-on-azure/images/instance1.png
new file mode 100644
index 0000000000..b9d22c352d
Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/java-on-azure/images/instance1.png differ
diff --git a/content/learning-paths/servers-and-cloud-computing/java-on-azure/images/instance4.png b/content/learning-paths/servers-and-cloud-computing/java-on-azure/images/instance4.png
new file mode 100644
index 0000000000..2a0ff1e3b0
Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/java-on-azure/images/instance4.png differ
diff --git a/content/learning-paths/servers-and-cloud-computing/java-on-azure/images/ubuntu-pro.png b/content/learning-paths/servers-and-cloud-computing/java-on-azure/images/ubuntu-pro.png
new file mode 100644
index 0000000000..d54bd75ca6
Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/java-on-azure/images/ubuntu-pro.png differ
diff --git a/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_index.md b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_index.md
index b675188fa3..abf46482f9 100644
--- a/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_index.md
+++ b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_index.md
@@ -24,8 +24,8 @@ armips:
   - Neoverse
 tools_software_languages:
-  - OpenJDK-21
-  - Tomcat
+  - OpenJDK 21
+  - Apache Tomcat
   - async-profiler
   - FlameGraph
   - wrk2
diff --git a/content/learning-paths/servers-and-cloud-computing/kafka/_index.md b/content/learning-paths/servers-and-cloud-computing/kafka/_index.md
index 561092ebed..f2cafd65d5 100644
--- a/content/learning-paths/servers-and-cloud-computing/kafka/_index.md
+++ b/content/learning-paths/servers-and-cloud-computing/kafka/_index.md
@@ -26,7 +26,7 @@ operatingsystems:
   - Linux
 tools_software_languages:
   - Kafka
-  - Zookeeper
+  - ZooKeeper
 
 further_reading:
diff --git a/content/learning-paths/servers-and-cloud-computing/lambda_functions/_index.md b/content/learning-paths/servers-and-cloud-computing/lambda_functions/_index.md
index 1765f5b84d..a5c8b07ca1 100644
--- a/content/learning-paths/servers-and-cloud-computing/lambda_functions/_index.md
+++ b/content/learning-paths/servers-and-cloud-computing/lambda_functions/_index.md
@@ -22,8 +22,7 @@ armips:
   - Neoverse
 tools_software_languages:
   - Terraform
-  - Lambda
-  - Coding
+  - AWS Lambda
 operatingsystems:
   - Linux
diff --git a/content/learning-paths/servers-and-cloud-computing/llama-cpu/_index.md b/content/learning-paths/servers-and-cloud-computing/llama-cpu/_index.md
index 75904fd24f..f9a02504ab 100644
--- a/content/learning-paths/servers-and-cloud-computing/llama-cpu/_index.md
+++ b/content/learning-paths/servers-and-cloud-computing/llama-cpu/_index.md
@@ -27,7 +27,7 @@ operatingsystems:
   - Linux
 tools_software_languages:
   - LLM
-  - GenAI
+  - Generative AI
   - Python
   - Demo
   - Hugging Face
diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Analyzing_token_generation_at_Prefill_and_Decode_stage.md b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Analyzing_token_generation_at_Prefill_and_Decode_stage.md
new file mode 100644
index 0000000000..3773982b6f
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Analyzing_token_generation_at_Prefill_and_Decode_stage.md
@@ -0,0 +1,204 @@
+---
+title: Analyze token generation at Prefill and Decode stage
+weight: 4
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+# Analyze token generation at Prefill and Decode stage
+To visualize token generation at the Prefill and Decode stages, the Annotation Marker feature of Streamline is used, and the marker-generation code is integrated into the llama.cpp project.
+You can find more information about the Annotation Marker feature here: https://developer.arm.com/documentation/101816/9-7/Annotate-your-code?lang=en.
+
+## Steps of llama.cpp integration and Streamline setup
+
+### Step 1: Build Streamline Annotation library
+Install Arm Development Studio (ArmDS) or Arm Streamline on your host PC first.
+You can find the Streamline annotation support code in the installation directory, for example *"Arm\Development Studio 2024.1\sw\streamline\gator\annotate"*.
+You can also get the annotation support code from https://github.com/ARM-software/gator/tree/main; download the code that matches the version of the Streamline tool on your host PC.
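+For example, you could fetch the annotation sources directly from the repository (a plain clone is shown here; check out the tag that corresponds to your Streamline release):
+
+```bash
+git clone https://github.com/ARM-software/gator.git
+cd gator/annotate
+```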
+
+Then you can build the Streamline annotation library by running:
+```bash
+make CROSS_COMPILE=/path/to/aarch64_linux_gcc_tool
+```
+
+for example:
+```bash
+make CROSS_COMPILE=./Work/arm-gnu-toolchain-13.3.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-
+```
+You can get the aarch64 GCC compiler toolchain here: https://developer.arm.com/downloads/-/arm-gnu-toolchain-downloads.
+
+The statically linked library, libstreamline_annotate.a, will be produced.
+
+### Step 2: Integrate Annotation Marker code to llama.cpp
+Download the llama.cpp code from https://github.com/ggml-org/llama.cpp/archive/refs/tags/b6202.tar.gz.
+Go to the llama.cpp root directory and create a directory named ‘streamline_annotation’:
+```bash
+cd ./llama.cpp
+mkdir streamline_annotation
+```
+
+Copy the library ‘libstreamline_annotate.a’ and the header file ‘streamline_annotate.h’ from Step 1 to the ‘streamline_annotation’ directory.
+
+To link the ‘libstreamline_annotate.a’ library when building llama-cli, change *llama.cpp/CMakeLists.txt* by adding the following lines:
+
+```makefile
+set(STREAMLINE_LIB_PATH ${CMAKE_SOURCE_DIR}/streamline_annotation/libstreamline_annotate.a)
+target_include_directories(llama-cli PRIVATE ${CMAKE_SOURCE_DIR}/streamline_annotation)
+target_link_libraries(${TARGET} PRIVATE ${STREAMLINE_LIB_PATH} )
+```
+
+To add Annotation Markers to llama-cli, change the llama-cli code *llama.cpp/tools/main/main.cpp* by adding:
+```c
+#include "streamline_annotate.h"
+```
+and then adding the Annotation Marker code to the 'main' function.
+
+First, add the Streamline annotation setup code after *common_init*:
+```c
+    common_init();
+
+    // Add the Annotation setup code
+    ANNOTATE_SETUP;
+
+```
+
+Then add the Annotation Marker generation code here:
+
+```c
+    for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
+        int n_eval = (int) embd.size() - i;
+        if (n_eval > params.n_batch) {
+            n_eval = params.n_batch;
+        }
+
+        LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
+
+        // Add annotation marker code for Streamline:
+        // record the input token position and the number of tokens in this batch
+        {
+            char printf_buf[200];
+            sprintf(printf_buf, "past %d, n_eval %d", n_past, n_eval);
+            ANNOTATE_MARKER_STR(printf_buf);
+        }
+        // End of annotation marker
+
+        if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
+            LOG_ERR("%s : failed to eval\n", __func__);
+            return 1;
+        }
+```
+
+A string is added to the Annotation Marker to record the position of the input tokens and the number of tokens to be processed.
+
+### Step 3: Build llama-cli executable
+For convenience, llama-cli is statically linked.
+
+First, create a new directory named ‘build’ under the llama.cpp root directory and go into it:
+```bash
+mkdir ./build && cd ./build
+```
+Then configure the project by running:
+```bash
+cmake .. -DCMAKE_SYSTEM_NAME=Linux -DCMAKE_SYSTEM_PROCESSOR=arm -DCMAKE_C_COMPILER=aarch64-none-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-none-linux-gnu-g++ -DLLAMA_NATIVE=OFF -DLLAMA_F16C=OFF -DLLAMA_GEMM_ARM=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_EXE_LINKER_FLAGS="-static -g" -DGGML_OPENMP=OFF -DCMAKE_C_FLAGS="-march=armv8.2-a+i8mm+dotprod -g" -DCMAKE_CXX_FLAGS="-march=armv8.2-a+dotprod+i8mm -g" -DGGML_CPU_KLEIDIAI=ON -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_CURL=OFF
+```
+
+Set CMAKE_C_COMPILER and CMAKE_CXX_COMPILER to your cross-compiler path. Make sure that “-march” in CMAKE_C_FLAGS and CMAKE_CXX_FLAGS matches your Arm CPU hardware.
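+
+Before setting the “-march” flags, you can verify which features the target CPU reports. On Arm64 Linux, the Dotprod and I8MM features appear as ‘asimddp’ and ‘i8mm’ in the kernel's CPU feature list:
+
+```bash
+grep -m1 Features /proc/cpuinfo
+```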
+
+In this guide, we run llama-cli on an Arm CPU that supports the NEON Dotprod and I8MM instructions, so ‘-march’ is specified as ‘armv8.2-a+dotprod+i8mm’. We also specify the ‘-static’ and ‘-g’ options so that the llama-cli executable is statically linked and includes debug info. This makes source-code and function-level profiling easier, and it lets the llama-cli executable run on various versions of Arm64 Linux and Android.
+
+Now, build the project by running:
+```bash
+cmake --build ./ --config Release
+```
+
+After the build completes, you should find the llama-cli executable in the *./build/bin/* directory.
+
+### Step 4: Run llama-cli and analyze the data with Streamline
+Copy the following files to your Arm64 platform:
+* the llama-cli executable
+* the ‘gatord’ executable from the Arm DS or Streamline installation folder, such as *Arm\Development Studio 2024.1\sw\streamline\bin\linux\arm64* for Linux and *Arm\Development Studio 2024.1\sw\streamline\bin\android\arm64* for Android
+* the LLM model, Qwen1_5-0_5b-chat-q4_0.gguf
+
+Then run gatord on your Arm64 target:
+```bash
+./gatord
+```
+You should see messages similar to:
+
+```bash
+Streamline Data Recorder v9.4.0 (Build 9b1e8f8)
+Copyright (c) 2010-2024 Arm Limited. All rights reserved.
+Gator ready
+```
+
+Then launch the Streamline application on your host PC and connect to the gatord running on your Arm64 target over either a TCP or an ADB connection. You can select the PMU events to be monitored at this point.
+
+![text#center](images/streamline_capture.png "Figure 6. Streamline Start Capture")
+
+Set the path of the llama-cli executable for Streamline so that its debug info can be used for analysis.
+
+![text#center](images/streamline_capture_image.png "Figure 7. Streamline image path")
+
+Click the ‘Start Capture’ button in Streamline to start collecting data from the Arm64 target.
+
+*Note: This guide is not intended to introduce how to use Streamline. If you encounter any issues while setting up gatord or Streamline, please seek help from Arm support.*
+
+Now, run the llama-cli executable as below:
+
+```bash
+./llama-cli -m qwen1_5-0_5b-chat-q4_0.gguf -p "<|im_start|>system\nYou are a helpful AI assistant.<|im_end|>\n<|im_start|>user\nTell me a story about a fox and a crow? Please do not tell the traditional story in Aesop's fables. Please tell me a positive story about friendship and love. The story should have no more than 400 words<|im_end|>\n<|im_start|>assistant\n" -st -t 1
+```
+
+After a while, you can stop the data collection by clicking the ‘Stop’ button in Streamline. The Streamline tool on your host PC will then start analyzing the data.
+
+## Analyze the data with Streamline
+In the timeline view of Streamline, we can see some Annotation Markers. Since we add an Annotation Marker before the llama_decode function, each Annotation Marker marks the start time of a token generation.
+
+![text#center](images/annotation_marker_1.png "Figure 8. Annotation Marker")
+
+The string in an Annotation Marker is shown when you click the marker. For example:
+
+![text#center](images/annotation_marker_2.png "Figure 9. Annotation String")
+
+The number after ‘past’ indicates the position of the input tokens, and the number after ‘n_eval’ indicates the number of tokens to be processed this time.
+
+As shown in the timeline view below, with the help of Annotation Markers, we can clearly identify the Prefill and Decode stages.
+
+![text#center](images/annotation_marker_prefill.png "Figure 10. Annotation Marker at Prefill and Decode stage")
+
+By checking the string of the Annotation Marker, the first token generation at the Prefill stage has 'past 0, n_eval 78', which means that the position of the input tokens starts at 0 and there are 78 input tokens to be processed.
+We can see that the first token, generated at the Prefill stage, takes more time: since 78 input tokens have to be processed, the Prefill stage performs a large number of GEMM operations. At the Decode stage, tokens are generated one by one at a roughly constant speed, and each token takes less time than the first token of the Prefill stage, thanks to the KV cache. The Decode stage mainly performs GEMV operations.
+
+We can investigate this further with the PMU event counters captured by Streamline. At the Prefill stage, the amount of computation, indicated by the PMU event counters that count Advanced SIMD (NEON), floating-point, and integer data-processing instructions, is large, while memory access is relatively low. In particular, the number of L3 cache refills/misses is much lower than at the Decode stage.
+
+At the Decode stage, the amount of computation is relatively smaller (since each token takes less time), but the number of L3 cache refills/misses goes much higher.
+We can also monitor other PMU events: Backend Stall Cycles and Backend Stall Cycles due to Memory stall.
+
+![text#center](images/annotation_pmu_stall.png "Figure 11. Backend stall PMU event")
+
+We can see that at the Prefill stage, Backend Stall Cycles due to Memory stall are only about 10% of the total Backend Stall Cycles. However, at the Decode stage, they are around 50% of the total Backend Stall Cycles.
+All these PMU event counters indicate that the Prefill stage is compute-bound and the Decode stage is memory-bound.
+
+Now, let us profile the code execution further with Streamline. In the ‘Call Paths’ view of Streamline, we can see the percentage of running time of functions organized in the form of a call stack.
+
+![text#center](images/annotation_prefill_call_stack.png "Figure 12. Call stack")
+
+In the ‘Functions’ view of Streamline, we can see the overall percentage of running time of each function.
+
+![text#center](images/annotation_prefill_functions.png "Figure 13. Functions view")
+
+As we can see, the function graph_compute takes the largest portion of the running time. It shows that large amounts of GEMM and GEMV operations take most of the time. With the Qwen1_5-0_5b-chat-q4_0 model:
+* The computation (GEMM and GEMV) of the Q, K, and V vectors and of most FFN layers: their weights use the Q4_0 data type and the input activations use the FP32 data type. The computation is forwarded to the KleidiAI trait by *ggml_cpu_extra_compute_forward*. KleidiAI ukernels implemented with NEON Dotprod and I8MM vector instructions are used to accelerate the computation.
+  - At the Prefill stage, the *kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm* KleidiAI ukernel is used for GEMM (matrix multiply) operators. It takes advantage of the NEON I8MM instruction. Since the Prefill stage only takes a small percentage of the whole run time, the percentage of this function is small, as shown in the figures above. However, if we focus on the Prefill stage only, using the ‘Samplings’ view in the Timeline, we can see that *kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm* takes the largest portion of the whole Prefill stage.
+
+  ![text#center](images/Prefill_only.png "Figure 14. Prefill only view")
Prefill only view") + + - At Decode stage, *kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod* KleidiAI ukernel is used for GEMV operators. It takes advantage of NEON Dotprod instruction. If we focus on Decode stage only, we can see this function takes the second largest portion. + + ![text#center](images/Decode_only.png "Figure 15. Decode only view") + +* There is a result_output linear layer in Qwen1_5-0_5b-chat-q4_0 model, the wights are with Q6_K data type. The layer computes a huge [1, 1024] x [1024, 151936] GEMV operation, where 1024 is the embedding size and 151936 is the vocabulary size. This operation cannot be handled by KleidiAI yet, it is handled by the ggml_vec_dot_q6_K_q8_K function in ggml-cpu library. +* The tensor nodes for computation of Multi-Head attention are presented as three-dimension matrices with FP16 data type (KV cache also holds FP16 values), they are computed by ggml_vec_dot_f16 function in ggml-cpu library. +* The computation of RoPE, Softmax, RMSNorm layers does not take significant portion of the running time. diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Conclusion.md b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Conclusion.md new file mode 100644 index 0000000000..55adcb95bc --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Conclusion.md @@ -0,0 +1,13 @@ +--- +title: Conclusion +weight: 7 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +# Conclusion +By leveraging the Streamline tool together with a good understanding of the llama.cpp code, the execution process of the LLM model can be visualized, which helps analyze code efficiency and investigate potential optimization. + +Note that additional annotation code in llama.cpp and gatord might somehow affect the performance. + diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Deep_dive.md b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Deep_dive.md new file mode 100644 index 0000000000..3802be4996 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Deep_dive.md @@ -0,0 +1,132 @@ +--- +title: Deep dive into individual operator +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +# Deep dive into individual operator +This session provides a guide on how to use the Streamline Annotation Channel feature to analyze execution time of each node in the compute graph. +More information about Streamline Annotation Channel can be found here https://developer.arm.com/documentation/101816/9-7/Annotate-your-code/User-space-annotations/Group-and-Channel-annotations?lang=en + +## Integrate Annotation Channel code to llama.cpp +In llama.cpp project, tensor nodes in compute graph are computed by the function ggml_graph_compute_thread in CPU backend, *llama.cpp\ggml\src\ggml-cpu\ggml-cpu.c* +```c +for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { + struct ggml_tensor * node = cgraph->nodes[node_n]; + + ggml_compute_forward(¶ms, node); +``` +To monitor the execution time of each node, we create a annotation channel for each type of operators (such as GGML_OP_MUL_MAT, GGML_OP_SOFTMAX, GGML_OP_ROPE, GGML_OP_MUL), since GGML_OP_MUL_MAT including both GEMM and GEMV operation takes significant portion of execution time, two dedicated annotation channels are created for GEMM and GEMV respectively. 
+
+The annotation channel starts at the beginning of ‘ggml_compute_forward’ and stops at its end, so that the computation of each tensor node/operator can be monitored.
+
+First, add the Streamline annotation header file to ggml-cpu.c:
+```c
+#include "streamline_annotate.h"
+```
+Then add the annotation channel code to the ggml_graph_compute_thread function:
+```c
+for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
+    struct ggml_tensor * node = cgraph->nodes[node_n];
+    // Start Annotation Channel for Streamline
+    {
+        char printf_buf[256];
+        sprintf(printf_buf, " %s, %s", node->name, ggml_get_name(node));
+
+        if (node->op == GGML_OP_MUL_MAT) {
+            if (node->src[1]->ne[1] == 1)
+                ANNOTATE_CHANNEL(0, printf_buf); // It is GEMV
+            else
+                ANNOTATE_CHANNEL(1, printf_buf); // It is GEMM
+        } else {
+            ANNOTATE_CHANNEL((node->op) + 2, printf_buf);
+        }
+    }
+
+    ggml_compute_forward(&params, node);
+
+    // End Annotation Channel for Streamline
+    {
+        if (node->op == GGML_OP_MUL_MAT) {
+            if (node->src[1]->ne[1] == 1)
+                ANNOTATE_CHANNEL_END(0);
+            else
+                ANNOTATE_CHANNEL_END(1);
+        } else {
+            ANNOTATE_CHANNEL_END((node->op) + 2);
+        }
+    }
+```
+
+We also add the tensor node name and the operation name to the annotation strings.
+
+If information about the shape and size of the source tensors is required, we can change the code as below:
+```c
+    sprintf(printf_buf, "%s %s %d_%d_%d %d_%d_%d", node->name, ggml_get_name(node), \
+        node->src[0] ? (int) node->src[0]->ne[0] : 0, \
+        node->src[0] ? (int) node->src[0]->ne[1] : 0, \
+        node->src[0] ? (int) node->src[0]->ne[2] : 0, \
+        node->src[1] ? (int) node->src[1]->ne[0] : 0, \
+        node->src[1] ? (int) node->src[1]->ne[1] : 0, \
+        node->src[1] ? (int) node->src[1]->ne[2] : 0 \
+    );
+```
+Then we need to change *llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt* to include the Streamline annotation header file and the libstreamline_annotate.a library by adding the lines below:
+```bash
+  set(STREAMLINE_LIB_PATH ${CMAKE_SOURCE_DIR}/streamline_annotation/libstreamline_annotate.a)
+  target_include_directories( ${GGML_CPU_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/streamline_annotation)
+  target_link_libraries(${GGML_CPU_NAME} PRIVATE ${STREAMLINE_LIB_PATH} )
+```
+
+Then build the llama-cli executable, run llama-cli, and collect profiling data with Streamline as in the previous section.
+
+
+## Analyze the data with Streamline
+String annotations are displayed as text overlays inside the relevant channels in the details panel of the Timeline view, for example inside Channel 0 in the following screenshot.
+![text#center](images/deep_dive_1.png "Figure 16. Annotation Channel")
+
+The letter A is displayed in the process list to indicate the presence of annotations.
+String annotations are also displayed in the Message column in the Log view.
+![text#center](images/deep_dive_2.png "Figure 17. Annotation log")
+
+### View of individual operators at Prefill stage
+
+The annotation channel view at the Prefill stage is shown below:
+![text#center](images/prefill_annotation_channel.png "Figure 18. Annotation Channel at Prefill stage")
+
+Note that the operator names in the screenshot above were edited manually. If you want Streamline to show the operator name instead of the channel number, ANNOTATE_NAME_CHANNEL can be added to the ggml_graph_compute_thread function.
+This annotation macro is defined as:
+```c
+ANNOTATE_NAME_CHANNEL(channel, group, string)
+```
+For example:
+```c
+    ANNOTATE_NAME_CHANNEL(0, 0, "MUL_MAT_GEMV");
+    ANNOTATE_NAME_CHANNEL(1, 0, "MUL_MAT_GEMM");
+```
+The code above sets the name of annotation channel 0 to ‘MUL_MAT_GEMV’ and the name of annotation channel 1 to ‘MUL_MAT_GEMM’.
+We can get more detailed information by zooming into the view:
+![text#center](images/prefill_annotation_channel_2.png "Figure 18. Annotation Channel at Prefill stage, zoomed in")
+
+When you move the cursor over the annotation channel, the tensor node name, the operation name, and the shape and size of the source tensor nodes are shown.
+![text#center](images/prefill_annotation_channel_3.png "Figure 19. Annotation Channel Zoom in")
+
+The screenshot above shows a GGML_OP_MUL_MAT operator of the FFN_UP node, whose source tensors have shapes [1024, 2816] and [1024, 68].
+The view clearly shows that, at the Prefill stage, the major time is spent on MUL_MAT GEMM operations of the attention layers and FFN layers. There is one large MUL_MAT GEMV operation at the result_output linear layer. Other operators such as MUL, Softmax, Norm, and RoPE do not take significant time.
+
+### View of individual operators at Decode stage
+The annotation channel view at the Decode stage is shown below:
+![text#center](images/decode_annotation_channel.png "Figure 20. Annotation Channel at Decode stage")
+
+We can get more detailed information by zooming into the view:
+![text#center](images/decode_annotation_channel_2.png "Figure 21. Annotation Channel string")
+
+The view shows that, at the Decode stage, the major time is spent on MUL_MAT GEMV operations of the attention layers and FFN layers. Compared with the Prefill stage, there is no GEMM at those layers; GEMV operations are performed instead. The large MUL_MAT GEMV operation at the result_output linear layer takes a more significant portion of the time at the Decode stage, since each token generation at the Decode stage takes less time thanks to the KV cache. This corresponds to the percentage of execution time of the function ggml_vec_dot_q6_K_q8_K that we observed in the previous section.
\ No newline at end of file
diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Introduction.md b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Introduction.md
new file mode 100644
index 0000000000..bdc885dad5
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Introduction.md
@@ -0,0 +1,20 @@
+---
+title: Overview
+weight: 2
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+# Overview
+Large Language Models (LLMs) run very smoothly on Arm CPUs, but the frameworks that run them are usually complex. To analyze the execution of an LLM and use the profiling information for potential code optimization, a good understanding of the transformer architecture and an appropriate analysis tool are required.
+This guide uses the llama-cli application from llama.cpp and Arm’s Streamline tool to analyze the efficiency of an LLM running on Arm CPUs.
+
+The guide covers:
+* How to profile LLM token generation at the Prefill and Decode stages
+* How to profile the execution of individual tensor nodes/operators
+* How to profile LLM execution with multiple threads/cores
+
+Understanding this guide requires prerequisite knowledge of the transformer architecture, llama.cpp, and Streamline.
+
+We run the Qwen1_5-0_5b-chat-q4_0.gguf model with llama-cli on Arm64 Linux and use Streamline for analysis.
+This guide should also work on the Arm64 Android platform.
\ No newline at end of file
diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Introduction_to_llama_cpp.md b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Introduction_to_llama_cpp.md
new file mode 100644
index 0000000000..15bc501c7d
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Introduction_to_llama_cpp.md
@@ -0,0 +1,39 @@
+---
+title: Introduction to llama.cpp
+weight: 3
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+# Introduction to llama.cpp
+llama.cpp is an LLM framework, implemented in C++, that can be used for both training and inference. This guide only covers inference on the CPU.
+llama-cli provides a terminal interface for interacting with an LLM using the llama.cpp inference engine. It enables LLM inference, chat mode, and grammar-constrained generation directly from the command line.
+![text#center](images/llama_structure.png "Figure 1. llama-cli structure")
+
+llama-cli does the following things:
+* Loads and interprets LLMs in .gguf format.
+* Builds a compute graph according to the model structure. The compute graph can be divided into subgraphs that are assigned to the most suitable backend devices. At this step, the model structure is converted into a compute graph with many tensor nodes/operators (such as ADD, MUL_MAT, NORM, SOFTMAX) that can actually be computed.
+Since this guide only focuses on running the LLM on the CPU, all operators are assigned to the CPU backend.
+* Allocates memory for the tensor nodes in the compute graph using the graph planner.
+* Computes the tensor nodes at the graph compute stage, where the ‘graph_compute’ function forwards the compute subgraphs to the backend devices. The computation is performed by traversing the tree of nodes in the compute graph.
+
+The steps above are wrapped in the function ‘llama_decode’. At the LLM Prefill and Decode stages, llama-cli calls ‘llama_decode’ repeatedly to generate tokens. However, the parameter ‘llama_batch’ passed to ‘llama_decode’ differs between the Prefill and Decode stages. ‘llama_batch’ includes information such as the input tokens, the number of input tokens, and the position of the input tokens.
+
+The components of llama.cpp include:
+![text#center](images/llama_componetns.jpg "Figure 2. llama.cpp components")
+
+llama.cpp supports various backends such as CPU, CUDA, and OpenCL.
+For the CPU backend, it provides an optimized ggml-cpu library (mainly utilizing CPU vector instructions). For Arm CPUs, the ggml-cpu library also offers an aarch64 trait that leverages the new I8MM instructions for acceleration. The ggml-cpu library also integrates the Arm KleidiAI library as an additional trait.
+
+Most autoregressive LLMs are decoder-only models. Here is a brief introduction to the Prefill and Decode stages of autoregressive LLMs.
+![text#center](images/llm_prefill_decode.jpg "Figure 3. Prefill and Decode stage")
+
+At the Prefill stage, multiple input tokens of the prompt are processed. It mainly performs GEMM operations (a matrix multiplied by another matrix) to generate the first output token.
+![text#center](images/transformer_prefill.jpg "Figure 4. Prefill stage")
+
+
+At the Decode stage, by utilizing the KV cache, it mainly performs GEMV operations (a vector multiplied by a matrix) to generate subsequent output tokens one by one.
+![text#center](images/transformer_decode.jpg "Figure 5. Decode stage")
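+
+To see why the Decode stage tends to be memory-bound, a rough back-of-the-envelope estimate helps. Take the [1, 1024] x [1024, 151936] result_output GEMV analyzed later in this guide: it performs about 2 x 1024 x 151936 (roughly 0.3 GFLOP) of arithmetic, yet each of the roughly 156 million weights is loaded from memory once and never reused, so only a few operations are performed per byte loaded. In contrast, a Prefill GEMM over, say, 78 prompt tokens reuses every loaded weight 78 times, which is why Prefill can keep the vector units busy while Decode spends much of its time waiting on memory.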
Decode stage") + +Therefore, the prefill stage is compute-bound, while the decode stage has relatively less computation and is more memory-bound due to lots of KV cache memory access. This can be seen in the subsequent analysis with Streamline. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Multi_threads.md b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Multi_threads.md new file mode 100644 index 0000000000..c3eee09f8c --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/Multi_threads.md @@ -0,0 +1,39 @@ +--- +title: Use Streamline to analyze multi-core/multi-thread support in llama.cpp +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +# Use Streamline to analyze multi-core/multi-thread support in llama.cpp +The CPU backend in llama.cpp utilizes multi-core/multi-thread to accelerate the computation of operators. +llama.cpp creates a threadpool. The number of threads in threadpool is decided by ‘-t’ option, if ‘-t’ option is not specified, then it is set as the number of CPU cores in the system by default. +The entrypoint of secondary thread is ggml_graph_compute_secondary_thread. +When computing one tensor node/operator in the compute graph, if the worksize is big, llama.cpp splits its computation into multiple parts for those threads. +Here is an example of MUL_MAT operator to demonstrate how the splitting is done. + +![text#center](images/multi_thread.jpg "Figure 22. Multi-thread") + +In this example, the result matrix C is split equally between four threads, each thread computes a quarter of matrix C. +The execution of multi-threads on CPU cores can be observed by Streamline. Core Map and Cluster Map modes in the Streamline Timeline view map threads to CPU cores. + +More information about Core Map and Cluster Map modes can be found here +https://developer.arm.com/documentation/101816/9-7/Analyze-your-capture/Viewing-application-activity/Core-Map-and-Cluster-Map-modes + +Run llama-cli with ‘-t 2 -C 0x3’ to specify two threads and thread affinity as CPU core0 and core1, +```bash +./llama-cli -m qwen1_5-0_5b-chat-q4_0.gguf -p "<|im_start|>system\nYou are a helpful AI assistant.<|im_end|>\n<|im_start|>user\nTell me a story about a fox and a crow? Please do not tell the traditional story in Aesop's fables. Please tell me a positive story about friendship and love. The story should have no more than 400 words<|im_end|>\n<|im_start|>assistant\n" -st -t 2 -C 0x3 +``` + +Collect profiling data with Streamline, then select Core Map and Cluster Map modes in the Streamline Timeline view. + +![text#center](images/multi_thread_core_map.png "Figure 23. Multi-thread") + +As shown in the screenshot above, two threads are created and running on CPU core0 and core1 respectively. +Furthermore, individual operator view with annotation channel can be used to view two threads’ operators in parallel. +Note that annotation channels are created independently per-thread. + +![text#center](images/multi_thread_annotation_channel.png "Figure 24. Multi-thread") + +As shown in screenshot above, at the specific time, both threads are computing for the same node. In this example, it is result_output linear node. 
\ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/_index.md b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/_index.md new file mode 100644 index 0000000000..28843b0bde --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/_index.md @@ -0,0 +1,57 @@ +--- +title: Use Streamline to analyze LLM running on CPU with llama.cpp and KleidiAI + +draft: true +cascade: + draft: true + +minutes_to_complete: 50 + +who_is_this_for: Engineers who want to learn LLM inference on CPU or profile and optimize llama.cpp code. + +learning_objectives: + - Be able to use Streamline to profile llama.cpp code + - Learn the execution of LLM on CPU + +prerequisites: + - Understanding of llama.cpp + - Understanding of transformer model + - Knowledge of Streamline usage + +author: Zenon(Zhilong) Xiu + +### Tags +skilllevels: Advanced +subjects: ML +armips: + - Cortex-A + - Neoverse +tools_software_languages: + - Arm Streamline + - C++ +operatingsystems: + - Linux + - Android + +further_reading: + - resource: + title: llama.cpp project + link: https://github.com/ggml-org/llama.cpp + type: source code + - resource: + title: Qwen1_5-0_5b-chat-q4_0.gguf + link: https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-GGUF/blob/main/qwen1_5-0_5b-chat-q4_0.gguf + type: LLM model + - resource: + title: Arm Streamline User Guide + link: https://developer.arm.com/documentation/101816/9-7 + type: website + + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/Decode_only.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/Decode_only.png new file mode 100644 index 0000000000..3d084767d8 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/Decode_only.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/Prefill_only.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/Prefill_only.png new file mode 100644 index 0000000000..68b6d57957 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/Prefill_only.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_marker_1.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_marker_1.png new file mode 100644 index 0000000000..8ee615057c Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_marker_1.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_marker_2.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_marker_2.png new file mode 100644 index 0000000000..c466f72fb1 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_marker_2.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_marker_prefill.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_marker_prefill.png new file mode 100644 index 0000000000..2b425ddb29 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_marker_prefill.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_pmu_stall.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_pmu_stall.png new file mode 100644 index 0000000000..dc00fd642f Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_pmu_stall.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_prefill_call_stack.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_prefill_call_stack.png new file mode 100644 index 0000000000..f1c29741e8 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_prefill_call_stack.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_prefill_functions.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_prefill_functions.png new file mode 100644 index 0000000000..f2c393f885 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/annotation_prefill_functions.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/decode_annotation_channel.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/decode_annotation_channel.png new 
file mode 100644 index 0000000000..5dc572a063 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/decode_annotation_channel.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/decode_annotation_channel_2.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/decode_annotation_channel_2.png new file mode 100644 index 0000000000..f6095be12c Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/decode_annotation_channel_2.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/deep_dive_1.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/deep_dive_1.png new file mode 100644 index 0000000000..e63b2b7b26 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/deep_dive_1.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/deep_dive_2.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/deep_dive_2.png new file mode 100644 index 0000000000..1fc58df987 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/deep_dive_2.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/llama_componetns.jpg b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/llama_componetns.jpg new file mode 100644 index 0000000000..55f56c2883 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/llama_componetns.jpg differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/llama_componetns.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/llama_componetns.png new file mode 100644 index 0000000000..5fdf8f3a66 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/llama_componetns.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/llama_structure.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/llama_structure.png new file mode 100644 index 0000000000..67cea85969 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/llama_structure.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/llm_prefill_decode.jpg b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/llm_prefill_decode.jpg new file mode 100644 index 0000000000..9be52a78fd Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/llm_prefill_decode.jpg differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/multi_thread.jpg b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/multi_thread.jpg new file mode 100644 index 0000000000..7b6fc6a7f8 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/multi_thread.jpg differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/multi_thread.png 
b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/multi_thread.png new file mode 100644 index 0000000000..47188a01b8 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/multi_thread.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/multi_thread_annotation_channel.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/multi_thread_annotation_channel.png new file mode 100644 index 0000000000..1b435ae958 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/multi_thread_annotation_channel.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/multi_thread_core_map.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/multi_thread_core_map.png new file mode 100644 index 0000000000..505de28210 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/multi_thread_core_map.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/prefill_annotation_channel.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/prefill_annotation_channel.png new file mode 100644 index 0000000000..5bbca5fbb9 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/prefill_annotation_channel.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/prefill_annotation_channel_2.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/prefill_annotation_channel_2.png new file mode 100644 index 0000000000..e32eed9703 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/prefill_annotation_channel_2.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/prefill_annotation_channel_3.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/prefill_annotation_channel_3.png new file mode 100644 index 0000000000..b42cff8220 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/prefill_annotation_channel_3.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/streamline_capture.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/streamline_capture.png new file mode 100644 index 0000000000..8deffcef4a Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/streamline_capture.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/streamline_capture_image.png b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/streamline_capture_image.png new file mode 100644 index 0000000000..1a6c359f52 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/streamline_capture_image.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/transformer_decode.jpg b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/transformer_decode.jpg new file mode 100644 index 0000000000..4618ca890f Binary 
files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/transformer_decode.jpg differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/transformer_prefill.jpg b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/transformer_prefill.jpg new file mode 100644 index 0000000000..f501973bb4 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/images/transformer_prefill.jpg differ diff --git a/content/learning-paths/servers-and-cloud-computing/migration/_index.md b/content/learning-paths/servers-and-cloud-computing/migration/_index.md index c5bd50dd1a..9361277b38 100644 --- a/content/learning-paths/servers-and-cloud-computing/migration/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/migration/_index.md @@ -24,8 +24,7 @@ armips: operatingsystems: - Linux tools_software_languages: - - Coding - - Neon + - NEON - SVE - Go - Runbook diff --git a/content/learning-paths/servers-and-cloud-computing/milvus-rag/_index.md b/content/learning-paths/servers-and-cloud-computing/milvus-rag/_index.md index fc2f68b7d8..64476da84c 100644 --- a/content/learning-paths/servers-and-cloud-computing/milvus-rag/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/milvus-rag/_index.md @@ -23,7 +23,7 @@ armips: - Neoverse tools_software_languages: - Python - - GenAI + - Generative AI - RAG - Hugging Face diff --git a/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/_index.md b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/_index.md new file mode 100644 index 0000000000..de5c91fb33 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/_index.md @@ -0,0 +1,59 @@ +--- +title: Run MongoDB on the Microsoft Azure Cobalt 100 processors + +draft: true +cascade: + draft: true + +minutes_to_complete: 30 + +who_is_this_for: This Learning Path is designed for software developers looking to migrate their MongoDB workloads to Arm-based platforms, specifically on the Microsoft Azure Cobalt 100 processors. + +learning_objectives: + - Provision an Azure Arm64 Cobalt 100 based virtual machine using Azure console, with Ubuntu Pro 24.04 LTS as the base image. + - Deploy MongoDB on an Azure Cobalt 100 based virtual machine. + - Perform MongoDB baseline testing and benchmarking on the Arm64 virtual machine. + +prerequisites: + - A [Microsoft Azure](https://azure.microsoft.com/) account with access to Cobalt 100 based instances (Dpsv6). + - Familiarity with the [MongoDB architecture](https://www.mongodb.com/) and deployment practices on Arm64 platforms. 
+
+author: Pareena Verma
+
+### Tags
+skilllevels: Introductory
+subjects: Databases
+cloud_service_providers: Microsoft Azure
+
+armips:
+  - Neoverse
+
+tools_software_languages:
+  - MongoDB
+  - mongotop
+  - mongostat
+
+operatingsystems:
+  - Linux
+
+further_reading:
+  - resource:
+      title: MongoDB Manual
+      link: https://www.mongodb.com/docs/manual/
+      type: documentation
+  - resource:
+      title: MongoDB Performance Tool
+      link: https://github.com/idealo/mongodb-performance-test#readme
+      type: documentation
+  - resource:
+      title: MongoDB on Azure
+      link: https://azure.microsoft.com/en-us/solutions/mongodb
+      type: documentation
+
+
+### FIXED, DO NOT MODIFY
+# ================================================================================
+weight: 1                       # _index.md always has weight of 1 to order correctly
+layout: "learningpathall"       # All files under learning paths have this same wrapper
+learning_path_main_page: "yes"  # This should be surfaced when looking for related content. Only set for _index.md of learning path content.
+---
diff --git a/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/_next-steps.md
new file mode 100644
index 0000000000..c3db0de5a2
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/_next-steps.md
@@ -0,0 +1,8 @@
+---
+# ================================================================================
+# FIXED, DO NOT MODIFY THIS FILE
+# ================================================================================
+weight: 21                  # Set to always be larger than the content in this path to be at the end of the navigation.
+title: "Next Steps"         # Always the same, html page title.
+layout: "learningpathall"   # All files under learning paths have this same wrapper for Hugo processing.
+---
diff --git a/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/background.md b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/background.md
new file mode 100644
index 0000000000..fa257f0c98
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/background.md
@@ -0,0 +1,20 @@
+---
+title: "Overview"
+
+weight: 2
+
+layout: "learningpathall"
+---
+
+## Cobalt 100 Arm-based processor
+
+Azure’s Cobalt 100 is Microsoft's first-generation, in-house Arm-based processor. Designed entirely by Microsoft and based on Arm’s Neoverse N2 architecture, this 64-bit CPU delivers improved performance and energy efficiency across a broad spectrum of cloud-native, scale-out Linux workloads. These include web and application servers, data analytics, open-source databases, caching systems, and more. Running at 3.4 GHz, the Cobalt 100 processor allocates a dedicated physical core for each vCPU, ensuring consistent and predictable performance.
+
+To learn more about Cobalt 100, refer to the blog [Announcing the preview of new Azure virtual machine based on the Azure Cobalt 100 processor](https://techcommunity.microsoft.com/blog/azurecompute/announcing-the-preview-of-new-azure-vms-based-on-the-azure-cobalt-100-processor/4146353).
+
+## MongoDB
+MongoDB is a popular open-source NoSQL database designed for high performance, scalability, and flexibility.
+
+It stores data in JSON-like BSON documents, making it ideal for modern applications that require dynamic, schema-less data structures.
+
+MongoDB is widely used for web, mobile, IoT, and real-time analytics workloads.
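+
+To make the document model concrete, the hypothetical snippet below stores and retrieves one schema-less document. It assumes mongosh is already installed (you will install it later in this Learning Path), and the `exampleDB` database and `devices` collection are placeholder names:
+
+```console
+mongosh --eval '
+db = db.getSiblingDB("exampleDB");
+db.devices.insertOne({ name: "sensor-01", type: "IoT", readings: [21.5, 22.1], active: true });
+printjson(db.devices.findOne());
+'
+```
+
+Each document can carry its own mix of fields (strings, numbers, arrays, booleans) without a predefined schema.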
Learn more from the [MongoDB official website](https://www.mongodb.com/) and its [official documentation](https://www.mongodb.com/docs/). diff --git a/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/baseline-testing.md b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/baseline-testing.md new file mode 100644 index 0000000000..4b981f3286 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/baseline-testing.md @@ -0,0 +1,227 @@ +--- +title: MongoDB Baseline Testing +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + + +### Baseline testing of MongoDB +In this section you will perform baseline testing by verifying MongoDB is running, logging into the shell, executing a few test queries, and monitoring live performance. This ensures the database is functioning correctly before starting any benchmarks. + +1. Verify Installation & Service Health + +```console +ps -ef | grep mongod +mongod --version +netstat -tulnp | grep 27017 +``` +An explanation of what each command is doing: +- **ps -ef | grep mongod** – Checks if the MongoDB server process is running. +- **mongod --version** – Shows the version of MongoDB installed. +- **netstat -tulnp | grep 27017** – Checks if MongoDB is listening for connections on its default port 27017. + +You should see output similar to: + +```output +mongod --version +netstat -tulnp | grep 27017 +ubuntu 4288 1 0 10:40 ? 00:00:01 mongod --dbpath /var/lib/mongo --logpath /var/log/mongodb/mongod.log --fork +ubuntu 4545 1764 0 10:43 pts/0 00:00:00 grep --color=auto mongod +db version v8.0.12 +Build Info: { + "version": "8.0.12", + "gitVersion": "b60fc6875b5fb4b63cc0dbbd8dda0d6d6277921a", + "openSSLVersion": "OpenSSL 3.0.13 30 Jan 2024", + "modules": [], + "allocator": "tcmalloc-google", + "environment": { + "distmod": "ubuntu2404", + "distarch": "aarch64", + "target_arch": "aarch64" + } +} +(Not all processes could be identified, non-owned process info + will not be shown, you would have to be root to see it all.) +tcp 0 0 127.0.0.1:27017 0.0.0.0:* LISTEN 4288/mongod +``` + +2. Storage and Health Check + +To perform a storage and health check, run the command below. 
+This command checks how fast your storage can randomly read small 4KB chunks from a 100 MB file for 30 seconds, using one job, followed by a summary report:
+
+```console
+fio --name=baseline --rw=randread --bs=4k --size=100M --numjobs=1 --time_based --runtime=30 --group_reporting
+```
+You should see output similar to:
+
+```output
+baseline: (g=0): rw=randread, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=psync, iodepth=1
+fio-3.36
+Starting 1 process
+Jobs: 1 (f=1): [r(1)][100.0%][r=14.8MiB/s][r=3799 IOPS][eta 00m:00s]
+baseline: (groupid=0, jobs=1): err= 0: pid=3753: Mon Sep  1 10:25:07 2025
+  read: IOPS=4255, BW=16.6MiB/s (17.4MB/s)(499MiB/30001msec)
+    clat (usec): min=88, max=46246, avg=234.23, stdev=209.81
+     lat (usec): min=88, max=46246, avg=234.28, stdev=209.81
+    clat percentiles (usec):
+     |  1.00th=[   99],  5.00th=[  111], 10.00th=[  126], 20.00th=[  167],
+     | 30.00th=[  190], 40.00th=[  229], 50.00th=[  243], 60.00th=[  253],
+     | 70.00th=[  269], 80.00th=[  289], 90.00th=[  318], 95.00th=[  330],
+     | 99.00th=[  416], 99.50th=[  490], 99.90th=[  799], 99.95th=[ 1106],
+     | 99.99th=[ 3884]
+   bw (  KiB/s): min=14536, max=19512, per=100.00%, avg=17046.10, stdev=1359.69, samples=59
+   iops        : min= 3634, max= 4878, avg=4261.53, stdev=339.92, samples=59
+  lat (usec)   : 100=1.27%, 250=56.61%, 500=41.65%, 750=0.34%, 1000=0.06%
+  lat (msec)   : 2=0.04%, 4=0.01%, 10=0.01%, 20=0.01%, 50=0.01%
+  cpu          : usr=0.33%, sys=2.93%, ctx=127668, majf=0, minf=8
+  IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
+     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
+     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
+     issued rwts: total=127661,0,0,0 short=0,0,0,0 dropped=0,0,0,0
+     latency   : target=0, window=0, percentile=100.00%, depth=1
+
+Run status group 0 (all jobs):
+   READ: bw=16.6MiB/s (17.4MB/s), 16.6MiB/s-16.6MiB/s (17.4MB/s-17.4MB/s), io=499MiB (523MB), run=30001-30001msec
+
+Disk stats (read/write):
+  sda: ios=127195/29, sectors=1017560/552, merge=0/15, ticks=29133/8, in_queue=29151, util=96.37%
+```
+The output shows how fast the storage read data (**16.6 MiB/s**) and how many reads it performed per second (**~4255 IOPS**), which tells you how responsive your storage is for random reads.
+
+3. Connectivity and CRUD Sanity Check
+
+To verify that the MongoDB server is reachable, you will perform a connectivity check and run a sanity test of core database functionality and permissions, referred to as CRUD:
+
+- C - Create: Insert a new record/document into the database.
+- R - Read: Query the database to retrieve data.
+- U - Update: Modify an existing record.
+- D - Delete: Remove a record.
+
+```console
+mongosh --host localhost --port 27017
+```
+
+Inside the shell:
+
+```javascript
+use baselineDB
+db.testCollection.insertOne({ name: "baseline-check", value: 1 })
+db.testCollection.find()
+db.testCollection.updateOne({ name: "baseline-check" }, { $set: { value: 2 } })
+db.testCollection.deleteOne({ name: "baseline-check" })
+exit
+```
+These commands create a test record, read it, update its value, and then delete it: a simple way to check that MongoDB’s basic **add, read, update, and delete** operations are working.
+
+You should see output similar to:
+
+```output
+test> use baselineDB
+switched to db baselineDB
+baselineDB> db.testCollection.insertOne({ name: "baseline-check", value: 1 })
+{
+  acknowledged: true,
+  insertedId: ObjectId('689acdae6a86b49bca74e39a')
+}
+baselineDB> db.testCollection.find()
+[
+  {
+    _id: ObjectId('689acdae6a86b49bca74e39a'),
+    name: 'baseline-check',
+    value: 1
+  }
+]
+baselineDB> db.testCollection.updateOne({ name: "baseline-check" }, { $set: { value: 2 } })
+...
+{
+  acknowledged: true,
+  insertedId: null,
+  matchedCount: 1,
+  modifiedCount: 1,
+  upsertedCount: 0
+}
+baselineDB> db.testCollection.deleteOne({ name: "baseline-check" })
+...
+{ acknowledged: true, deletedCount: 1 }
+```
+
+4. Basic Query Performance Test
+
+You will now perform a lightweight query performance check:
+
+```console
+mongosh --eval '
+db = db.getSiblingDB("baselineDB");
+for (let i=0; i<1000; i++) { db.perf.insertOne({index:i, value:Math.random()}) };
+var start = new Date();
+db.perf.find({ value: { $gt: 0.5 } }).count();
+print("Query Time (ms):", new Date() - start);
+'
+```
+This command connects to MongoDB, switches to the `baselineDB` database, inserts 1,000 documents into the `perf` collection, and then measures the execution time for counting documents where `value > 0.5`. The final output displays the query execution time in milliseconds.
+
+You should see the Query Time output similar to:
+
+```output
+Query Time (ms): 2
+```
+
+5. Index Creation Speed Test
+
+You will now run a performance sanity check that measures how long MongoDB takes to create an index on a given collection:
+```console
+mongosh --eval '
+db = db.getSiblingDB("baselineDB");
+var start = new Date();
+db.perf.createIndex({ value: 1 });
+print("Index Creation Time (ms):", new Date() - start);
+'
+```
+This command connects to MongoDB, switches to the `baselineDB` database, creates an index on the `value` field in the `perf` collection, and prints the elapsed time. A result in the low tens of milliseconds indicates relatively fast index building for a dataset of this size.
+
+You should see output similar to:
+
+```output
+Index Creation Time (ms): 22
+```
+
+6. Concurrency Smoke Test
+
+You will now verify that MongoDB can handle concurrent client connections and inserts without errors:
+
+```console
+for i in {1..5}; do
+  mongosh --eval 'use baselineDB; db.concurrent.insertMany([...Array(1000).keys()].map(k => ({ test: k, ts: new Date() })))' &
+done
+wait
+```
+This command runs five MongoDB insert jobs at the same time, each adding 1,000 new records to the `baselineDB.concurrent` collection.
+It is a quick way to test how MongoDB handles multiple users writing data at once.
+
+You should see an output similar to:
+
+```output
+[1] 3818
+[2] 3819
+[3] 3820
+[4] 3821
+[5] 3822
+switched to db baselineDB;
+[1]   Done                 mongosh --eval 'use baselineDB; db.concurrent.insertMany([...Array(1000).keys()].map(k => ({ test: k, ts: new Date() })))'
+switched to db baselineDB;
+switched to db baselineDB;
+switched to db baselineDB;
+[2]   Done                 mongosh --eval 'use baselineDB; db.concurrent.insertMany([...Array(1000).keys()].map(k => ({ test: k, ts: new Date() })))'
+[4]-  Done                 mongosh --eval 'use baselineDB; db.concurrent.insertMany([...Array(1000).keys()].map(k => ({ test: k, ts: new Date() })))'
+[3]-  Done                 mongosh --eval 'use baselineDB; db.concurrent.insertMany([...Array(1000).keys()].map(k => ({ test: k, ts: new Date() })))'
+switched to db baselineDB;
+[5]+  Done                 mongosh --eval 'use baselineDB; db.concurrent.insertMany([...Array(1000).keys()].map(k => ({ test: k, ts: new Date() })))'
+```
+
+Five parallel MongoDB shell sessions were executed, each inserting 1,000 test documents into the `baselineDB.concurrent` collection. All sessions completed successfully, confirming that concurrent data insertion works as expected.
+
+With these tests you have confirmed that MongoDB is installed successfully and is functioning as expected on the Azure Cobalt 100 (Arm64) environment.
+
+You are now ready to perform further benchmarking for MongoDB.
diff --git a/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/benchmarking.md b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/benchmarking.md
new file mode 100644
index 0000000000..877920677a
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/benchmarking.md
@@ -0,0 +1,261 @@
+---
+title: MongoDB Benchmarking
+weight: 6
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## Benchmark MongoDB with **mongotop** and **mongostat**
+
+In this section, you will measure MongoDB's performance in real time.
+You will install the official MongoDB Database Tools, start MongoDB, and run a script to simulate heavy load. With the script running, you will then measure the database's live performance using **mongotop** and **mongostat**.
+
+1. Install MongoDB Database Tools
+
+```console
+wget https://fastdl.mongodb.org/tools/db/mongodb-database-tools-ubuntu2404-arm64-100.13.0.deb
+sudo apt update
+sudo apt install -y ./mongodb-database-tools-ubuntu2404-arm64-100.13.0.deb
+```
+These commands download and install MongoDB's official Database Tools package, which includes **mongotop** and **mongostat**. The package places the binaries in a standard system location, so you can run them from any terminal.
+
+2. Verify the Installation
+
+```console
+mongotop --version
+mongostat --version
+```
+This checks that both tools were installed correctly and are ready to use.
+
+You should see output similar to:
+```output
+mongostat --version
+mongotop version: 100.13.0
+git version: 23008ff975be028544710a5da6ae749dc7e90ab7
+Go version: go1.23.11
+   os: linux
+   arch: arm64
+   compiler: gc
+mongostat version: 100.13.0
+git version: 23008ff975be028544710a5da6ae749dc7e90ab7
+Go version: go1.23.11
+   os: linux
+   arch: arm64
+   compiler: gc
+```
+
+3. Make sure that the MongoDB server that you started in the previous section is still running.
+If not, start it again, using the command as shown:
+
+```console
+mongod --dbpath /var/lib/mongo --logpath /var/log/mongodb/mongod.log --fork
+```
+This command starts the database server in the background, reusing the data directory and log file you set up earlier; the log is useful for troubleshooting. Note that, started this way, the server accepts only local connections.
+
+4. Create a Long-Running Load Script for Benchmarking
+
+Use a file editor of your choice and create a file named `long_system_load.js` with the content below:
+
+```javascript
+function randomString(len) {
+    return Math.random().toString(36).substring(2, 2 + len);
+}
+
+var systemCollections = [
+    { db: "admin", coll: "atlascli" },
+    { db: "config", coll: "system_sessions_bench" },
+    { db: "config", coll: "transactions_bench" },
+    { db: "local", coll: "system_replset_bench" },
+    { db: "benchmarkDB", coll: "testCollection" },
+    { db: "benchmarkDB", coll: "cursorTest" },
+    { db: "test", coll: "atlascli" },
+    { db: "test", coll: "system_sessions_bench" },
+    { db: "test", coll: "admin_system_version_test" }
+];
+
+systemCollections.forEach(function(ns) {
+    let col = db.getSiblingDB(ns.db).getCollection(ns.coll);
+    col.drop();
+    for (let i = 0; i < 100; i++) {
+        col.insertOne({ rnd: randomString(10), ts: new Date(), idx: i });
+    }
+    col.findOne();
+});
+
+var totalCycles = 50;
+var pauseMs = 1000;
+
+for (let cycle = 0; cycle < totalCycles; cycle++) {
+    systemCollections.forEach(function(ns) {
+        let col = db.getSiblingDB(ns.db).getCollection(ns.coll);
+
+        col.insertOne({ cycle, action: "insert", value: randomString(8), ts: new Date() });
+        col.find({ cycle: { $lte: cycle } }).limit(10).toArray();
+        col.updateMany({}, { $set: { updatedAt: new Date() } });
+        col.deleteMany({ idx: { $gt: 80 } });
+
+        let cursor = col.find().batchSize(5);
+        while (cursor.hasNext()) {
+            cursor.next();
+        }
+    });
+
+    print(`Cycle ${cycle + 1} / ${totalCycles} completed`);
+    sleep(pauseMs);
+}
+
+print("=== Long load generation completed ===");
+```
+
+This is the load generator script: it creates several collections and repeatedly inserts, queries, updates, and deletes data. Running it simulates real application traffic, so the monitors have something to measure.
+
+{{% notice Note %}}
+Before proceeding, the load script and the monitoring tools must be run in separate terminals simultaneously.
+
+- The load script continuously generates activity in MongoDB, keeping the database busy with multiple operations.
+- The mongotop and mongostat tools monitor and report this activity in real time as it happens.
+
+If all commands are run in the same terminal, the monitoring tools will only start after the script finishes, preventing real-time observation of MongoDB's performance.
+{{% /notice %}}

+### Run the load script (start the workload) — Terminal 1
+
+```console
+mongosh < long_system_load.js
+```
+
+This command tells the MongoDB shell to execute the entire script. The script will run through its cycles and print the progress while generating the read/write activity on the server.
+
+You should see output similar to:
+```output
+test> // long_system_load.js
+
+test> // Run with: mongosh < long_system_load.js
+
+test>
+
+test> function randomString(len) {
+...     return Math.random().toString(36).substring(2, 2 + len);
+... }
+[Function: randomString]
+test>
+
+test> // ---------- 1. Safe shadow "system-like" namespaces ----------
+
+test> var systemCollections = [
+...     { db: "admin", coll: "atlascli" },
+...     { db: "config", coll: "system_sessions_bench" },
+...
{ db: "config", coll: "transactions_bench" }, +... { db: "local", coll: "system_replset_bench" }, +... { db: "benchmarkDB", coll: "testCollection" }, +... { db: "benchmarkDB", coll: "cursorTest" }, +... { db: "test", coll: "atlascli" }, +... { db: "test", coll: "system_sessions_bench" }, +... { db: "test", coll: "admin_system_version_test" } +... ]; + +test> + +test> // Create and warm up + +test> systemCollections.forEach(function(ns) { +... let col = db.getSiblingDB(ns.db).getCollection(ns.coll); +... col.drop(); +... for (let i = 0; i < 100; i++) { +... col.insertOne({ rnd: randomString(10), ts: new Date(), idx: i }); +... } +... col.findOne(); +... }); + +test> + +test> // ---------- 2. Generate load loop ---------- + +test> var totalCycles = 50; // increase this for longer runs + +test> var pauseMs = 1000; // 1 second pause between cycles + +test> + +test> for (let cycle = 0; cycle < totalCycles; cycle++) { +... systemCollections.forEach(function(ns) { +... let col = db.getSiblingDB(ns.db).getCollection(ns.coll); +... +... col.insertOne({ cycle, action: "insert", value: randomString(8), ts: new Date() }); +... col.find({ cycle: { $lte: cycle } }).limit(10).toArray(); +... col.updateMany({}, { $set: { updatedAt: new Date() } }); +... col.deleteMany({ idx: { $gt: 80 } }); +... +... let cursor = col.find().batchSize(5); +... while (cursor.hasNext()) { +... cursor.next(); +... } +... }); +... +... print(`Cycle ${cycle + 1} / ${totalCycles} completed`); +... sleep(pauseMs); +... } +Cycle 1 / 50 completed +Cycle 2 / 50 completed +Cycle 3 / 50 completed +Cycle 4 / 50 completed +Cycle 5 / 50 completed +Cycle 6 / 50 completed +Cycle 7 / 50 completed +Cycle 8 / 50 completed +Cycle 9 / 50 completed +Cycle 10 / 50 completed +Cycle 11 / 50 completed +Cycle 12 / 50 completed +Cycle 13 / 50 completed +Cycle 14 / 50 completed +Cycle 15 / 50 completed +Cycle 16 / 50 completed +Cycle 17 / 50 completed +Cycle 18 / 50 completed +Cycle 19 / 50 completed +Cycle 20 / 50 completed +Cycle 21 / 50 completed +Cycle 22 / 50 completed +Cycle 23 / 50 completed +Cycle 24 / 50 completed +Cycle 25 / 50 completed +Cycle 26 / 50 completed +Cycle 27 / 50 completed +Cycle 28 / 50 completed +Cycle 29 / 50 completed +Cycle 30 / 50 completed +Cycle 31 / 50 completed +Cycle 32 / 50 completed +Cycle 33 / 50 completed +Cycle 34 / 50 completed +Cycle 35 / 50 completed +Cycle 36 / 50 completed +Cycle 37 / 50 completed +Cycle 38 / 50 completed +Cycle 39 / 50 completed +Cycle 40 / 50 completed +Cycle 41 / 50 completed +Cycle 42 / 50 completed +Cycle 43 / 50 completed +Cycle 44 / 50 completed +Cycle 45 / 50 completed +Cycle 46 / 50 completed +Cycle 47 / 50 completed +Cycle 48 / 50 completed +Cycle 49 / 50 completed +Cycle 50 / 50 completed + +test> + +test> print("=== Long load generation completed ==="); +=== Long load generation completed === + +``` + +The load has been generated successfully. Now, you can proceed to the next section where you will monitor this running workload with: + +- **mongotop** to observe activity per collection. +- **mongostat** to monitor overall operations per second, memory usage, and network activity. 
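+
+The workload runs for 50 cycles with a one-second pause between cycles, so it completes in a minute or two. If it finishes before you are done observing the monitors, one way to lengthen it (a sketch that simply edits the script in place) is to raise the `totalCycles` value and relaunch:
+
+```console
+# Increase the cycle count from 50 to 200, then restart the workload
+sed -i 's/var totalCycles = 50;/var totalCycles = 200;/' long_system_load.js
+mongosh < long_system_load.js
+```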
diff --git a/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/create-instance.md b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/create-instance.md
new file mode 100644
index 0000000000..55f6b3cadf
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/create-instance.md
@@ -0,0 +1,46 @@
+---
+title: Create an Arm-based cloud virtual machine using the Microsoft Cobalt 100 CPU
+weight: 3
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## Introduction
+
+There are several ways to create an Arm-based Cobalt 100 virtual machine: the Microsoft Azure console, the Azure CLI tool, or your choice of IaC (Infrastructure as Code) tooling. In this section, you will use the Azure console to create a virtual machine with the Arm-based Azure Cobalt 100 processor.
+
+This Learning Path focuses on the general-purpose virtual machines of the D series. Please read the guide on the [Dpsv6 size series](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes/general-purpose/dpsv6-series) offered by Microsoft Azure.
+
+While the steps to create this instance are included here for your convenience, you can also refer to the [Deploy a Cobalt 100 Virtual Machine on Azure Learning Path](/learning-paths/servers-and-cloud-computing/cobalt/).
+
+#### Create an Arm-based Azure Virtual Machine
+
+Creating a virtual machine based on Azure Cobalt 100 is no different from creating any other virtual machine in Azure. To create an Azure virtual machine, launch the Azure portal and navigate to "Virtual Machines".
+1. Select "Create", and click on "Virtual Machine" from the drop-down list.
+2. Inside the "Basic" tab, fill in the Instance details such as "Virtual machine name" and "Region".
+3. Choose the image for your virtual machine (for example, Ubuntu Pro 24.04 LTS) and select “Arm64” as the VM architecture.
+4. In the “Size” field, click on “See all sizes” and select the D-Series v6 family of virtual machines. Select “D4ps_v6” from the list.
+
+![Azure portal VM creation — Azure Cobalt 100 Arm64 virtual machine (D4ps_v6) alt-text#center](images/instance.png "Figure 1: Select the D-Series v6 family of virtual machines")
+
+5. Select "SSH public key" as the Authentication type. Azure will automatically generate an SSH key pair for you and allow you to store it for future use. It is a fast, simple, and secure way to connect to your virtual machine.
+6. Fill in the Administrator username for your VM.
+7. Select "Generate new key pair", and select "RSA SSH Format" as the SSH Key Type. RSA can offer better security with key lengths of 3072 bits or longer. Give your key pair a name.
+8. In the "Inbound port rules", select HTTP (80) and SSH (22) as the inbound ports.
+
+![Azure portal VM creation — Azure Cobalt 100 Arm64 virtual machine (D4ps_v6) alt-text#center](images/instance1.png "Figure 2: Allow inbound port rules")
+
+9. Click on the "Review + Create" tab and review the configuration for your virtual machine. It should look like the following:
+
+![Azure portal VM creation — Azure Cobalt 100 Arm64 virtual machine (D4ps_v6) alt-text#center](images/ubuntu-pro.png "Figure 3: Review and Create an Azure Cobalt 100 Arm64 VM")
+
+10. Finally, when you are confident about your selection, click on the "Create" button, and click on the "Download Private key and Create Resources" button.
+
+![Azure portal VM creation — Azure Cobalt 100 Arm64 virtual machine (D4ps_v6) alt-text#center](images/instance4.png "Figure 4: Download Private key and Create Resources")
+
+11. Your virtual machine should be ready and running within a few minutes. You can SSH into the virtual machine using the private key, along with the Public IP details.
+
+![Azure portal VM creation — Azure Cobalt 100 Arm64 virtual machine (D4ps_v6) alt-text#center](images/final-vm.png "Figure 5: VM deployment confirmation in Azure portal")
+
+Once the virtual machine is ready, proceed to the next section to deploy MongoDB on your running instance.
diff --git a/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/deploy.md b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/deploy.md
new file mode 100644
index 0000000000..2bee07f312
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/deploy.md
@@ -0,0 +1,133 @@
+---
+title: Install MongoDB and Mongosh
+weight: 4
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+
+## Install MongoDB and Mongosh on the Ubuntu Pro 24.04 LTS Arm instance
+
+Install MongoDB and mongosh on Ubuntu Pro 24.04 LTS Arm64 by downloading the binaries, setting up environment paths, configuring data and log directories, and starting the server for local access and verification.
+
+1. Install System Dependencies
+
+Install required system packages to support MongoDB:
+```console
+sudo apt update
+sudo apt install -y curl wget tar fio openssl libcurl4 net-tools
+```
+
+2. Download and Extract MongoDB
+
+Fetch and unpack the MongoDB binaries for Arm64:
+```console
+wget https://fastdl.mongodb.org/linux/mongodb-linux-aarch64-ubuntu2404-8.0.12.tgz
+tar -xvzf mongodb-linux-aarch64-ubuntu2404-8.0.12.tgz
+sudo mv mongodb-linux-aarch64-ubuntu2404-8.0.12 /usr/local/mongodb
+```
+
+3. Add MongoDB to the System PATH
+
+Enable running MongoDB from any terminal session:
+```console
+echo 'export PATH=/usr/local/mongodb/bin:$PATH' | sudo tee /etc/profile.d/mongodb.sh
+source /etc/profile.d/mongodb.sh
+```
+
+4. Create Data and Log Directories
+
+Set up the database data and log directories that MongoDB will use:
+```console
+sudo mkdir -p /var/lib/mongo
+sudo mkdir -p /var/log/mongodb
+sudo chown -R $USER:$USER /var/lib/mongo /var/log/mongodb
+```
+
+5. Start the MongoDB Server
+
+You can start MongoDB manually as shown:
+```console
+mongod --dbpath /var/lib/mongo --logpath /var/log/mongodb/mongod.log --fork
+```
+
+The output from this command should look like:
+```output
+about to fork child process, waiting until server is ready for connections.
+forked process: 3356
+child process started successfully, parent exiting
+```
+
+6. Install mongosh
+
+**mongosh** is the MongoDB Shell used to interact with your MongoDB server. It provides a modern, user-friendly CLI for running queries and database operations.
+ +Download and install MongoDB’s command-line shell for Arm: +```console +wget https://downloads.mongodb.com/compass/mongosh-2.3.8-linux-arm64.tgz +tar -xvzf mongosh-2.3.8-linux-arm64.tgz +sudo mv mongosh-2.3.8-linux-arm64 /usr/local/mongosh +``` +Add mongosh to System `PATH` +```console +echo 'export PATH=/usr/local/mongosh/bin:$PATH' | sudo tee /etc/profile.d/mongosh.sh +source /etc/profile.d/mongosh.sh +``` + +### Verify MongoDB and mongosh Installation + +Check if MongoDB and mongosh are properly installed on your machine: +```console +mongod --version +mongosh --version +``` +You should see output similar to: +```output +db version v8.0.12 +Build Info: { + "version": "8.0.12", + "gitVersion": "b60fc6875b5fb4b63cc0dbbd8dda0d6d6277921a", + "openSSLVersion": "OpenSSL 3.0.13 30 Jan 2024", + "modules": [], + "allocator": "tcmalloc-google", + "environment": { + "distmod": "ubuntu2404", + "distarch": "aarch64", + "target_arch": "aarch64" + } +} +2.3.8 +``` + +### Connect to MongoDB via mongosh + +You can now start interacting with MongoDB through its shell interface: +```console +mongosh mongodb://127.0.0.1:27017 +``` +You should see output on your terminal similar to: +```output +Current Mongosh Log ID: 68b573411523231d81a00aa0 +Connecting to: mongodb://127.0.0.1:27017/?directConnection=true&serverSelectionTimeoutMS=2000&appName=mongosh+2.3.8 +Using MongoDB: 8.0.12 +Using Mongosh: 2.3.8 +mongosh 2.5.7 is available for download: https://www.mongodb.com/try/download/shell + +For mongosh info see: https://www.mongodb.com/docs/mongodb-shell/ + +------ + The server generated these startup warnings when booting + 2025-09-01T09:45:32.382+00:00: Using the XFS filesystem is strongly recommended with the WiredTiger storage engine. See http://dochub.mongodb.org/core/prodnotes-filesystem + 2025-09-01T09:45:33.012+00:00: Access control is not enabled for the database. Read and write access to data and configuration is unrestricted + 2025-09-01T09:45:33.012+00:00: This server is bound to localhost. Remote systems will be unable to connect to this server. Start the server with --bind_ip
to specify which IP addresses it should serve responses from, or with --bind_ip_all to bind to all interfaces. If this behavior is desired, start the server with --bind_ip 127.0.0.1 to disable this warning + 2025-09-01T09:45:33.012+00:00: Soft rlimits for open file descriptors too low + 2025-09-01T09:45:33.012+00:00: For customers running the current memory allocator, we suggest changing the contents of the following sysfsFile + 2025-09-01T09:45:33.012+00:00: For customers running the current memory allocator, we suggest changing the contents of the following sysfsFile + 2025-09-01T09:45:33.012+00:00: We suggest setting the contents of sysfsFile to 0. + 2025-09-01T09:45:33.012+00:00: Your system has glibc support for rseq built in, which is not yet supported by tcmalloc-google and has critical performance implications. Please set the environment variable GLIBC_TUNABLES=glibc.pthread.rseq=0 +------ +test> +``` + +With this you have verified that the MongoDB installation is complete. You can now proceed with the baseline testing of MongoDB on your Azure Cobalt 100 based VM. diff --git a/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/images/final-vm.png b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/images/final-vm.png new file mode 100644 index 0000000000..5207abfb41 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/images/final-vm.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/images/instance.png b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/images/instance.png new file mode 100644 index 0000000000..285cd764a5 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/images/instance.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/images/instance1.png b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/images/instance1.png new file mode 100644 index 0000000000..b9d22c352d Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/images/instance1.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/images/instance4.png b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/images/instance4.png new file mode 100644 index 0000000000..2a0ff1e3b0 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/images/instance4.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/images/ubuntu-pro.png b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/images/ubuntu-pro.png new file mode 100644 index 0000000000..d54bd75ca6 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/images/ubuntu-pro.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/mongostat.md b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/mongostat.md new file mode 100644 index 0000000000..73fff11963 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/mongostat.md @@ -0,0 +1,94 @@ +--- +title: Monitor MongoDB with mongostat +weight: 8 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Monitoring MongoDB Performance using mongostat +In this section you will monitor MongoDB in real-time using **mongostat** on Arm64 Azure virtual machines. 
You will notice low-latency, stable insert, query, update, and delete operations, with consistent memory usage and network throughput. + +## Monitor with mongostat — Terminal 3 + +With the workload script running on your first terminal, you will now run mongostat on another terminal to view the real-time performance: + +```console +mongostat 2 +``` +**mongostat** gives a one-line summary every 2 seconds of inserts, queries, updates, deletes, memory use and network I/O. It is your quick health-and-throughput dashboard during the test. + +You should see output similar to: +```output +insert query update delete getmore command dirty used flushes vsize res qrw arw net_in net_out conn time + 8 16 8 8 182 1|0 0.0% 0.0% 0 3.54G 146M 0|0 0|0 34.0k 172k 11 Sep 4 04:57:56.761 + 4 8 4 4 98 1|0 0.0% 0.0% 0 3.54G 146M 0|0 0|0 18.3k 116k 11 Sep 4 04:57:58.762 + 9 18 9 9 198 1|0 0.0% 0.0% 0 3.54G 146M 0|0 0|0 36.4k 179k 11 Sep 4 04:58:00.760 + 4 9 4 4 99 1|0 0.0% 0.0% 0 3.54G 146M 0|0 0|0 18.3k 117k 11 Sep 4 04:58:02.760 + 8 17 8 8 202 1|0 0.0% 0.0% 0 3.54G 146M 0|0 0|0 37.0k 183k 11 Sep 4 04:58:04.762 + 4 9 4 4 103 2|0 0.0% 0.0% 0 3.54G 146M 0|0 0|0 19.0k 119k 11 Sep 4 04:58:06.760 + 8 15 7 7 183 1|0 0.0% 0.0% 0 3.54G 146M 0|0 0|0 33.5k 171k 11 Sep 4 04:58:08.761 + 5 11 5 5 126 1|0 0.0% 0.0% 0 3.54G 146M 0|0 0|0 23.1k 135k 11 Sep 4 04:58:10.760 + 6 12 6 6 133 1|0 0.0% 0.0% 0 3.54G 146M 0|0 0|0 24.5k 138k 11 Sep 4 04:58:12.760 + 7 14 7 7 190 1|0 0.0% 0.0% 0 3.54G 146M 0|0 0|0 34.1k 174k 11 Sep 4 04:58:14.761 +insert query update delete getmore command dirty used flushes vsize res qrw arw net_in net_out conn time + 4 9 4 4 108 2|0 0.0% 0.0% 0 3.54G 146M 0|0 0|0 19.6k 123k 11 Sep 4 04:58:16.760 + 9 18 9 9 220 2|0 0.0% 0.0% 0 3.54G 147M 0|0 0|0 39.7k 195k 11 Sep 4 04:58:18.760 + 4 8 4 4 112 0|0 0.0% 0.0% 0 3.54G 147M 0|0 0|0 20.1k 125k 11 Sep 4 04:58:20.762 + 7 15 7 7 179 1|0 0.0% 0.0% 0 3.54G 147M 0|0 0|0 32.4k 169k 11 Sep 4 04:58:22.760 + 5 11 5 5 158 1|0 0.0% 0.0% 0 3.54G 147M 0|0 0|0 28.1k 155k 11 Sep 4 04:58:24.761 + 5 9 4 4 117 2|0 0.0% 0.0% 0 3.54G 147M 0|0 0|0 21.1k 128k 11 Sep 4 04:58:26.761 + 4 8 4 4 117 1|0 0.0% 0.0% 0 3.54G 147M 0|0 0|0 20.7k 127k 6 Sep 4 04:58:28.761 + *0 *0 *0 *0 0 0|0 0.0% 0.0% 0 3.54G 147M 0|0 0|0 98b 53.3k 6 Sep 4 04:58:30.762 + *0 *0 *0 *0 0 1|0 0.0% 0.0% 0 3.54G 147M 0|0 0|0 87b 51.0k 3 Sep 4 04:58:32.761 +``` + +## Explanation of mongostat Metrics + +- **insert** - Number of document insert operations per second. +- **query** - Number of query operations (reads) per second. +- **update** - Number of document update operations per second. +- **delete** - Number of delete operations per second. +- **getmore** - Number of getMore operations per second (used when fetching more results from a cursor). +- **command** - Number of database commands executed per second (e.g., createIndex, count, aggregate). + - command = number of regular commands | number of getLastError (GLE) commands +- **dirty/used** - Percentage of the WiredTiger cache that is dirty (not yet written to disk) and the percentage actively used. +- **flushes** - How many times data has been flushed to disk (per second). +- **vsize** - Virtual memory size of the mongod process. +- **res** - Resident memory size (actual RAM in use). +- **qrw arw** - Queued and active readers/writers: + - `qrw` = queued read | queued write. + - `arw` = active read | active write. +- **net_in/net_out** - Amount of network traffic coming into (net_in) and going out of (net_out) the database per second. 
+- **conn** - Number of active client connections. +- **time** - Timestamp of the sample. + +## Benchmark summary on Arm64 +Here is a summary of benchmark results collected on an Arm64 **D4ps_v6 Ubuntu Pro 24.04 LTS virtual machine**. + +| insert | query | update | delete | getmore | command | dirty | used | flushes | vsize | res | qrw | arw | net_in | net_out | conn | time | +|--------|-------|--------|--------|---------|---------|-------|------|---------|-------|------|------|------|--------|---------|------|----------------------| +| 50 | 0 | 0 | 0 | 0 | 7/0 | 0.0% | 0.0% | 0 | 3.53G | 141M | 0/0 | 0/0 | 10.9k | 57.8k | 10 | Sep 4 04:57:18.761 | +| 404 | 13 | 4 | 4 | 71 | 8/0 | 0.0% | 0.0% | 0 | 3.53G | 143M | 0/0 | 0/0 | 96.3k | 114k | 10 | Sep 4 04:57:20.761 | +| 7 | 14 | 7 | 7 | 108 | 2/0 | 0.0% | 0.0% | 0 | 3.53G | 143M | 0/0 | 0/0 | 21.8k | 118k | 10 | Sep 4 04:57:22.760 | +| 6 | 12 | 6 | 6 | 112 | 0/0 | 0.0% | 0.0% | 0 | 3.53G | 143M | 0/0 | 0/0 | 21.9k | 120k | 10 | Sep 4 04:57:24.760 | +| 8 | 16 | 8 | 8 | 136 | 1/0 | 0.0% | 0.0% | 0 | 3.53G | 144M | 0/0 | 0/0 | 27.1k | 137k | 10 | Sep 4 04:57:26.762 | +| 5 | 10 | 5 | 5 | 93 | 2/0 | 0.0% | 0.0% | 0 | 3.54G | 144M | 0/0 | 0/0 | 18.2k | 111k | 11 | Sep 4 04:57:28.760 | +| 7 | 15 | 7 | 7 | 135 | 0/0 | 0.0% | 0.0% | 0 | 3.54G | 144M | 0/0 | 0/0 | 26.5k | 139k | 11 | Sep 4 04:57:30.761 | +| 5 | 11 | 5 | 5 | 102 | 1/0 | 0.0% | 0.0% | 0 | 3.54G | 144M | 0/0 | 0/0 | 19.7k | 118k | 11 | Sep 4 04:57:32.761 | +| 7 | 16 | 10 | 7 | 138 | 2/0 | 0.0% | 0.0% | 0 | 3.54G | 145M | 0/0 | 0/0 | 27.0k | 143k | 11 | Sep 4 04:57:34.761 | +| 5 | 10 | 5 | 5 | 104 | 1/0 | 0.0% | 0.0% | 0 | 3.54G | 145M | 0/0 | 0/0 | 20.1k | 121k | 11 | Sep 4 04:57:36.761 | + + +### Highlights from Azure Ubuntu Pro 24.04 LTS Arm64 Benchmarking + + +- **Insert, Query, Update, Delete Rates:** Throughput remains consistent, with inserts and queries ranging from **5–50 ops/sec**, while updates and deletes generally track queries. A workload burst is observed with an **insert spike of 404**, highlighting MongoDB’s ability to handle sudden surges. +- **Memory Usage:** Resident memory remains stable at **141–145 MB**, with virtual memory steady at **3.53–3.54 GB**, confirming efficient memory allocation and stability. +- **Network Activity:** Network traffic scales proportionally with workload, with **net_in ranging ~18k–96k** and **net_out ~111k–143k**, showing balanced data flow. +- **Connections:** Active connections hold steady at **10–11**, indicating reliable support for concurrent client sessions without instability. +- **Command Execution & System Load:** Command executions (0–8) stay minimal, with dirty/used at **0.0%** and no flushes recorded, reflecting efficient internal resource handling. +- **Overall System Behavior:** MongoDB demonstrates stable throughput, predictable memory usage, and balanced network performance, while also showcasing resilience under workload bursts on Arm64. + + +You have now successfully benchmarked MongoDB on an Azure Cobalt 100 Arm64 virtual machine. 
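+
+As an optional refinement, you can trim the mongostat output to just the columns you care about. The sketch below uses the Database Tools `-o` option to select named columns; the exact field list is an assumption, so check `mongostat --help` for the names your tools version supports:
+
+```console
+# Print only the op counters, connection count, and timestamp, sampling every 2 seconds
+mongostat -o 'insert,query,update,delete,conn,time' 2
+```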
diff --git a/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/mongotop.md b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/mongotop.md new file mode 100644 index 0000000000..5ce917d298 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/mongodb-on-azure/mongotop.md @@ -0,0 +1,105 @@ +--- +title: Monitor MongoDB with mongotop +weight: 7 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Monitor MongoDB Performance using Mongotop +This guide demonstrates how to monitor MongoDB performance using **mongotop**, showing read and write activity across collections in real time. It includes benchmark results collected on Azure Arm64 virtual machines, providing a reference for expected latencies. + +## Run mongotop — Terminal 2 + +```console +mongotop 2 +``` +**mongotop** shows how much time the server spends reading and writing each collection (refreshes every 2 seconds here). It helps you see which collections are busiest and whether reads or writes dominate. + +The tail end of the output should look like: +```output + + ns total read write 2025-09-04T04:58:23Z +test.admin_system_version_test 5ms 2ms 3ms + test.system_sessions_bench 5ms 2ms 3ms + admin.atlascli 3ms 1ms 1ms + config.system_sessions_bench 3ms 1ms 1ms + test.atlascli 3ms 1ms 1ms + benchmarkDB.cursorTest 2ms 1ms 1ms + benchmarkDB.testCollection 2ms 1ms 1ms + config.transactions_bench 2ms 1ms 1ms + local.system_replset_bench 2ms 1ms 1ms + admin.system.version 0ms 0ms 0ms + + ns total read write 2025-09-04T04:58:25Z + admin.atlascli 5ms 2ms 3ms + config.system_sessions_bench 4ms 1ms 3ms + test.system_sessions_bench 3ms 1ms 1ms + benchmarkDB.cursorTest 2ms 1ms 1ms + benchmarkDB.testCollection 2ms 1ms 1ms + config.transactions_bench 2ms 1ms 1ms + local.system_replset_bench 2ms 1ms 1ms +test.admin_system_version_test 2ms 1ms 1ms + test.atlascli 2ms 1ms 1ms + admin.system.version 0ms 0ms 0ms + + ns total read write 2025-09-04T04:58:27Z +test.admin_system_version_test 6ms 2ms 3ms + benchmarkDB.cursorTest 5ms 2ms 3ms + benchmarkDB.testCollection 5ms 2ms 3ms + config.transactions_bench 5ms 2ms 3ms + local.system_replset_bench 5ms 2ms 3ms + test.atlascli 5ms 2ms 3ms + test.system_sessions_bench 5ms 2ms 3ms + admin.atlascli 3ms 1ms 1ms + config.system_sessions_bench 3ms 2ms 1ms + admin.system.version 0ms 0ms 0ms +``` +## Explanation of Metrics and Namespaces + +**Metrics** + + - **ns (Namespace)** – Identifies the specific database and collection being measured. + - **total** – Total time spent on both read and write operations. + - **read** – Time taken by read operations like queries or fetches. + - **write** – Time taken by write operations like inserts, updates, or deletes. + - **timestamp** – Marks when the metric snapshot was captured. + +**Namespaces** + + - **benchmarkDB.testCollection** – Core benchmark collection with balanced read/write load. + - **admin.atlascli** – Tracks admin-level client activity. + - **benchmarkDB.cursorTest** – Measures cursor operations during benchmarking. + - **config.system_sessions_bench** – Benchmarks session handling in config DB. + - **config.transactions_bench** – Evaluates transaction performance in config DB. + - **local.system_replset_bench** – Tests replication set metadata access. + - **test.admin_system_version_test** – Monitors versioning metadata in test DB. + - **test.atlascli** – Simulates client-side workload in test DB. + - **test.system_sessions_bench** – Benchmarks session handling in test DB. 
+ - **admin.system.version** – Static metadata collection with minimal activity.
+
+## Benchmark summary on Arm64
+For easier visualization, here is a summary of benchmark results collected on an Arm64 **D4ps_v6 Azure Ubuntu Pro 24.04 LTS virtual machine**.
+
+| Namespace (ns) | Total Time Range | Read Time Range | Write Time Range | Notes |
+| :------------------------------- | :--------------- | :-------------- | :--------------- | :------------------------------------------------------------ |
+| **admin.atlascli** | 2–6 ms | 0–2 ms | 1–3 ms | Admin CLI operations. |
+| **benchmarkDB.cursorTest** | 2–5 ms | 0–2 ms | 1–3 ms | Cursor benchmark load. |
+| **benchmarkDB.testCollection** | 2–5 ms | 0–2 ms | 1–3 ms | Main benchmark workload. |
+| **config.system_sessions_bench** | 2–6 ms | 0–2 ms | 1–3 ms | System/benchmark sessions. |
+| **config.transactions_bench** | 2–6 ms | 0–2 ms | 1–3 ms | Internal transaction benchmark. |
+| **local.system_replset_bench** | 2–5 ms | 0–2 ms | 1–3 ms | Local replica set benchmark. |
+| **test.admin_system_version_test** | 2–5 ms | 0–2 ms | 1–3 ms | Version check workload. |
+| **test.atlascli** | 2–5 ms | 0–2 ms | 1–3 ms | CLI/system background operations (test namespace). |
+| **test.system_sessions_bench** | 2–5 ms | 0–2 ms | 1–3 ms | Session benchmark (test namespace). |
+| **admin.system.version** | 0 ms | 0 ms | 0 ms | Appears inactive or responds instantaneously. |
+
+
+
+From the MongoDB performance summary on your Arm-based Azure Cobalt 100 VM, you will notice:
+  - Stable, low-latency behavior across all tested namespaces.
+  - Read operations are near-instant (sub-2 ms), showing efficient query performance.
+  - Write operations remain consistently low, supporting reliable data modifications.
+  - System and transaction overheads are predictable, indicating a well-tuned environment for concurrent/replicated workloads.
+
+**Overall observation:** MongoDB operations on Arm64 are lightweight, with predictable, low-latency reads and writes, confirming efficient performance on Azure Ubuntu Pro 24.04 LTS Arm64 virtual machines.
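+
+If you want to capture these per-collection figures for later analysis rather than reading them in the terminal, mongotop can emit machine-readable output. A minimal sketch, assuming your Database Tools version supports the `--json` and `--rowcount` flags:
+
+```console
+# Collect a single 2-second sample as JSON and save it for post-processing
+mongotop --json --rowcount 1 2 > mongotop_sample.json
+```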
diff --git a/content/learning-paths/servers-and-cloud-computing/mpi/_index.md b/content/learning-paths/servers-and-cloud-computing/mpi/_index.md index 943ce437d9..cebe99bdb6 100644 --- a/content/learning-paths/servers-and-cloud-computing/mpi/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/mpi/_index.md @@ -25,7 +25,6 @@ armips: operatingsystems: - Linux tools_software_languages: - - Coding - Fortran - GCC - Linaro Forge diff --git a/content/learning-paths/servers-and-cloud-computing/multiarch_ollama_on_gke/_index.md b/content/learning-paths/servers-and-cloud-computing/multiarch_ollama_on_gke/_index.md index 7baa78d63b..a7d21651b6 100644 --- a/content/learning-paths/servers-and-cloud-computing/multiarch_ollama_on_gke/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/multiarch_ollama_on_gke/_index.md @@ -36,7 +36,7 @@ operatingsystems: tools_software_languages: - LLM - Ollama - - GenAI + - Generative AI further_reading: - resource: diff --git a/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/1_introduction_rdv3.md b/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/1_introduction_rdv3.md index c36adf0b1a..d97765cf77 100644 --- a/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/1_introduction_rdv3.md +++ b/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/1_introduction_rdv3.md @@ -1,85 +1,78 @@ --- -title: Learn about the Arm RD‑V3 Platform +title: Learn about the Arm RD-V3 Platform weight: 2 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Introduction to the Arm RD‑V3 Platform +## Introduction to the Arm RD-V3 Platform -In this section, you will learn about the Arm [Neoverse CSS V3](https://www.arm.com/products/neoverse-compute-subsystems/css-v3) subsystem and the RD‑V3 [Reference Design Platform Software](https://neoverse-reference-design.docs.arm.com/en/latest/index.html) that implements it. You'll learn how these components enable scalable, server-class system design, and how to simulate and validate the full firmware stack using Fixed Virtual Platforms (FVP), well before hardware is available. +In this section, you will learn about the Arm [Neoverse CSS-V3](https://www.arm.com/products/neoverse-compute-subsystems/css-v3) subsystem and the RD-V3 [Reference Design Platform Software](https://neoverse-reference-design.docs.arm.com/en/latest/index.html) that implements it. You’ll learn how these components enable scalable, server-class system design, and how to simulate and validate the full firmware stack using Fixed Virtual Platforms (FVPs) before hardware is available. -Arm Neoverse is designed to meet the demanding requirements of data center and edge computing, delivering high performance and efficiency. Widely adopted in servers, networking, and edge devices, the Neoverse architecture provides a solid foundation for modern infrastructure. +Arm Neoverse is designed for the demanding requirements of data-center and edge computing, delivering high performance and efficiency. Widely adopted in servers, networking, and edge devices, the Neoverse architecture provides a solid foundation for modern infrastructure. Using Arm Fixed Virtual Platforms (FVPs), you can explore system bring-up, boot flow, and firmware customization well before physical silicon becomes available. 
-This module also introduces the key components involved, from Neoverse V3 cores to secure subsystem controllers, and shows how these elements work together in a fully virtualized system simulation. +This Learning Path also introduces the key components involved, from Neoverse V3 cores to secure subsystem controllers, and shows how these elements work together in a fully virtualized system simulation. -### Neoverse CSS-V3 Platform Overview +## Neoverse CSS-V3 platform overview -[Neoverse CSS-V3](https://www.arm.com/products/neoverse-compute-subsystems/css-v3) (Compute Subsystem Version 3) is the core subsystem architecture underpinning the Arm RD-V3 platform. It is specifically optimized for high-performance server and data center applications, providing a highly integrated solution combining processing cores, memory management, and interconnect technology. +[Neoverse CSS-V3](https://www.arm.com/products/neoverse-compute-subsystems/css-v3) (Compute Subsystem Version 3) is the core subsystem architecture underpinning the Arm RD-V3 platform. It is optimized for high-performance server and data-center applications, providing an integrated solution that combines processing cores, memory management, and interconnect technology. -CSS V3 forms the key building block for specialized computing systems. It reduces design and validation costs for the general-purpose compute subsystem, allowing partners to focus on their specialization and acceleration while reducing risk and accelerating time to deployment. +CSS-V3 forms the key building block for specialized computing systems. It reduces design and validation costs for the general-purpose compute subsystem, allowing partners to focus on specialization and acceleration while reducing risk and time to deployment. -CSS‑V3 is available in configurable subsystems, supporting up to 64 Neoverse V3 cores per die. It also enables integration of high-bandwidth DDR5/LPDDR5 memory (up to 12 channels), PCIe Gen5 or CXL I/O (up to 64 lanes), and high-speed die-to-die links with support for UCIe 1.1 or custom PHYs. Designs can be scaled down to smaller core-count configurations, such as 32-core SoCs, or expanded through multi-die integration. +CSS-V3 is available in configurable subsystems, supporting up to 64 Neoverse V3 cores per die. It also enables integration of high-bandwidth DDR5/LPDDR5 memory (up to 12 channels), PCIe Gen5 or CXL I/O (up to 64 lanes), and high-speed die-to-die links with support for UCIe 1.1 or custom PHYs. Designs can scale down to smaller core-count configurations, such as 32-core SoCs, or expand through multi-die integration. Key features of CSS-V3 include: -* High-performance CPU clusters: Optimized for server workloads and data throughput. +- High-performance CPU clusters optimized for server workloads and data throughput +- Advanced memory management for efficient handling across multiple processing cores +- High-speed, low-latency interconnect within the subsystem -* Advanced memory management: Efficient handling of data across multiple processing cores. +The CSS-V3 subsystem is fully supported by Arm’s Fixed Virtual Platforms (FVPs), enabling pre-silicon testing of these capabilities. -* Interconnect technology: Enabling high-speed, low-latency communication within the subsystem. +## RD-V3 platform introduction -The CSS‑V3 subsystem is fully supported by Arm's Fixed Virtual Platform, enabling pre-silicon testing of these capabilities. 
+The RD-V3 platform is a comprehensive reference design built around Arm’s [Neoverse V3](https://www.arm.com/products/silicon-ip-cpu/neoverse/neoverse-v3) CPUs, along with [Cortex-M55](https://www.arm.com/products/silicon-ip-cpu/cortex-m/cortex-m55) and [Cortex-M7](https://www.arm.com/products/silicon-ip-cpu/cortex-m/cortex-m7) microcontrollers. This platform enables efficient high-performance computing and robust platform management: -### RD‑V3 Platform Introduction +| Component | Description | +|-------------------|--------------------------------------------------------------------------------------------------| +| Neoverse V3 | Primary application processor responsible for executing the OS and payloads | +| Cortex-M7 | Implements the System Control Processor (SCP) for power, clocks, and initialization | +| Cortex-M55 | Hosts the Runtime Security Engine (RSE), providing secure boot and runtime integrity | +| Cortex-M55 (LCP) | Acts as the Local Control Processor, enabling per-core power and reset management for AP cores | -The RD‑V3 platform is a comprehensive reference design built around Arm’s [Neoverse V3](https://www.arm.com/products/silicon-ip-cpu/neoverse/neoverse-v3) CPUs, along with [Cortex-M55](https://www.arm.com/products/silicon-ip-cpu/cortex-m/cortex-m55) and [Cortex-M7](https://www.arm.com/products/silicon-ip-cpu/cortex-m/cortex-m7) microcontrollers. This platform enables efficient high-performance computing and robust platform management: +These subsystems work together in a coordinated architecture, communicating through shared memory regions, control buses, and platform protocols. This enables multi-stage boot processes and robust secure-boot implementations. - -| Component | Description | -|------------------|------------------------------------------------------------------------------------------------| -| Neoverse V3 | The primary application processor responsible for executing OS and payloads | -| Cortex M7 | Implements the System Control Processor (SCP) for power, clocks, and init | -| Cortex M55 | Hosts the Runtime Security Engine (RSE), providing secure boot and runtime integrity | -| Cortex M55 (LCP) | Acts as the Local Control Processor, enabling per-core power and reset management for AP cores | - - -These subsystems work together in a coordinated architecture, communicating through shared memory regions, control buses, and platform protocols. This enables multi-stage boot processes and robust secure boot implementations. - -Here is the Neoverse Reference Design Platform [Software Stack](https://neoverse-reference-design.docs.arm.com/en/latest/about/software_stack.html#sw-stack) for your reference. +Here is the Neoverse Reference Design Platform [software stack](https://neoverse-reference-design.docs.arm.com/en/latest/about/software_stack.html#sw-stack) for reference. ![img1 alt-text#center](rdinfra_sw_stack.jpg "Neoverse Reference Design Software Stack") +## Develop and validate without hardware -### Develop and Validate Without Hardware - -In traditional development workflows, system validation cannot begin until silicon is available, often introducing risk and delay. - -To address this, Arm provides Fixed Virtual Platforms ([FVP](https://developer.arm.com/Tools%20and%20Software/Fixed%20Virtual%20Platforms)), complete simulations model that emulates Arm SoC behavior on a host machine. The CSS‑V3 platform is available in multiple FVP configurations, allowing developers to select the model that best fits their specific development and validation needs. 
+In traditional workflows, system validation often cannot begin until silicon is available, introducing risk and delay. +To address this, Arm provides Fixed Virtual Platforms ([FVPs](https://developer.arm.com/Tools%20and%20Software/Fixed%20Virtual%20Platforms)), a set of simulation models that emulate Arm SoC behavior on a host machine. The CSS-V3 platform is available in multiple FVP configurations, allowing you to select the model that best fits specific development and validation needs. -Key Capabilities of FVP: -* Multi-core CPU simulation with SMP boot -* Multiple UART interfaces for serial debug and monitoring -* Compatible with TF‑A, UEFI, GRUB, and Linux kernel images -* Provides boot logs, trace outputs, and interrupt event visibility for debugging +Key capabilities of FVPs: -FVP enables developers to verify boot sequences, debug firmware handoffs, and even simulate RSE (Runtime Security Engine) behaviors, all pre-silicon. +- Multi-core CPU simulation with SMP boot +- Multiple UART interfaces for serial debug and monitoring +- Compatibility with TF-A, UEFI, GRUB, and Linux kernel images +- Boot logs, trace outputs, and interrupt event visibility for debugging -### Comparing different version of RD-V3 FVP +FVPs enable developers to verify boot sequences, debug firmware handoffs, and even simulate RSE (Runtime Security Engine) behaviors, all pre-silicon. -To support different use cases and levels of platform complexity, Arm offers three virtual models based on the CSS V3 architecture: RD‑V3, RD-V3-Cfg1, and RD‑V3‑R1. While they share a common foundation, they differ in chip count, system topology, and simulation flexibility. +## Compare RD-V3 FVP variants -| Model | Description | Recommended Use Cases | -|-------------|------------------------------------------------------------------|--------------------------------------------------------------------| -| RD‑V3 | Standard single-die platform with full processor and security blocks | Ideal for newcomers, firmware bring-up, and basic validation | -| RD‑V3‑R1 | Dual-die platform simulating chiplet-based architecture | Suitable for multi-node, interconnect, and advanced boot tests | -| CFG1 | Lightweight model with reduced control complexity for fast startup | Best for CI pipelines, unit testing, and quick validations | -| CFG2 | Quad-chip platform with 4×32-core Poseidon-V CPUs connected via CCG links | Designed for advanced multi-chip validation, CML-based coherence, and high-performance platform scaling | +To support different use cases and levels of platform complexity, Arm offers several virtual models based on the CSS-V3 architecture: RD-V3, RD-V3-R1, RD-V3-Cfg1 (CFG1), and RD-V3-Cfg2 (CFG2). While they share a common foundation, they differ in chip count, system topology, and simulation flexibility. 
+| Model | Description | Recommended use cases |
+|----------------|-----------------------------------------------------------------------------|----------------------------------------------------------------------------------|
+| RD-V3 | Standard single-die platform with full processor and security blocks | Ideal for newcomers, firmware bring-up, and basic validation |
+| RD-V3-R1 | Dual-die platform simulating chiplet-based architecture | Suitable for multi-node, interconnect, and advanced boot tests |
+| RD-V3-Cfg1 (CFG1) | Lightweight model with reduced control complexity for fast startup | Best for CI pipelines, unit testing, and quick validations |
+| RD-V3-Cfg2 (CFG2) | Quad-chip platform with 4×32-core Poseidon-V CPUs connected via CCG links | Designed for advanced multi-chip validation, CMN-based coherence, and scaling |
-In this Learning Path you will use RD‑V3 as the primary platform for foundational exercises, guiding you through the process of building the software stack and simulating it on an FVP to verify the boot sequence.
-In later modules, you’ll transition to RD‑V3‑R1 to more advanced system simulation, multi-node bring-up, and firmware coordination across components like MCP and SCP.
+In this Learning Path, you will use RD-V3 as the primary platform for foundational exercises, guiding you through building the software stack and simulating it on an FVP to verify the boot sequence. In later modules, you’ll transition to RD-V3-R1 for more advanced system simulation, multi-node bring-up, and firmware coordination across components like LCP and SCP.
diff --git a/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/2_rdv3_bootseq.md b/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/2_rdv3_bootseq.md
index eff3635e51..63f307520f 100644
--- a/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/2_rdv3_bootseq.md
+++ b/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/2_rdv3_bootseq.md
@@ -1,83 +1,78 @@
---
-title: Understanding the CSS V3 Boot Flow and Firmware Stack
+title: Understand the CSS-V3 boot flow and firmware stack
weight: 3
### FIXED, DO NOT MODIFY
layout: learningpathall
---
-## Firmware Stack Overview and Boot Sequence Coordination
+## Firmware stack overview and boot sequence coordination
-To ensure the platform transitions securely and reliably from power-on to operating system launch, this section introduces the roles and interactions of each firmware component within the RD‑V3 boot process.
-You’ll learn how each component contributes to system initialization and how control is systematically handed off across the boot chain.
+To ensure the platform transitions securely and reliably from power-on to operating system launch, this section introduces the roles and interactions of each firmware component within the RD-V3 boot process. You’ll learn how each component contributes to system initialization and how control is systematically handed off across the boot chain.
+## Booting the system up
-## How the System Boots Up
+In the RD-V3 platform, each firmware component, such as TF-A, RSE, SCP, MCP, LCP, and UEFI, operates independently but participates in a well-defined sequence. Each is delivered as a separate firmware image, yet they coordinate tightly through a structured boot flow and inter-processor signaling.
-In the RD‑V3 platform, each firmware component—such as TF‑A, RSE, SCP, LCP, and UEFI—operates independently but functions together through a well-defined sequence.
-Each component is delivered as a separate firmware image, yet they coordinate tightly through a structured boot flow and inter-processor signaling.
-
-The following diagram from the [Neoverse Reference Design Documentation](https://neoverse-reference-design.docs.arm.com/en/latest/shared/boot_flow/rdv3_single_chip.html?highlight=boot) illustrates the progression of component activation from initial reset to OS handoff:
+The following diagram from the [Neoverse Reference Design documentation](https://neoverse-reference-design.docs.arm.com/en/latest/shared/boot_flow/rdv3_single_chip.html?highlight=boot) illustrates the progression of component activation from initial reset to OS handoff:
![img1 alt-text#center](rdf_single_chip.png "Boot Flow for RD-V3 Single Chip")
-### Stage 1. Security Validation Starts First (RSE)
+## Stage 1: Security validation starts (RSE)
-The first firmware module triggered after BL2 is the Runtime Security Engine (RSE), executing on Cortex‑M55. RSE authenticates all critical firmware components—including SCP, UEFI, and kernel images—using secure boot mechanisms. It performs cryptographic measurements and builds a Root of Trust before allowing any other processors to start.
+After BL2, the Runtime Security Engine (RSE, Cortex-M55) authenticates critical firmware components, including SCP, UEFI, and kernel images, using secure-boot mechanisms. It performs cryptographic measurements and establishes a Root of Trust (RoT) before allowing other processors to start.
***RSE acts as the platform’s security gatekeeper.***
-### Stage 2. Early Hardware Initialization (SCP / MCP)
+## Stage 2: Early hardware initialization (SCP/MCP)
-Once RSE completes verification, the System Control Processor (SCP) and Management Control Processor (MCP) are released from reset.
+Once RSE completes verification, the System Control Processor (SCP, Cortex-M7) and the Management Control Processor (MCP, where present) are released from reset.
-These controllers perform essential platform bring-up:
-* Initialize clocks, reset lines, and power domains
-* Prepare DRAM and interconnect
-* Enable the application cores and signal readiness to TF‑A
+They perform essential bring-up:
+* Initializing clocks, reset lines, and power domains
+* Preparing DRAM and interconnect
+* Enabling the application processor (AP) cores and signaling readiness to TF-A
***SCP/MCP are the ground crew bringing hardware systems online.***
-### Stage 3. Secure Execution Setup (TF‑A)
-
-Once the AP is released, it begins executing Trusted Firmware‑A (TF‑A) at EL3, starting from the reset vector address programmed during boot image layout.
-TF‑A configures the secure world, sets up exception levels, and prepares for handoff to UEFI.
+## Stage 3: Secure execution setup (TF-A)
-***TF‑A is the ignition controller, launching the next stages securely.***
+When the AP is released, it begins executing Trusted Firmware-A (TF-A) at EL3 from the reset vector address programmed during boot-image layout. TF-A configures the secure world, sets up exception levels, and prepares for handoff to UEFI.
-### Stage 4. Firmware and Bootloader (EDK2 / GRUB)
+***TF-A is the ignition controller, launching the next stages securely.***
-TF‑A hands off control to UEFI firmware (EDK2), which performs device discovery and launches GRUB.
+## Stage 4: Firmware and bootloader (EDK II/GRUB)
-Responsibilities:
-* Detect and initialize memory, PCIe, and boot devices
-* Generate ACPI and platform configuration tables
-* Locate and launch GRUB from storage or flash
+TF-A hands off control to UEFI firmware (EDK II), which performs device discovery and launches GRUB.
-***EDK2 and GRUB are like the first- and second-stage rockets launching the payload.***
+Responsibilities here include:
+* Detecting and initializing memory, PCIe, and boot devices
+* Generating ACPI and platform configuration tables
+* Locating and launching GRUB from storage or flash
-### Stage 5. Linux Kernel Boot
+***EDK II and GRUB are like the first- and second-stage rockets launching the payload.***
+## Stage 5: Linux kernel boot
+
GRUB loads the Linux kernel and passes full control to the OS.
-Responsibilities:
-* Initialize device drivers and kernel subsystems
-* Mount the root filesystem
-* Start user-space processes (e.g., BusyBox)
-
-***The Linux kernel is the spacecraft—it takes over and begins its mission.***
+Responsibilities include:
+* Initializing device drivers and kernel subsystems
+* Mounting the root filesystem
+* Starting user-space processes (for example, BusyBox)
-## Firmware Module Responsibilities in Detail
+***The Linux kernel is the spacecraft - it takes over and begins its mission.***
-Now that we’ve examined the high-level boot stages, let’s break down each firmware module’s role in more detail.
+## In detail: firmware module responsibilities
+Now that you’ve examined the high-level boot stages, you can look at each firmware module’s role in more detail.
-Each stage of the boot chain is backed by a dedicated component—either a secure bootloader, platform controller, or operating system manager—working together to ensure a reliable system bring-up.
+Each stage of the boot chain is backed by a dedicated component, such as a secure bootloader, a platform controller, or an OS manager, and they work together to ensure reliable system bring-up.
-### RSE: Runtime Security Engine (Cortex‑M55) (Stage 1: Security Validation)
+### RSE: Runtime Security Engine (Cortex-M55) (Stage 1: Security Validation)
RSE firmware runs on the Cortex‑M55 and plays a critical role in platform attestation and integrity enforcement.
* Authenticates BL2, SCP, and UEFI firmware images (Secure Boot)
-* Records boot-time measurements (e.g., PCRs, ROT)
+* Records boot-time measurements (for example, PCRs, RoT)
* Releases boot authorization only after successful validation
RSE acts as the second layer of the chain of trust, maintaining a monitored and secure environment throughout early boot.
@@ -85,76 +80,62 @@ RSE acts as the second layer of the chain of trust, maintaining a monitored and
### SCP: System Control Processor (Cortex‑M7) (Stage 2: Early Hardware Bring-up)
-SCP firmware runs on the Cortex‑M7 core and performs early hardware initialization and power domain control.
* Initializes clocks, reset controllers, and system interconnect
-* Manages DRAM setup and enables power for the application processor
-* Coordinates boot readiness with RSE via MHU (Message Handling Unit)
-
-SCP is central to bring-up operations and ensures the AP starts in a stable hardware environment.
-
-### TF-A: Trusted Firmware-A (BL1 / BL2) (Stage 3: Secure Execution Setup)
+* Manages DRAM setup and enables power for the AP
+* Coordinates boot readiness with RSE via the Message Handling Unit (MHU)
-TF‑A is the entry point of the boot chain and is responsible for establishing the system’s root of trust.
-* BL1 (Boot Loader Stage 1): Executes from ROM, initializing minimal hardware such as clocks and serial interfaces, and loads BL2.
-* BL2 (Boot Loader Stage 2): Validates and loads SCP, RSE, and UEFI images, setting up secure handover to later stages.
+### TF-A: Trusted Firmware-A (BL1/BL2) (Stage 3)
-TF‑A ensures all downstream components are authenticated and loaded from trusted sources, laying the foundation for a secure boot.
+* **BL1** executes from ROM, initializes minimal hardware (clocks, UART), and loads BL2
+* **BL2** validates and loads SCP, RSE, and UEFI images, setting up secure handover to later stages
+TF-A establishes the system’s chain of trust and ensures downstream components are authenticated and loaded from trusted sources.
-### UEFI / GRUB / Linux Kernel (Stage 4–5: Bootloader and OS Handoff)
+### UEFI, GRUB, and the Linux kernel (Stages 4–5)
-After SCP powers on the application processor, control passes to the main bootloader and operating system:
-* UEFI (EDK2): Provides firmware abstraction, hardware discovery, and ACPI table generation
-* GRUB: Selects and loads the Linux kernel image
-* Linux Kernel: Initializes the OS, drivers, and launches the userland (e.g., BusyBox)
+* **UEFI (EDK II):** firmware abstraction, hardware discovery, ACPI table generation
+* **GRUB:** selects and loads the Linux kernel image
+* **Linux kernel:** initializes the OS, drivers, and launches userland (for example, BusyBox)
-On the FVP, you can observe this process via UART logs, helping validate each stage’s success.
+On the FVP you can see this process through UART logs to validate each stage.
+### LCP: Local Control Processor (optional)
-### LCP: Low Power Controller (Optional Component)
-
-If present in the configuration, LCP handles platform power management at a finer granularity:
+If present, the LCP provides fine-grained platform power management:
* Implements sleep/wake transitions
* Controls per-core power gating
-* Manages transitions to ACPI power states (e.g., S3, S5)
-
-LCP support depends on the FVP model and may be omitted in simplified virtual setups.
-
+* Manages transitions to ACPI power states (for example, S3, S5)
-### Coordination and Handoff Logic
+LCP support depends on the FVP model and can be omitted in simplified setups.
-The RD‑V3 boot sequence follows a multi-stage, dependency-driven handshake model, where each firmware module validates, powers, or authorizes the next.
+## Coordination and handoff logic
-| Stage | Dependency Chain | Description |
-|-------|----------------------|-------------------------------------------------------------------------|
-| 1 | RSE ← BL2 | RSE is loaded and triggered by BL2 to begin security validation |
-| 2 | SCP ← BL2 + RSE | SCP initialization requires both BL2 and authorization from RSE |
-| 3 | AP ← SCP + RSE | The application processor starts only after SCP sets power and RSE permits |
-| 4 | UEFI → GRUB → Linux | UEFI launches GRUB, which loads the kernel and enters the OS |
+The RD-V3 boot sequence follows a multi-stage, dependency-driven handshake model, where each firmware module validates, powers, or authorizes the next.
-This handshake model ensures that no firmware stage proceeds unless its dependencies have securely initialized and authorized the next step.
+| Stage(s) | Dependency chain | Description |
+|------:|----------------------|-------------------------------------------------------------------------------|
+| 1 | RSE ← BL2 | RSE is loaded and triggered by BL2 to begin security validation |
+| 2 | SCP ← BL2 + RSE | SCP initialization requires BL2 and authorization from RSE |
+| 3 | AP ← SCP + RSE | The AP starts only after SCP sets power and RSE permits |
+| 4-5 | UEFI → GRUB → Linux | UEFI launches GRUB, which loads the kernel and enters the OS |
-{{% notice Note %}}
-In the table above, arrows (←) represent **dependency relationships**—the component on the left **depends on** the component(s) on the right to be triggered or authorized.
-For example, `RSE ← BL2` means that RSE is loaded and triggered by BL2;
-`AP ← SCP + RSE` means the application processor can only start after SCP has initialized the hardware and RSE has granted secure boot authorization.
-These arrows do not represent execution order but indicate **which component must be ready for another to begin**.
-{{% /notice %}}
+This handshake ensures no stage proceeds unless its dependencies have securely initialized and authorized the next step.
{{% notice Note %}}
-Once the firmware stack reaches UEFI, it performs hardware discovery and launches GRUB.
-GRUB then selects and boots the Linux kernel. Unlike the previous dependency arrows (←), this is a **direct execution path**—each stage passes control directly to the next.
+In the table, arrows (←) indicate **dependency** - the component on the left depends on the component(s) on the right to be triggered or authorized.
+For example, `RSE ← BL2` means BL2 loads/triggers RSE; `AP ← SCP + RSE` means the AP can start only after SCP has initialized hardware and RSE has granted authorization.
+The right-facing arrows in `UEFI → GRUB → Linux` indicate a **direct execution path**: each stage passes control directly to the next.
{{% /notice %}}
-This layered approach supports modular testing, independent debugging, and early-stage simulation—all essential for secure and robust platform bring-up.
+This layered approach supports modular testing, independent debugging, and early simulation, which is essential for secure and robust platform bring-up.
+## Summary
In this section, you have:
-* Explored the full boot sequence of the RD‑V3 platform, from power-on to Linux login
-* Understood the responsibilities of key firmware components such as TF‑A, RSE, SCP, LCP, and UEFI
-* Learned how secure boot is enforced and how each module hands off control to the next
+* Explored the full boot sequence of the RD-V3 platform, from power-on to Linux login
+* Learned about the responsibilities of TF-A, RSE, SCP, MCP, LCP, and UEFI
+* Learned how secure boot is enforced and how each module hands off control
* Interpreted boot dependencies using FVP simulation and UART logs
-With an understanding of full boot sequence and firmware responsibilities, you're ready to apply these insights.
-In the next section, you'll fetch the RD‑V3 codebase and start building the firmware stack for simulation.
+With an understanding of the full boot sequence and firmware responsibilities, you’re ready to apply these insights. In the next section, you’ll fetch the RD-V3 codebase and start building the firmware stack for simulation.
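+Once you have a simulation running (you will set this up in the following sections), you can tie this handshake model back to concrete evidence in the captured UART logs. The sketch below is one way to do that; it assumes the log directory layout used by the run scripts later in this Learning Path, and the exact file names vary by platform and run:
+
+```bash
+# TF-A prints stage banners such as "BL1:", "BL2:", and "BL31:" on the AP console,
+# so searching the captured UART logs shows the handoff order described above.
+grep -riE "BL1:|BL2:|BL31:|GRUB" ~/rdv3/model-scripts/rdinfra/platforms/rdv3/rdv3/
+```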
diff --git a/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/3_rdv3_sw_build.md b/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/3_rdv3_sw_build.md
index 75a0c1de08..ef11a13325 100644
--- a/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/3_rdv3_sw_build.md
+++ b/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/3_rdv3_sw_build.md
@@ -1,36 +1,47 @@
---
-title: Build the RD‑V3 Reference Platform Software Stack
+title: Build the RD-V3 Reference Platform Software Stack
weight: 4
### FIXED, DO NOT MODIFY
layout: learningpathall
---
-## Building the RD‑V3 Reference Platform Software Stack
-In this module, you’ll set up your development environment on any Arm-based server and build the firmware stack required to simulate the RD‑V3 platform. This Learning Path was tested on an AWS `m7g.4xlarge` Arm-based instance running Ubuntu 22.04
+## Building the RD-V3 Reference Platform Software Stack
+In this module, you’ll set up your development environment on any Arm-based server and build the firmware stack required to simulate the RD-V3 platform. This Learning Path was tested on an AWS `m7g.4xlarge` Arm-based instance running Ubuntu 22.04.
-### Step 1: Prepare the Development Environment
+## Step 1: Set up your development environment
-First, ensure your system is up-to-date and install the required tools and libraries:
+First, check that your system is current and install the required dependencies:
```bash
sudo apt update
-sudo apt install curl git
+sudo apt install -y curl git
```
-Configure git as follows.
+Configure git (optional):
```bash
git config --global user.name ""
git config --global user.email ""
```
-### Step 2: Fetch the Source Code
+## Step 2: Fetch the source code
-The RD‑V3 platform firmware stack consists of many independent components—such as TF‑A, SCP, RSE, UEFI, Linux kernel, and Buildroot. Each component is maintained in a separate Git repository. To manage and synchronize these repositories efficiently, we use the `repo` tool. It simplifies syncing the full platform software stack from multiple upstreams.
+The RD‑V3 platform firmware stack consists of multiple components, most maintained in separate Git repositories, such as:
-If repo is not installed, you can download it manually:
+- TF‑A
+- SCP/MCP
+- RSE (TF-M)
+- UEFI (EDK II)
+- Linux kernel
+- Buildroot
+- kvmtool (lkvm)
+- RMM (optional)
+
+Use the `repo` tool with the RD-V3 manifest to sync these sources consistently from multiple upstreams (typically pinned to a release tag).
+
+If `repo` is not installed, you can download it and add it to your `PATH`:
```bash
mkdir -p ~/.bin
@@ -39,11 +50,9 @@ curl https://storage.googleapis.com/git-repo-downloads/repo > ~/.bin/repo
chmod a+rx ~/.bin/repo
```
-Once ready, create a workspace and initialize the repo manifest:
+Once ready, create a workspace and initialize the repo manifest. This Learning Path uses a pinned manifest to ensure reproducibility across different environments. This locks all component repositories to known-good commits that are validated and aligned with a specific FVP version.
-We use a pinned manifest to ensure reproducibility across different environments. This locks all component repositories to known-good commits that are validated and aligned with a specific FVP version.
+For this session, use `pinned-rdv3.xml` and `RD-INFRA-2025.07.03`:
```bash
cd ~
@@ -63,19 +72,18 @@ Syncing: 100% (83/83) 2:52 | 1 job | 0:01 platsw/edk2-platforms @ uefi/edk2/edk2
```
{{% notice Note %}}
-As of the time of writing, the latest official release tag is RD-INFRA-2025.07.03.
-Please note that newer tags may be available as future platform updates are published.
+As of the time of writing, the latest release tag is `RD-INFRA-2025.07.03`. Newer tags might be available in future updates.
{{% /notice %}}
-This manifest will fetch all required sources including:
+This manifest fetches the required sources, including:
* TF‑A
* SCP / RSE firmware
-* EDK2 (UEFI)
+* EDK II (UEFI)
* Linux kernel
* Buildroot and platform scripts
-### Step 3: Build the Docker Image
+## Step 3: Build the Docker image
There are two supported methods for building the reference firmware stack: **host-based** and **container-based**.
@@ -84,7 +92,7 @@ There are two supported methods for building the reference firmware stack: **hos
In this Learning Path, you will use the **container-based** approach.
-The container image is designed to use the source directory from the host (`~/rdv3`) and perform the build process inside the container. Make sure Docker is installed on your Linux machine. You can follow this [installation guide](https://learn.arm.com/install-guides/docker/).
+The container image uses your host source directory (`~/rdv3`) and performs the build inside Docker. Ensure Docker is installed on your machine. You can follow this [installation guide](https://learn.arm.com/install-guides/docker/).
After Docker is installed, you’re ready to build the container image.
@@ -104,7 +112,9 @@ To build the container image:
```bash
./container.sh build
```
-The build procedure may take a few minutes, depending on network bandwidth and CPU performance. This Learning Path was tested on an AWS `m7g.4xlarge` instance, and the build took 250 seconds. The output from the build looks like:
+The build procedure can take a few minutes, depending on network bandwidth and CPU performance. This Learning Path was tested on an AWS `m7g.4xlarge` instance, and the build took 250 seconds.
+
+Expected output:
```output
Building docker image: rdinfra-builder ...
@@ -142,7 +152,7 @@ Building docker image: rdinfra-builder ...
=> => naming to docker.io/library/rdinfra-builder 0.0s
```
-Verify the docker image build completed successfully:
+Verify the image:
```bash
docker images
@@ -155,14 +165,13 @@ REPOSITORY TAG IMAGE ID CREATED SIZE
rdinfra-builder latest 3a395c5a0b60 4 minutes ago 8.12GB
```
-To quickly test the Docker image you just built, run the following command to enter the docker container interactively:
+Quick interactive test:
```bash
./container.sh -v ~/rdv3 run
```
-This script mounts your source directory (~/rdv3) into the container and opens a shell session at that location.
-Inside the container, you should see a prompt like this:
+This script mounts your source directory (`~/rdv3`) into the container and opens a shell session at that location. Inside the container, you should see a prompt like this:
```output
Running docker image: rdinfra-builder ...
@@ -172,18 +181,17 @@ See "man sudo_root" for details.
your-username:hostname:/home/your-username/rdv3$
```
-You can explore the container environment if you wish, then type exit to return to the host system.
+You can explore the container environment if you wish, then type `exit` to return to the host.
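+If you want an extra sanity check that the image targets your host architecture before using it, you can inspect the image metadata with the standard Docker CLI (`rdinfra-builder` is the image name created above):
+
+```bash
+# Print the architecture the image was built for
+docker image inspect rdinfra-builder --format '{{.Architecture}}'
+```
+
+On an Arm-based instance such as `m7g.4xlarge`, this prints `arm64`.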
-### Step 4: Build Firmware
+## Step 4: Build firmware
-Building the full firmware stack involves compiling several components and preparing them for simulation. Rather than running each step manually, you can use a single Docker command to automate the build and package phases.
+Building the full firmware stack involves compiling several components and packaging them for simulation. The following command runs build and then package inside the Docker image:
-- **build**: This phase compiles all individual components of the firmware stack, including TF‑A, SCP, RSE, UEFI, Linux kernel, and rootfs.
+- **build** compiles all individual components of the firmware stack, including TF‑A, SCP, RSE, UEFI, Linux kernel, and rootfs
+- **package** consolidates outputs into simulation-ready artifacts for FVP
-- **package**: This phase consolidates the build outputs into simulation-ready formats and organizes boot artifacts for FVP.
-
-Ensure you’re back in the host OS, then run the following command:
+Ensure you’re back in the host OS, then run:
```bash
cd ~/rdv3
@@ -201,13 +209,13 @@ docker run --rm \
The build artifacts will be placed under `~/rdv3/output/rdv3/rdv3/`, where the last `rdv3` in the directory path corresponds to the selected platform name.
-After a successful build, inspect the artifacts generated under `~/rdv3/output/rdv3/rdv3/`
+Inspect the artifacts:
```bash
ls ~/rdv3/output/rdv3/rdv3 -al
```
-The directory contents should look like:
+Expected output:
```output
total 7092
drwxr-xr-x 2 ubuntu ubuntu 4096 Aug 12 13:15 .
@@ -229,7 +237,7 @@ lrwxrwxrwx 1 ubuntu ubuntu 48 Aug 12 13:15 tf_m_vm0_0.bin -> ../components/
lrwxrwxrwx 1 ubuntu ubuntu 48 Aug 12 13:15 tf_m_vm1_0.bin -> ../components/arm/rse/neoverse_rd/rdv3/vm1_0.bin
lrwxrwxrwx 1 ubuntu ubuntu 33 Aug 12 13:15 uefi.bin -> ../components/css-common/uefi.bin
```
-Here's a reference of what each file refers to:
+Reference mapping:
| Component | Output Files | Description |
|----------------------|----------------------------------------------|-----------------------------|
@@ -240,9 +248,9 @@ Here's a reference of what each file refers to:
| Initrd | `rootfs.cpio.gz` | Minimal filesystem |
-### Optional: Run the Build Manually from Inside the Container
+## Optional: run the build manually from inside the container
-You can also perform the build manually after entering the container:
+You can also build from within an interactive container session (useful for debugging or partial builds):
Start your docker container. In your running container shell:
```bash
@@ -251,7 +259,4 @@ cd ~/rdv3
./build-scripts/rdinfra/build-test-buildroot.sh -p rdv3 package
```
-This manual workflow is useful for debugging, partial builds, or making custom modifications to individual components.
-
-
-You’ve now successfully prepared and built the full RD‑V3 firmware stack. In the next section, you’ll install the appropriate FVP and simulate the full boot sequence, bringing the firmware to life on a virtual platform.
+You’ve now prepared and built the full RD-V3 firmware stack. In the next section, you’ll install the appropriate FVP and simulate the full boot sequence, bringing the firmware to life on a virtual platform.
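+As a final optional check before moving on, you can confirm that the symlinked artifacts resolve to real files. Most entries in the output directory are symlinks into `../components/`, so a quick dereferencing listing catches a component that failed to build or package (the path is the output directory shown above):
+
+```bash
+# -L dereferences symlinks; a "No such file or directory" error here
+# usually means the corresponding component did not build or package.
+ls -lL ~/rdv3/output/rdv3/rdv3/
+```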
diff --git a/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/4_rdv3_on_fvp.md b/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/4_rdv3_on_fvp.md
index d773322a21..e5fe961b28 100644
--- a/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/4_rdv3_on_fvp.md
+++ b/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/4_rdv3_on_fvp.md
@@ -1,47 +1,43 @@
---
-title: Simulate RD‑V3 Boot Flow on Arm FVP
+title: Simulate RD-V3 Boot Flow on Arm FVP
weight: 5
### FIXED, DO NOT MODIFY
layout: learningpathall
---
-## Simulating RD‑V3 with an Arm FVP
+## Simulating RD-V3 with an Arm FVP
-In the previous section, you built the complete CSS‑V3 firmware stack.
-Now, you’ll use Arm Fixed Virtual Platform (FVP) to simulate the system, allowing you to verify the boot sequence without any physical silicon.
-This simulation brings up the full stack from BL1 to Linux shell using Buildroot.
+In the previous section, you built the complete CSS-V3 firmware stack. Now you’ll use an Arm Fixed Virtual Platform (FVP) to simulate the system, allowing you to verify the boot sequence without any physical silicon. This simulation brings up the full stack from BL1 to a Linux shell using Buildroot.
-### Step 1: Download and Install the FVP Model
+## Step 1: Download and install the FVP model
-Before downloading the RD‑V3 FVP, it’s important to understand that each reference design release tag corresponds to a specific version of the FVP model.
+Each reference design release tag corresponds to a specific FVP model version.
+For example, the **RD-INFRA-2025.07.03** tag is designed to work with **FVP version 11.29.35**.
-For example, the **RD‑INFRA‑2025.07.03** release tag is designed to work with **FVP version 11.29.35**.
+See the [RD-V3 Release Tags](https://neoverse-reference-design.docs.arm.com/en/latest/platforms/rdv3.html#release-tags) for a full list of release tags, corresponding FVP versions, and their associated release notes, which summarize changes and validated test cases.
-You can refer to the [RD-V3 Release Tags](https://neoverse-reference-design.docs.arm.com/en/latest/platforms/rdv3.html#release-tags) for a full list of release tags, corresponding FVP versions, and their associated release notes, which summarize changes and validated test cases.
-
-Download the matching FVP binary for your selected release tag using the link provided:
+Download and install the matching FVP:
```bash
mkdir -p ~/fvp
cd ~/fvp
wget https://developer.arm.com/-/cdn-downloads/permalink/FVPs-Neoverse-Infrastructure/RD-V3/FVP_RD_V3_11.29_35_Linux64_armv8l.tgz
-
tar -xvf FVP_RD_V3_11.29_35_Linux64_armv8l.tgz
./FVP_RD_V3.sh
```
-The FVP installation may prompt you with a few questions,choosing the default options is sufficient for this learning path. By default, the FVP will be installed in `/home/ubuntu/FVP_RD_V3`.
+The FVP installation might prompt you with a few questions; choose the default settings. By default, the FVP installs under `/home/ubuntu/FVP_RD_V3`.
-### Step 2: Remote Desktop Set Up
+## Step 2: Set up remote desktop
-The RD‑V3 FVP model launches multiple UART consoles—each mapped to a separate terminal window for different subsystems (e.g., Neoverse V3, Cortex‑M55, Cortex‑M7, panel).
+The RD‑V3 FVP model launches multiple UART consoles. Each console is mapped to a separate terminal window for different subsystems (for example, Neoverse V3, Cortex‑M55, Cortex‑M7, panel).
If you’re accessing the platform over SSH, these UART consoles can still be displayed, but network latency and graphical forwarding can severely degrade performance.
-To interact with different UARTs more efficiently, it is recommend to install a remote desktop environment using `XRDP`. This provides a smoother user experience when dealing with multiple terminal windows and system interactions.
+To interact with different UARTs more efficiently, install a remote desktop environment using `XRDP`. This provides a smoother user experience when dealing with multiple terminal windows and system interactions.
-You will need to install the required packages:
+Install required packages and enable XRDP:
```bash
@@ -51,45 +47,48 @@ sudo systemctl enable --now xrdp
```
To allow remote desktop connections, you need to open port 3389 (RDP) in your AWS EC2 security group:
+
- Go to the EC2 Dashboard → Security Groups
-- Select the security group associated with your instance
-- Under the Inbound rules tab, click Edit inbound rules
+- Select your instance’s group → **Inbound rules** → **Edit inbound rules**
- Add the following rule:
-   - Type: RDP
-   - Port: 3389
-   - Source: your local machine IP
+  - **Type**: RDP
+  - **Port**: 3389
+  - **Source**: your local machine IP
For better security, limit the source to your current public IP instead of 0.0.0.0/0.
-***Switch to Xorg (required on Ubuntu 22.04):***
+## Switch to Xorg (required on Ubuntu 22.04)
Wayland is the default display server on Ubuntu 22.04, but it is not compatible with XRDP.
-To enable XRDP remote sessions, you need to switch to Xorg by modifying the GDM configuration.
+To enable XRDP remote sessions, you must switch to Xorg by modifying the GDM configuration:
+
+Open `/etc/gdm3/custom.conf` in a text editor.
-Open the `/etc/gdm3/custom.conf` in a text editor.
Find the line:
```output
#WaylandEnable=false
```
-Uncomment it by removing the # so it becomes:
+Uncomment it:
```output
WaylandEnable=false
```
-Then restart the GDM display manager for the change to take effect:
+Restart the GDM display manager:
+
```bash
sudo systemctl restart gdm3
```
-After reboot, XRDP will use Xorg and you should be able to connect to the Arm server via Remote Desktop.
+After the restart, XRDP sessions will use Xorg and you can connect to the Arm server using a remote desktop client.
-### Step 3: Launch the Simulation
+## Step 3: Launch the simulation
-Once connected via Remote Desktop, open a terminal and launch the RD‑V3 FVP simulation:
+Once connected using a remote desktop, open a terminal and launch the RD‑V3 FVP simulation:
```bash
cd ~/rdv3/model-scripts/rdinfra
@@ -97,26 +96,27 @@ export MODEL=/home/ubuntu/FVP_RD_V3/models/Linux64_armv8l_GCC-9.3/FVP_RD_V3
./boot-buildroot.sh -p rdv3 &
```
-The command will launch the simulation and open multiple xterm windows, each corresponding to a different CPU.
-You can start by locating the ***terminal_ns_uart0*** window — in it, you should see the GRUB menu.
+The command launches the simulation and opens multiple xterm windows, each corresponding to a different CPU.
-From there, select RD-V3 Buildroot in the GRUB menu and press Enter to proceed.
+Start by locating the ***terminal_ns_uart0*** window. In it, you should see the GRUB menu.
+
+Select **RD-V3 Buildroot** in the GRUB menu and press **Enter** to proceed.
![img3 alt-text#center](rdv3_sim_run.jpg "GRUB Menu")
-Booting Buildroot will take a little while — you’ll see typical Linux boot messages scrolling through.
+Booting Buildroot takes a short while as Linux messages scroll by.
+
Eventually, the system will stop at the `Welcome to Buildroot` message on the ***terminal_ns_uart0*** window.
-At the `buildroot login:` prompt, type `root` and press Enter to log in.
-![img4 alt-text#center](rdv3_sim_login.jpg "Buildroot login")
+Log in at the `buildroot login:` prompt with user `root`.
-Congratulations — you’ve successfully simulated the boot process of the RD-V3 software you compiled earlier, all on FVP!
+![img4 alt-text#center](rdv3_sim_login.jpg "Buildroot login")
-### Step 4: Understand the UART Outputs
+Congratulations - you’ve now successfully simulated the boot of the RD-V3 software you built earlier, all on FVP!
-When you launch the RD‑V3 FVP model, it opens multiple terminal windows—each connected to a different UART channel.
-These UARTs provide console logs from various firmware components across the system.
+## Step 4: Understand the UART outputs
-Below is the UART-to-terminal mapping based on the default FVP configuration:
+The RD-V3 FVP opens multiple terminals, each connected to a different UART that carries logs from specific firmware components.
+The UART-to-terminal mapping for the default FVP configuration is:
| Terminal Window Title | UART | Output Role | Connected Processor |
|----------------------------|------|------------------------------------|-----------------------|
diff --git a/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/5_rdv3_modify.md b/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/5_rdv3_modify.md
index ca1d9d1bb6..cce9ab4d05 100644
--- a/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/5_rdv3_modify.md
+++ b/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/5_rdv3_modify.md
@@ -6,37 +6,37 @@ weight: 6
layout: learningpathall
---
-## Build and Run RDV3-R1 Dual Chip Platform
+## The RD-V3-R1 dual-chip platform
-The RD‑V3‑R1 platform is a dual-chip simulation environment built to model multi-die Arm server SoCs. It expands on the single-die RD‑V3 design by introducing a second application processor and a Management Control Processor (MCP).
+The RD-V3-R1 platform is a dual-chip simulation environment built to model multi-die Arm server SoCs. It expands on the single-die RD-V3 design by introducing a second application processor and a Management Control Processor (MCP).
-***Key Use Cases*** +Key use cases of RD-V3-R1 are: -- Simulate chiplet-style boot flow with two APs -- Observe coordination between SCP and MCP across dies -- Test secure boot in a distributed firmware environment +- Simulating a chiplet-style boot flow with two APs +- Observing coordination between SCP and MCP across dies +- Testing secure boot in a distributed firmware environment -***Differences from RD‑V3*** -- Dual AP boot flow instead of single AP -- Adds MCP (Cortex‑M7) to support cross-die management +Key differences from RD-V3 are: + +- Dual AP boot flow instead of a single AP +- MCP (Cortex-M7) to support cross-die management - More complex power/reset coordination -### Step 1: Clone the RD‑V3‑R1 Firmware Stack +## Step 1: Clone the RD-V3-R1 firmware stack -Initialize and sync the codebase for RD‑V3‑R1: +Initialize and sync the codebase for RD-V3-R1: ```bash cd ~ mkdir rdv3r1 cd rdv3r1 repo init -u https://git.gitlab.arm.com/infra-solutions/reference-design/infra-refdesign-manifests.git -m pinned-rdv3r1.xml -b refs/tags/RD-INFRA-2025.07.03 --depth=1 -repo sync -c -j $(nproc) --fetch-submodules --force-sync --no-clone-bundle +repo sync -c -j "$(nproc)" --fetch-submodules --force-sync --no-clone-bundle ``` -### Step 2: Install RD-V3-R1 FVP +## Step 2: Install the RD-V3-R1 FVP -Refer to the [RD-V3-R1 Release Tags](https://neoverse-reference-design.docs.arm.com/en/latest/platforms/rdv3.html#release-tags) to determine which FVP model version matches your selected release tag. -Then download and install the corresponding FVP binary. +Refer to the [RD-V3-R1 Release Tags](https://neoverse-reference-design.docs.arm.com/en/latest/platforms/rdv3.html#release-tags) to pick the FVP version that matches your tag, then download and install it: ```bash mkdir -p ~/fvp @@ -46,11 +46,11 @@ tar -xvf FVP_RD_V3_R1_11.29_35_Linux64_armv8l.tgz ./FVP_RD_V3_R1.sh ``` -### Step 3: Build the Firmware +## Step 3: Build the firmware -Since you have already created the Docker image for firmware building in a previous section, there is no need to rebuild it for RD‑V3‑R1. +If you built the Docker image earlier, you can reuse it for RD-V3-R1. -Run the full firmware build and packaging process: +Run the full build and package flow: ```bash cd ~/rdv3r1 @@ -66,31 +66,28 @@ docker run --rm \ ./build-scripts/rdinfra/build-test-buildroot.sh -p rdv3r1 package" ``` -### Step 4: Launch the Simulation +## Step 4: Launch the simulation -Once connected via Remote Desktop, open a terminal and launch the RD‑V3‑R1 FVP simulation: +From a desktop session on the build host, start the RD-V3-R1 FVP: ```bash cd ~/rdv3r1/model-scripts/rdinfra -export MODEL=/home/ubuntu/FVP_RD_V3_R1/models/Linux64_armv8l_GCC-9.3/FVP_RD_V3_R1_R1 +export MODEL="$HOME/FVP_RD_V3_R1/models/Linux64_armv8l_GCC-9.3/FVP_RD_V3_R1" # adjust if your path/toolchain differs ./boot-buildroot.sh -p rdv3r1 & ``` -This command starts the dual-chip simulation. -You’ll observe additional UART consoles for components like the MCP, and you can verify that both application processors (AP0 and AP1) are brought up in a coordinated manner. - -![img5 alt-text#center](rdv3r1_sim_login.jpg "RDV3 R1 buildroot login") +This starts the dual-chip simulation. You’ll see additional UART consoles (for example, MCP) and can verify both application processors (AP0 and AP1) boot in a coordinated manner. -Similar to the previous session, the terminal logs are stored in `~/rdv3r1/model-scripts/rdinfra/platforms/rdv3r1/rdv3r1`. 
+![img5 alt-text#center](rdv3r1_sim_login.jpg "RD-V3-R1 Buildroot login")
+As before, the terminal logs are stored under `~/rdv3r1/model-scripts/rdinfra/platforms/rdv3r1/rdv3r1`.
-### Step 5: Customize Firmware and Confirm MCP Execution
-To wrap up this learning path, let’s verify that your firmware changes can be compiled and simulated successfully within the RD‑V3‑R1 environment.
+## Step 5: Customize firmware and confirm MCP execution
-Edit the MCP source file `~/rdv3r1/host/scp/framework/src/fwk_module.c`
+To validate a firmware change in the RD-V3-R1 environment, edit the MCP source file `~/rdv3r1/host/scp/framework/src/fwk_module.c`.
-Locate the function `fwk_module_start()`. Add the following logging line just before `return FWK_SUCCESS;`:
+Locate the function `fwk_module_start()` and add the following logging line just before `return FWK_SUCCESS;`:
```c
int fwk_module_start(void)
@@ -120,13 +117,11 @@ docker run --rm \
./build-scripts/rdinfra/build-test-buildroot.sh -p rdv3r1 package"
```
-Launch the FVP simulation again and observe the UART output for MCP.
+Launch the FVP simulation again and check the MCP UART output.
![img6 alt-text#center](rdv3r1_sim_codechange.jpg "RDV3 R1 modify firmware")
-If the change was successful, your custom log line will appear in the MCP console—confirming that your code was integrated and executed as part of the firmware boot process.
-
-You’ve now successfully simulated a dual-chip Arm server platform using RD‑V3‑R1 on FVP—from cloning firmware sources to modifying secure control logic.
+If the change was successful, your custom log line will appear in the MCP console - confirming that your code was integrated and executed as part of the firmware boot process.
+You’ve now successfully simulated a dual-chip Arm server platform using RD‑V3‑R1 on FVP and validated a firmware change end-to-end, setting you up for deeper customization (for example, BMC integration) in future development cycles.
-This foundation sets the stage for deeper exploration, such as customizing platform firmware or integrating BMC workflows in future development cycles.
learning_objectives: - - Understand the architecture of Arm Neoverse CSS‑V3 as the foundation for scalable server-class platforms - - Build and boot the RD‑V3 firmware stack using TF‑A, SCP, RSE, and UEFI - - Simulate multi-core, multi-chip systems with Arm FVP models and interpret boot logs - - Modify platform control firmware to test custom logic and validate it via pre-silicon simulation - + - Explain the CSS-V3 architecture and the RD-V3 firmware boot sequence (TF-A, RSE, SCP/MCP/LCP, UEFI/GRUB, Linux) + - Set up a containerized build environment and sync sources with a pinned manifest using repo + - Build and boot the RD-V3 firmware stack on FVP and map UART consoles to components + - Interpret boot logs to verify bring-up and diagnose boot-stage issues + - Modify platform control firmware (for example, SCP/MCP) and validate changes via pre-silicon simulation + - Launch a dual-chip RD-V3-R1 simulation and verify AP/MCP coordination + prerequisites: - - Access to an Arm Neoverse-based Linux machine (cloud or local), with at least 80 GB of storage + - Access to an Arm Neoverse-based Linux machine (cloud or local) with at least 80 GB of free storage - Familiarity with Linux command-line tools and basic scripting - Understanding of firmware boot stages and SoC-level architecture - - Docker installed, or GitHub Codespaces-compatible development environment + - Docker installed, or a GitHub Codespaces-compatible development environment author: - Odin Shen diff --git a/content/learning-paths/servers-and-cloud-computing/nginx/_index.md b/content/learning-paths/servers-and-cloud-computing/nginx/_index.md index 1dacc52b05..2cd79f0d39 100644 --- a/content/learning-paths/servers-and-cloud-computing/nginx/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/nginx/_index.md @@ -23,7 +23,7 @@ subjects: Web armips: - Neoverse tools_software_languages: - - Nginx + - NGINX operatingsystems: - Linux diff --git a/content/learning-paths/servers-and-cloud-computing/nginx_tune/_index.md b/content/learning-paths/servers-and-cloud-computing/nginx_tune/_index.md index 2c65a7d8ba..18b93a4195 100644 --- a/content/learning-paths/servers-and-cloud-computing/nginx_tune/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/nginx_tune/_index.md @@ -24,7 +24,7 @@ subjects: Web armips: - Neoverse tools_software_languages: - - Nginx + - NGINX - Runbook operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/1_introduction_openbmc.md b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/1_introduction_openbmc.md new file mode 100644 index 0000000000..2a67263998 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/1_introduction_openbmc.md @@ -0,0 +1,70 @@ +--- +title: Introduction to OpenBMC and UEFI +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Introduction to OpenBMC and UEFI + +This section explains the roles of OpenBMC and UEFI in the Arm server boot flow, and highlights why simulating their integration is essential for early-stage development. + +### OpenBMC + +[OpenBMC](https://www.openbmc.org/) is a collaborative open-source firmware stack for Baseboard Management Controllers (BMC), hosted by the Linux Foundation. +BMCs are embedded microcontrollers on server motherboards that enable both in-band and out-of-band system management. 
+Out-of-band access allows remote management even when the host system is powered off or unresponsive, while in-band interfaces support communication with the host operating system during normal operation.
+
+The OpenBMC stack is built using the Yocto Project and includes a Linux kernel, system services, D-Bus interfaces, and support for industry-standard APIs such as Redfish and IPMI. It provides features like hardware monitoring, fan control, power sequencing, sensor telemetry, event logging, BIOS configuration, and more.
+
+Its architecture is modular by design: each board or platform can define its own layers and packages through Yocto recipes, enabling custom extensions to firmware functionality without modifying upstream code.
+
+It is widely adopted by hyperscalers and enterprise vendors to manage servers, storage systems, and network appliances.
+OpenBMC is particularly well-suited to Arm-based server platforms like **[Neoverse RD-V3](https://neoverse-reference-design.docs.arm.com/en/latest/platforms/rdv3.html)**, where it provides early-stage platform control and boot orchestration even before silicon is available.
+
+**Key features of OpenBMC include:**
+- **Remote management:** power control, Serial over LAN (SOL), and virtual media
+- **Hardware health monitoring:** sensors, fans, temperature, voltage, and power rails
+- **Firmware update mechanisms:** support for signed image updates and secure boot
+- **Industry-standard APIs:** IPMI, Redfish, PLDM, and MCTP
+- **Modular and extensible design:** device tree-based configuration and layered architecture
+
+OpenBMC enables faster development cycles, open innovation, and reduced vendor lock-in across data centers, cloud platforms, and edge environments.
+
+In this Learning Path, you’ll simulate how OpenBMC manages the early-stage boot process, power sequencing, and remote access for a virtual Neoverse RD-V3 server. You will interact with the BMC console, inspect boot logs, and verify serial-over-LAN and UART communication with the host.
+
+### UEFI
+
+The [Unified Extensible Firmware Interface (UEFI)](https://uefi.org/) is the modern replacement for legacy BIOS, responsible for initializing hardware and loading the operating system.
+UEFI provides a robust, modular, and extensible interface between platform firmware and OS loaders. It supports:
+
+- A modular and extensible architecture
+- Faster boot times and reliable system initialization
+- Large storage device support using GPT (GUID Partition Table)
+- Secure Boot for verifying boot integrity
+- Pre-boot networking and diagnostics via UEFI Shell or applications
+
+UEFI executes after the platform powers on and before the OS kernel takes over.
+It discovers and initializes system hardware, configures memory and I/O, and launches the bootloader.
+It is governed by the UEFI Forum and is now the standard firmware interface across server-class, desktop, and embedded systems.
+
+In platforms that integrate OpenBMC, the BMC operates independently from the host CPU and manages platform power, telemetry, and recovery.
+During system boot, UEFI and OpenBMC coordinate via mechanisms such as IPMI over KCS, PLDM over MCTP, or shared memory buffers.
+
+These interactions are especially critical in Arm server-class platforms, like Neoverse RD-V3, for secure boot, remote diagnostics, and system recovery during pre-silicon or bring-up phases.
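+Later in this Learning Path you will exercise some of these interactions from the BMC side. As a preview, the sketch below shows how a BMC that exposes IPMI over LAN is typically queried with the standard `ipmitool` client; the address, port, and credentials here are placeholders, not values from this setup:
+
+```bash
+# Query the host power state through the BMC (placeholder host and credentials)
+ipmitool -I lanplus -H 127.0.0.1 -p 623 -U admin -P admin chassis power status
+
+# Attach to the host serial console over Serial-over-LAN (SOL)
+ipmitool -I lanplus -H 127.0.0.1 -p 623 -U admin -P admin sol activate
+```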
+### Key Interactions Between OpenBMC and UEFI
+
+| **Interaction** | **Direction** | **Description** |
+|---------------------------|-------------------|---------------------------------------------------------------------------------|
+| Boot power sequencing | BMC → Host | BMC controls host power-on flow, ensuring UEFI starts in the correct sequence. |
+| Boot status reporting | UEFI → BMC | UEFI sends boot state and progress via IPMI (KCS) or PLDM. |
+| Serial-over-LAN (SOL) | BMC ↔ Host | BMC bridges host UART console to remote clients over the network. |
+| Pre-boot configuration | BMC ↔ UEFI | BMC may inject or read boot config settings via shared memory or commands. |
+| System recovery signaling | UEFI → BMC | UEFI can request BMC to initiate reboot, NMI, or recovery actions. |
+
+
+In this Learning Path, you will build and run the UEFI firmware on the RD-V3 FVP host platform.
+
+You will use OpenBMC to power on the virtual Arm server, access the serial console, and monitor the host boot progress as you would on a real hardware platform. By inspecting the full boot log and observing system behavior in simulation, you will gain valuable insights into how BMC and UEFI coordinate during early firmware bring-up.
diff --git a/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/2_openbmc_setup.md b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/2_openbmc_setup.md
new file mode 100644
index 0000000000..1b0e0d99a3
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/2_openbmc_setup.md
@@ -0,0 +1,322 @@
+---
+title: Set Up the Pre-Silicon Development Environment for OpenBMC and UEFI
+weight: 3
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## Set Up Development Environment
+
+In this section, you’ll prepare your workspace to build and simulate OpenBMC and UEFI firmware on the Neoverse RD-V3 platform using Arm Fixed Virtual Platforms (FVPs).
+You will install the required tools, configure repositories, and set up a Docker-based build environment for both BMC and host firmware.
+
+Before getting started, it’s strongly recommended to review the previous Learning Path: [Develop and Validate Firmware Pre-Silicon on Arm Neoverse CSS V3](https://learn.arm.com/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack).
+It walks you through how to use the CSS-V3 reference design on FVP to perform early-stage development and validation.
+
+You will perform the steps outlined below on your Arm Neoverse-based Linux machine running Ubuntu 22.04 LTS. You will need at least 80 GB of free disk space and 48 GB of RAM.
+
+### Install Required Packages
+
+Install the base packages for building OpenBMC with the Yocto Project:
+
+```bash
+sudo apt update
+sudo apt install -y git gcc g++ make file wget gawk diffstat bzip2 cpio chrpath zstd lz4 unzip
+```
+
+Install [Docker](/install-guides/docker).
+
+### Set Up the repo Tool
+
+Download the `repo` launcher and add it to your `PATH`:
+
+```bash
+mkdir -p ~/.bin
+PATH="${HOME}/.bin:${PATH}"
+curl https://storage.googleapis.com/git-repo-downloads/repo > ~/.bin/repo
+chmod a+rx ~/.bin/repo
+```
+
+### Download and Install the Arm FVP Model (RD-V3)
+
+Download and extract the RD-V3 FVP:
+```bash
+mkdir -p ~/fvp
+cd ~/fvp
+wget https://developer.arm.com/-/cdn-downloads/permalink/FVPs-Neoverse-Infrastructure/RD-V3-r1/FVP_RD_V3_R1_11.29_35_Linux64_armv8l.tgz
+tar -xvf FVP_RD_V3_R1_11.29_35_Linux64_armv8l.tgz
+./FVP_RD_V3_R1.sh
+```
+
+The FVP installation may prompt you with a few questions; choosing the default options is sufficient for this Learning Path.
By default, the FVP will be installed in `$HOME/FVP_RD_V3_R1`.
+
+
+### Initialize the Host Build Environment
+
+Set up a workspace for host firmware builds:
+
+```bash
+mkdir ~/host
+cd ~/host
+~/.bin/repo init -u "https://git.gitlab.arm.com/infra-solutions/reference-design/infra-refdesign-manifests.git" \
+ -m "pinned-rdv3r1-bmc.xml" \
+ -b "refs/tags/RD-INFRA-2025.07.03" \
+ --depth=1
+repo sync -c -j $(nproc) --fetch-submodules --force-sync --no-clone-bundle
+```
+
+### Apply Required Patches
+
+To enable platform-specific functionality such as Redfish support and UEFI enhancements, apply a set of pre-defined patches from Arm’s GitLab repository.
+
+Use sparse checkout to download only the `patch/` folder:
+
+```bash
+cd ~/host
+git init
+git remote add -f origin https://gitlab.arm.com/server_management/PoCs/fvp-poc
+git config core.sparsecheckout true
+echo /patch >> .git/info/sparse-checkout
+git pull origin main
+```
+
+This approach allows you to fetch only the `patch` folder from the remote Git repository, saving time and disk space.
+
+Next, using a file editor of your choice, create an `apply_patch.sh` script inside the `~` directory and paste in the following content.
+This script will automatically apply the necessary patches to each firmware component.
+
+```bash
+FVP_DIR="host"
+# The patch folder was sparse-checked-out into the host workspace above,
+# so resolve patch paths relative to it.
+SOURCE=${PWD}/${FVP_DIR}
+
+GREEN='\033[0;32m'
+NC='\033[0m'
+
+pushd ${FVP_DIR} > /dev/null
+echo -e "${GREEN}\n===== Apply patches to edk2 =====\n${NC}"
+pushd uefi/edk2
+git am --keep-cr ${SOURCE}/patch/edk2/*.patch
+popd > /dev/null
+
+echo -e "${GREEN}\n===== Apply patches to edk2-platforms =====\n${NC}"
+pushd uefi/edk2/edk2-platforms > /dev/null
+git am --keep-cr ${SOURCE}/patch/edk2-platforms/*.patch
+popd > /dev/null
+
+echo -e "${GREEN}\n===== Apply patches to edk2-redfish-client =====\n${NC}"
+git clone https://github.com/tianocore/edk2-redfish-client.git
+pushd edk2-redfish-client > /dev/null
+git checkout 4f204b579b1d6b5e57a411f0d4053b0a516839c8
+git am --keep-cr ${SOURCE}/patch/edk2-redfish-client/*.patch
+popd > /dev/null
+
+echo -e "${GREEN}\n===== Apply patches to buildroot =====\n${NC}"
+pushd buildroot > /dev/null
+git am ${SOURCE}/patch/buildroot/*.patch
+popd > /dev/null
+
+echo -e "${GREEN}\n===== Apply patches to build-scripts =====\n${NC}"
+pushd build-scripts > /dev/null
+git am ${SOURCE}/patch/build-scripts/*.patch
+popd > /dev/null
+popd > /dev/null
+```
+
+Run the patch script:
+
+```bash
+cd ~
+chmod +x ./apply_patch.sh
+./apply_patch.sh
+```
+
+This script automatically applies patches to edk2, edk2-platforms, buildroot, and related components.
+These patches enable additional UEFI features, integrate the Redfish client, and align the build system with the RD-V3 simulation setup.
+
+### Build the RD-V3-R1 Host Docker Image
+
+Before building the host image, update the following line in `~/host/grub/bootstrap` to replace the `git://` protocol.
+Some networks may restrict `git://` access due to firewall or security policies. Switching to `https://` ensures reliable and secure access to external Git repositories.
+
+```bash
+diff --git a/bootstrap b/bootstrap
+index 5b08e7e2d..031784582 100755
+--- a/bootstrap
++++ b/bootstrap
+@@ -47,7 +47,7 @@ PERL="${PERL-perl}"
+ me=$0
+-default_gnulib_url=git://git.sv.gnu.org/gnulib
++default_gnulib_url=https://git.savannah.gnu.org/git/gnulib.git
+ usage() {
+   cat <<EOF
+```
+
+Once the host firmware build has completed (see the Note below for the full build procedure), the output directory contains symbolic links to the generated firmware components:
+
+```output
+lrwxrwxrwx 1 ubuntu ubuntu 25 Aug 18 10:19 Image -> ../components/linux/Image
+lrwxrwxrwx 1 ubuntu ubuntu 35 Aug 18 10:19 Image.defconfig -> ../components/linux/Image.defconfig
+-rw-r--r-- 1 ubuntu ubuntu 4402315 Aug 18 10:19 fip-uefi.bin
+lrwxrwxrwx 1 ubuntu ubuntu 34 Aug 18 10:19 lcp_ramfw.bin -> ../components/rdv3r1/lcp_ramfw.bin
+lrwxrwxrwx 1 ubuntu ubuntu 33 Aug 18 10:19 lcp_ramfw_ns -> ../components/rdv3r1/lcp_ramfw_ns
+lrwxrwxrwx 1 ubuntu ubuntu 26 Aug 18 10:19 lkvm -> ../components/kvmtool/lkvm
+lrwxrwxrwx 1 ubuntu ubuntu 34 Aug 18 10:19 mcp_ramfw.bin -> ../components/rdv3r1/mcp_ramfw.bin
+lrwxrwxrwx 1 ubuntu ubuntu 33 Aug 18 10:19 mcp_ramfw_ns -> ../components/rdv3r1/mcp_ramfw_ns
+lrwxrwxrwx 1 ubuntu ubuntu 28 Aug 18 10:19 rmm.img -> ../components/rdv3r1/rmm.img
+lrwxrwxrwx 1 ubuntu ubuntu 34 Aug 18 10:19 scp_ramfw.bin -> ../components/rdv3r1/scp_ramfw.bin
+lrwxrwxrwx 1 ubuntu ubuntu 33 Aug 18 10:19 scp_ramfw_ns -> ../components/rdv3r1/scp_ramfw_ns
+lrwxrwxrwx 1 ubuntu ubuntu 41 Aug 18 10:19 signed_lcp_ramfw.bin -> ../components/rdv3r1/signed_lcp_ramfw.bin
+lrwxrwxrwx 1 ubuntu ubuntu 41 Aug 18 10:19 signed_mcp_ramfw.bin -> ../components/rdv3r1/signed_mcp_ramfw.bin
+lrwxrwxrwx 1 ubuntu ubuntu 41 Aug 18 10:19 signed_scp_ramfw.bin -> ../components/rdv3r1/signed_scp_ramfw.bin
+lrwxrwxrwx 1 ubuntu ubuntu 31 Aug 18 10:19 tf-bl1.bin -> ../components/rdv3r1/tf-bl1.bin
+lrwxrwxrwx 1 ubuntu ubuntu 30 Aug 18 10:19 tf-bl1_ns -> ../components/rdv3r1/tf-bl1_ns
+lrwxrwxrwx 1 ubuntu ubuntu 31 Aug 18 10:19 tf-bl2.bin -> ../components/rdv3r1/tf-bl2.bin
+lrwxrwxrwx 1 ubuntu ubuntu 32 Aug 18 10:19 tf-bl31.bin -> ../components/rdv3r1/tf-bl31.bin
+lrwxrwxrwx 1 ubuntu ubuntu 55 Aug 18 10:19 tf_m_flash.bin -> ../components/arm/rse/neoverse_rd/rdv3r1/tf_m_flash.bin
+lrwxrwxrwx 1 ubuntu ubuntu 48 Aug 18 10:19 tf_m_rom.bin -> ../components/arm/rse/neoverse_rd/rdv3r1/rom.bin
+lrwxrwxrwx 1 ubuntu ubuntu 50 Aug 18 10:19 tf_m_vm0_0.bin -> ../components/arm/rse/neoverse_rd/rdv3r1/vm0_0.bin
+lrwxrwxrwx 1 ubuntu ubuntu 50 Aug 18 10:19 tf_m_vm0_1.bin -> ../components/arm/rse/neoverse_rd/rdv3r1/vm0_1.bin
+lrwxrwxrwx 1 ubuntu ubuntu 50 Aug 18 10:19 tf_m_vm1_0.bin -> ../components/arm/rse/neoverse_rd/rdv3r1/vm1_0.bin
+lrwxrwxrwx 1 ubuntu ubuntu 50 Aug 18 10:19 tf_m_vm1_1.bin -> ../components/arm/rse/neoverse_rd/rdv3r1/vm1_1.bin
+lrwxrwxrwx 1 ubuntu ubuntu 33 Aug 18 10:19 uefi.bin -> ../components/css-common/uefi.bin
+```
+
+
+{{% notice Note %}}
+This [Arm Learning Path](/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/3_rdv3_sw_build/) provides a complete introduction to setting up the RDv3 development environment; please refer to it for the details of the host build steps.
+{{% /notice %}}
+
+
+### Build OpenBMC Image
+
+OpenBMC is built on the Yocto Project, which uses `BitBake` as its build tool.
+You don’t need to download BitBake separately, as it is included in the OpenBMC build environment.
+Once you’ve set up the OpenBMC repository and initialized the build environment, BitBake is already available for building images, compiling packages, or running other tasks.
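+
+For example, once the environment script has been sourced (the `source setup fvp` step in the next block), the standard Yocto commands are available on your `PATH`:
+
+```bash
+# Generic Yocto sanity checks; run these only after 'source setup fvp'
+bitbake --version
+bitbake-layers show-layers
+```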
+
+Start by cloning and building the OpenBMC image using the BitBake build system:
+
+```bash
+cd ~
+git clone https://github.com/openbmc/openbmc.git
+cd ~/openbmc
+source setup fvp
+bitbake obmc-phosphor-image
+```
+
+During the OpenBMC build process, you may encounter a native compilation error when building `Node.js` (especially version 22+) due to high memory usage during the V8 engine build phase:
+
+```output
+g++: fatal error: Killed signal terminated program cc1plus
+compilation terminated.
+ERROR: oe_runmake failed
+```
+
+This is a typical Out-of-Memory (OOM) failure, where the system forcibly terminates the compiler due to insufficient available memory.
+
+To reduce memory pressure, explicitly limit parallel tasks in `conf/local.conf` under the build directory created by `source setup fvp`:
+
+```bash
+BB_NUMBER_THREADS = "2"
+PARALLEL_MAKE = "-j2"
+```
+
+This ensures that BitBake only runs two parallel tasks and that each Makefile invocation limits itself to two threads. It significantly reduces peak memory usage and avoids OOM terminations.
+
+When the build starts successfully, you will see output similar to:
+
+```output
+Loading cache: 100% | | ETA: --:--:--
+Loaded 0 entries from dependency cache.
+Parsing recipes: 100% |#############################################################################################################| Time: 0:00:09
+Parsing of 3054 .bb files complete (0 cached, 3054 parsed). 5148 targets, 770 skipped, 0 masked, 0 errors.
+NOTE: Resolving any missing task queue dependencies
+
+Build Configuration:
+BB_VERSION = "2.12.0"
+BUILD_SYS = "aarch64-linux"
+NATIVELSBSTRING = "ubuntu-22.04"
+TARGET_SYS = "aarch64-openbmc-linux"
+MACHINE = "fvp"
+DISTRO = "openbmc-phosphor"
+DISTRO_VERSION = "nodistro.0"
+TUNE_FEATURES = "aarch64 armv8-4a"
+TARGET_FPU = ""
+meta
+meta-oe
+meta-networking
+meta-python
+meta-phosphor
+meta-arm
+meta-arm-toolchain
+meta-arm-bsp
+meta-evb
+meta-evb-fvp-base = "master:1b6b75a7d22262ec1bf5ab8e2bfa434ac84d981b"
+
+Sstate summary: Wanted 0 Local 0 Mirrors 0 Missed 0 Current 2890 (0% match, 100% complete)############################### | ETA: 0:00:00
+Initialising tasks: 100% |##########################################################################################################| Time: 0:00:03
+NOTE: Executing Tasks
+```
+
+This output confirms that the build configuration is correct; once all tasks complete without errors, the OpenBMC image has been built successfully.
+
+{{% notice Note %}}
+The first build may take up to an hour depending on your system performance, as it downloads and compiles the entire firmware stack.
+{{% /notice %}}
+
+Your workspace should now be structured to separate the FVP, host build system, OpenBMC source, and patches, which simplifies organization, maintenance, and troubleshooting:
+
+```output
+├── FVP_RD_V3_R1
+├── apply_patch.sh
+├── fvp
+│   ├── FVP_RD_V3_R1.sh
+│   ├── FVP_RD_V3_R1_11.29_35_Linux64_armv8l.tgz
+│   └── license_terms
+├── host
+│   ├── build-scripts
+│   ├── buildroot
+│   ├── patch
+│ │   ├── build-scripts
+│ │   ├── buildroot
+│ │   ├── edk2
+│ │   ├── edk2-platforms
+│ │   └── edk2-redfish-client
+│   ├── ...
+├── openbmc
+│   ├── ...
+│   ├── build
+│   ├── meta-arm
+│   ├── ...
+│   ├── poky
+│   └── setup
+└── run.sh
+```
+
+With both the OpenBMC and host firmware environments built and configured, you’re now fully prepared to launch the full system simulation and observe the boot process in action.
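+
+As a final check before moving on to simulation, you can verify that the image artifacts were produced. The deploy path below follows the standard Yocto layout for the `fvp` machine; adjust it if your build directory differs:
+
+```bash
+# Built images land in the Yocto deploy directory for the 'fvp' machine
+ls -lh ~/openbmc/build/fvp/tmp/deploy/images/fvp/
+```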
diff --git a/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/3_openbmc_simulate.md b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/3_openbmc_simulate.md
new file mode 100644
index 0000000000..881cd079f9
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/3_openbmc_simulate.md
@@ -0,0 +1,119 @@
+---
+title: Run OpenBMC and Host UEFI Simulation on RD-V3 FVP
+weight: 4
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## Prepare Pre-Silicon OpenBMC Simulation
+
+With your environment prepared, you can now simulate the full pre-silicon firmware boot flow using the Arm Neoverse RD-V3 reference design.
+You’ll launch the Arm Fixed Virtual Platform (FVP) with the OpenBMC image you built and observe the full boot process of both the BMC and host UEFI firmware in a simulated environment.
+
+This simulation launches multiple UART consoles, each mapped to a separate terminal window for a different subsystem (e.g., Neoverse V3, Cortex-M55, Cortex-M7, and the Cortex-A BMC).
+
+These graphical terminal windows require a desktop session. If you're accessing the simulation over SSH (e.g., on a cloud instance), they may not display properly.
+
+To ensure proper display and interactivity, it is recommended to install a remote desktop environment using XRDP.
+
+On an Arm cloud Ubuntu 22.04 instance, install the required packages:
+
+```bash
+sudo apt update
+sudo apt install -y ubuntu-desktop xrdp xfce4 xfce4-goodies pv xterm sshpass socat retry
+sudo systemctl enable --now xrdp
+```
+
+You may need to follow Step 2 of the [RD-V3 learning path](/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/4_rdv3_on_fvp/) to set up the networking and GDM configuration.
+
+Once connected via remote desktop, open a terminal to launch the RD-V3 FVP simulation.
+
+## Execute Pre-Silicon OpenBMC Simulation
+
+To orchestrate the simulation, download the `run.sh` helper script from Arm’s GitLab repository and make one small modification:
+
+```bash
+cd ~
+wget https://gitlab.arm.com/server_management/PoCs/fvp-poc/-/raw/2a79ae93560969a3b802dfb3d7d89f9fd9dee5a6/run.sh
+```
+
+Before running the simulation, open the `run.sh` script and locate the line that defines `FVP_KEYWORD`.
+This variable determines when the host FVP should be launched by monitoring OpenBMC’s console output.
+If it is not set correctly, the script may hang or fail to start the host simulation.
+Update the line to:
+
+```bash
+FVP_KEYWORD="terminal2: Listening for serial connection on port"
+```
+
+Then, execute the script:
+
+```bash
+chmod +x ./run.sh
+./run.sh -m ~/FVP_RD_V3_R1/models/Linux64_GCC-9.3/FVP_RD_V3_R1
+```
+
+The `run.sh` script will:
+
+- Launch the OpenBMC FVP and wait for BMC boot
+- Automatically start the host FVP for RD-V3 (running UEFI)
+- Connect the UART consoles between the BMC and host via virtual pipes
+- Connect MCTP and IPMI tunnels between the OpenBMC FVP and the RD-V3 host FVP
+- Stop the OpenBMC FVP and RD-V3 host FVP when CTRL+C is pressed
+
+
+Once the simulation is running, the OpenBMC FVP console will stop at the Linux login prompt:
+
+```output
+[ OK ] Started phosphor systemd target monitor.
+[ OK ] Started Sensor Monitor.
+         Starting Hostname Service...
+         Starting Phosphor Software Manager...
+         Starting Phosphor BMC State Manager...
+         Starting Phosphor Time Manager daemon...
+[ OK ] Finished SSH Key Generation.
+[ OK ] Finished Wait for /xyz/openbmc_project/state/chassis0.
+
+[ 27.454083] mctpserial0: invalid tx state 0
+[FAILED] Failed to start OpenBMC ipKVM daemon.
+Phosphor OpenBMC (Phosphor OpenBMC Project Reference Distro) nodistro.0 fvp ttyAMA0
+ Starting Time & Date Service...
+fvp login:
+```
+
+Enter the OpenBMC default username `root` and password, which is `0penBmc`.
+
+
+{{% notice Note %}}
+The first character of the password is the number ***0***, not a capital ***O***.
+{{% /notice %}}
+
+After login, you will be dropped into the OpenBMC shell, a minimal Linux environment running inside the simulated BMC.
+
+The host-side UEFI simulation will appear in the FVP `terminal_ns_uart0` console.
+You may briefly see the UEFI Firmware Setup Menu; select `Continue` to proceed with boot.
+The system will then enter GRUB and begin booting Linux.
+
+![img2 alt-text#center](openbmc_hostuefi.jpg "UEFI Firmware Setup Menu")
+
+The simulation then continues with the CSS-V3-R1 stage and enters the GRUB menu. Press Enter to proceed.
+
+A successful simulation will show login prompts on both the BMC and host consoles. You can also confirm success by checking the final system state in the Web UI or UART output.
+
+![img2 alt-text#center](openbmc_cssv3_sim.jpg "Simulation Success")
+
+Shown here is a simulation recording. It gives you a quick visual overview of how OpenBMC and UEFI boot and interact during pre-silicon execution.
+
+![img1 alt-text#center](openbmc_cssv3_running.gif "Simulation Running")
+
+After the simulation completes, logs for both the BMC and host will be stored in `~/logs`. These are useful for verifying boot success or troubleshooting issues.
+
+- `obmc_boot.log`: BMC boot output
+- `obmc_console.log`: BMC serial output
+- `fvp_boot.log`: Host UEFI boot output
+
+By reviewing the contents of the logs folder, you can verify the expected system behavior or quickly diagnose any anomalies that arise during boot or runtime.
+
+With the simulation running successfully, you are now ready to perform real-time testing between the host and the BMC.
+In the next section, you will explore how to interact with the BMC using UART and IPMI from the host side, validating communication channels in a pre-silicon context.
diff --git a/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/4_openbmc_communicate.md b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/4_openbmc_communicate.md
new file mode 100644
index 0000000000..2ff9a0729f
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/4_openbmc_communicate.md
@@ -0,0 +1,71 @@
+---
+title: Monitor and Control the Host CPU via OpenBMC SOL and Web UI
+weight: 5
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## Access the Host Console via OpenBMC SOL
+
+The OpenBMC platform provides `Serial over LAN` (SOL), allowing you to access the host console (RD-V3 FVP) remotely through the BMC, without needing a physical serial cable.
+In this section, you will use `socat` to create a virtual UART bridge, verify port mappings, and access the host console via the BMC Web UI.
+
+### Step 1: Connect the BMC and Host Consoles
+
+Run the following command on your development Linux machine (where the simulation is running) to bridge the BMC and host UART ports:
+
+```bash
+socat -x tcp:localhost:5005 tcp:localhost:5067
+```
+
+This command connects the host-side UART port (5005) to the BMC-side port (5067), allowing bidirectional serial communication.
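+
+Before bridging, you can confirm that both endpoints are listening. The port numbers below are the defaults used in this Learning Path; substitute the values from your own logs if they differ:
+
+```bash
+# List listening TCP sockets and filter for the two UART ports
+ss -tln | grep -E ':5005|:5067'
+```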
+
+{{% notice Note %}}
+If you see a `Connection refused` error, check the FVP logs to verify the port numbers:
+* In `fvp_boot.log`, look for a line like `terminal_ns_uart0: Listening for serial connection on port 5005`.
+* In `obmc_boot.log`, confirm the corresponding line `terminal_3: Listening for serial connection on port 5067`.
+{{% /notice %}}
+
+Ensure both ports are active and match the `socat` command arguments.
+
+
+### Step 2: Manually Set Host Power State
+
+Once the SOL bridge is established, run the following command from the OpenBMC console shell to simulate the host being powered on:
+
+```bash
+busctl set-property xyz.openbmc_project.State.Host \
+/xyz/openbmc_project/state/host0 xyz.openbmc_project.State.Host \
+CurrentHostState s xyz.openbmc_project.State.Host.HostState.Running
+```
+
+This updates the BMC’s internal host state, allowing UEFI to begin execution.
+
+### Step 3: Access Host Console from Web UI
+
+- From your simulation host, launch a browser and open the BMC Web UI at:
+  https://127.0.0.1:4223
+  ![img3 alt-text#center](openbmc_webui_login.jpg "WebUI login")
+
+- Log in using the default credentials:
+  - Username: root
+  - Password: 0penBmc
+  {{% notice Note %}}
+  As a reminder, the first character of the password is the number ***0***, not a capital ***O***.
+  {{% /notice %}}
+  After login, you should see the Web UI dashboard.
+
+- From the Overview page, click the `SOL Console` button.
+  ![img4 alt-text#center](openbmc_webui_overview.jpg "WebUI Overview")
+
+- The SOL terminal in the Web UI will display the host console output (UEFI shell or Linux login). You can type commands directly, as if you were connected over a physical serial line.
+  ![img5 alt-text#center](openbmc_webui_sol.jpg "WebUI SOL")
+
+Once connected to the SOL terminal, you can monitor the UEFI boot sequence, interact with the host shell, and run diagnostic or recovery workflows, just as if you were connected to a physical serial port.
+
+This confirms that OpenBMC is fully managing host power and console access in your simulated environment.
+
+In the next module, you'll extend this control by sending IPMI commands to the BMC, allowing you to test low-level system interactions and even implement your own OEM command handlers.
diff --git a/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/5_openbmc_ipmi.md b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/5_openbmc_ipmi.md
new file mode 100644
index 0000000000..1eafa587c4
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/5_openbmc_ipmi.md
@@ -0,0 +1,172 @@
+---
+title: Customize IPMI Commands in OpenBMC
+weight: 6
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## Customize IPMI Commands in OpenBMC
+
+With the host console accessible through OpenBMC, you are now ready to extend its functionality by implementing a custom IPMI command handler.
+
+The Intelligent Platform Management Interface ([IPMI](https://en.wikipedia.org/wiki/Intelligent_Platform_Management_Interface)) is a standardized protocol for managing and monitoring servers, even when the operating system is not running. In OpenBMC, IPMI support is built in and can be extended with custom handlers through the D-Bus/IPMI infrastructure.
+
+In this module, you'll implement a custom IPMI command handler that returns a simple string response.
+You will write the handler in C++, package it with a BitBake recipe, build it into the OpenBMC image, and test it using `ipmitool` inside the simulated FVP environment.
+
+### Step 1: Create a BitBake Recipe
+
+Create a new file named `phosphor-ipmi-example.bb` in the `recipes-phosphor/ipmi` folder of the FVP machine layer:
+
+```bash
+touch ~/openbmc/meta-evb/meta-evb-arm/meta-evb-fvp-base/recipes-phosphor/ipmi/phosphor-ipmi-example.bb
+```
+
+Paste the following content into it:
+
+```bash
+SUMMARY = "Custom IPMI commands"
+LICENSE = "CLOSED"
+PR = "r1"
+SRC_URI = "file://fvp-ipmi.cpp"
+S = "${UNPACKDIR}"
+
+DEPENDS += "phosphor-ipmi-host sdbusplus systemd"
+TARGET_CXXFLAGS += " -std=c++23"
+TARGET_LDFLAGS += " -lsystemd -lsdbusplus"
+
+do_compile() {
+    ${CXX} ${TARGET_CXXFLAGS} -fPIC -shared \
+        -o libmyipmi.so ${UNPACKDIR}/fvp-ipmi.cpp \
+        -I${STAGING_INCDIR} -L${STAGING_LIBDIR} \
+        ${TARGET_LDFLAGS}
+}
+
+do_install() {
+    install -d ${D}${libdir}/ipmid-providers
+    install -m 0644 libmyipmi.so ${D}${libdir}/ipmid-providers/
+}
+
+FILES:${PN} += "${libdir}/ipmid-providers/libmyipmi.so"
+```
+
+### Step 2: Create a Custom IPMI Handler
+
+Create a folder `phosphor-ipmi-example` at the same path, and add a new file called `fvp-ipmi.cpp`:
+
+```bash
+mkdir ~/openbmc/meta-evb/meta-evb-arm/meta-evb-fvp-base/recipes-phosphor/ipmi/phosphor-ipmi-example
+touch ~/openbmc/meta-evb/meta-evb-arm/meta-evb-fvp-base/recipes-phosphor/ipmi/phosphor-ipmi-example/fvp-ipmi.cpp
+```
+
+Add the contents below into `fvp-ipmi.cpp`:
+
+```cpp
+#include <ipmid/api.hpp>
+#include <ipmid/api-types.hpp>
+#include <string>
+
+// Example handler: return a string
+ipmi::RspType<std::string> myIpmiCommand() {
+    std::string reply = "Hello from OpenBMC IPMI!";
+    return ipmi::responseSuccess(reply);
+}
+
+void register_my_ipmi() __attribute__((constructor));
+void register_my_ipmi() {
+    ipmi::registerHandler(
+        ipmi::prioOemBase,
+        0x30,    // NetFn code
+        0x20,    // command code
+        ipmi::Privilege::Admin,
+        myIpmiCommand
+    );
+}
+```
+
+This function registers a custom IPMI handler using NetFn `0x30` and Command `0x20`.
+When triggered, it returns a static ASCII string: `"Hello from OpenBMC IPMI!"`.
+At runtime, this string is encoded as a sequence of hex bytes and sent back through the IPMI response.
+You will observe this by running `ipmitool raw` and decoding the output.
+
+### Step 3: Add to Build Configuration
+
+To verify the IPMI command, add the following to the machine configuration so that `ipmitool` and `phosphor-ipmi-example` are installed in the image.
+
+Edit `fvp.conf` at `~/openbmc/meta-evb/meta-evb-arm/meta-evb-fvp-base/conf/machine/fvp.conf`
+
+Append the following packages:
+
+```bash
+IMAGE_INSTALL:append = "\
+    phosphor-ipmi-example \
+    ipmitool \
+"
+```
+
+Now rebuild the OpenBMC image with your IPMI handler included:
+
+```bash
+cd ~/openbmc
+source setup fvp
+bitbake obmc-phosphor-image
+```
+
+After the build completes, the generated image will contain both `ipmitool` and `phosphor-ipmi-example`.
+
+For more details about the final image configuration, you can inspect the generated FVP configuration file at `~/openbmc/build/fvp/tmp/deploy/images/fvp/obmc-phosphor-image-fvp.fvpconf`.
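+
+You can also confirm that both packages made it into the image by checking the package manifest. The manifest filename below is an assumption based on standard Yocto naming for this image and machine:
+
+```bash
+# The Yocto image manifest lists every package installed in the final image
+grep -E 'ipmitool|phosphor-ipmi-example' ~/openbmc/build/fvp/tmp/deploy/images/fvp/obmc-phosphor-image-fvp.manifest
+```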
+
+### Step 4: Verify the IPMI Command in Simulation
+
+After launching the FVP simulation and logging into the OpenBMC console, run the following command to invoke your custom IPMI handler:
+
+```bash
+ipmitool raw 0x30 0x20
+```
+
+This command invokes your custom IPMI handler registered under:
+* NetFn: 0x30 (OEM function)
+* Command: 0x20
+
+You should see a response similar to:
+
+```output
+root@fvp:~# ipmitool raw 0x30 0x20
+ 18 48 65 6c 6c 6f 20 66 72 6f 6d 20 4f 70 65 6e
+ 42 4d 43 20 49 50 4d 49 21
+```
+
+This response is a sequence of hexadecimal bytes returned by the BMC:
+* The first byte indicates the length of the payload: in this case `0x18`, or 24 bytes.
+* The remaining 24 bytes represent the actual data payload, encoded as ASCII.
+
+![img6 alt-text#center](openbmc_ipmi.jpg "OpenBMC IPMI command")
+
+To decode the message, copy the payload portion (excluding the first byte) and run:
+
+```bash
+echo "48 65 6c 6c 6f 20 66 72 6f 6d 20 4f 70 65 6e 42 4d 43 20 49 50 4d 49 21" | tr -d ' ' | xxd -r -p
+```
+
+The output will be:
+
+```output
+Hello from OpenBMC IPMI!
+```
+
+This output confirms that the custom string returned by your `myIpmiCommand()` function has been correctly encoded and transmitted via IPMI:
+
+```cpp
+std::string reply = "Hello from OpenBMC IPMI!";
+return ipmi::responseSuccess(reply);
+```
+
+The response from `ipmitool raw` confirms that your custom IPMI handler was:
+
+- Successfully compiled and included in the OpenBMC image
+- Properly registered to respond to NetFn `0x30`, Command `0x20`
+- Correctly executed in the simulated environment via IPMI raw access
+- Returning the intended payload, encoded as ASCII and received in hex format
+
+By decoding the hex payload into ASCII, you have verified the full path from handler registration to command execution and payload delivery.
+
+You have now successfully implemented and tested a custom IPMI command in OpenBMC using pre-silicon simulation.
+
+This sets the foundation for adding OEM commands or platform-specific extensions to your BMC firmware.
+You can now expand this pattern to support argument parsing, custom data formats, or system-level control, enabling rapid prototyping of features such as sensor telemetry, power domain control, or boot policy configuration.
diff --git a/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/_index.md b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/_index.md
new file mode 100644
index 0000000000..879924ddb0
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/_index.md
@@ -0,0 +1,67 @@
+---
+title: Pre-Silicon Simulation of OpenBMC and UEFI on Neoverse RD-V3
+
+draft: true
+cascade:
+  draft: true
+
+minutes_to_complete: 120
+
+who_is_this_for: This Learning Path is for firmware developers, platform software engineers, and system integrators working on Arm Neoverse-based platforms. It is especially useful for those exploring pre-silicon development, testing, and integration of Baseboard Management Controllers (BMC) with UEFI firmware. If you are building or validating server-class reference platforms such as RD-V3 before hardware is available, this guide will help you simulate and debug the full boot path using Fixed Virtual Platforms (FVPs).
+ +learning_objectives: + - Understand the role of OpenBMC and UEFI in the boot flow for an Arm server + - Simulate the firmware using the RD-V3 FVP + - Build and launch OpenBMC and UEFI images on the RD-V3 FVP + - Validate host-BMC communication via UART and Serial-over-LAN + - Implement and validate a custom IPMI command in OpenBMC + +prerequisites: + - Access to an Arm Neoverse-based Linux machine (either cloud-based or local) is required, with at least 80 GB of free disk space, 48 GB of RAM, and running Ubuntu 22.04 LTS. + - Working knowledge of Docker, Git, and Linux terminal tools + - Basic understanding of server firmware stack (UEFI, BMC, TF-A, etc.) + +author: + - Odin Shen + - Ken Zhang + +### Tags +skilllevels: Advanced +subjects: Containers and Virtualization +armips: + - Neoverse +tools_software_languages: + - C + - Docker + - FVP +operatingsystems: + - Linux + +further_reading: + - resource: + title: Reference Design software stack architecture + link: https://neoverse-reference-design.docs.arm.com/en/latest/about/software_stack.html + type: website + - resource: + title: OpenBMC website + link: https://www.openbmc.org/ + type: website + - resource: + title: Meta FVP base + link: https://github.com/openbmc/openbmc/tree/master/meta-evb/meta-evb-arm/meta-evb-fvp-base + type: website + - resource: + title: OpenBMC on FVP PoC + link: https://gitlab.arm.com/server_management/PoCs/fvp-poc + type: website + - resource: + title: ipmitool documentation + link: https://linux.die.net/man/1/ipmitool + type: website + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_cssv3_running.gif b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_cssv3_running.gif new file mode 100644 index 0000000000..c22bb5baeb Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_cssv3_running.gif differ diff --git a/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_cssv3_sim.jpg b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_cssv3_sim.jpg new file mode 100644 index 0000000000..36d70252da Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_cssv3_sim.jpg differ diff --git a/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_hostuefi.jpg b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_hostuefi.jpg new file mode 100644 index 0000000000..7dc02997af Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_hostuefi.jpg differ diff --git a/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_ipmi.jpg b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_ipmi.jpg new file mode 100644 index 0000000000..9582a4a603 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_ipmi.jpg differ diff --git a/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_webui_login.jpg b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_webui_login.jpg new file mode 100644 index 0000000000..b9051538a7 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_webui_login.jpg differ diff --git a/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_webui_overview.jpg b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_webui_overview.jpg new file mode 100644 index 0000000000..123b465a18 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_webui_overview.jpg differ diff --git a/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_webui_sol.jpg b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_webui_sol.jpg new file mode 100644 index 0000000000..e696134f79 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/openbmc-rdv3/openbmc_webui_sol.jpg differ diff --git a/content/learning-paths/servers-and-cloud-computing/pac/_index.md b/content/learning-paths/servers-and-cloud-computing/pac/_index.md index b593845f51..97aee0e01c 100644 --- a/content/learning-paths/servers-and-cloud-computing/pac/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/pac/_index.md @@ -25,7 +25,6 @@ armips: operatingsystems: - Linux tools_software_languages: - - Coding - Runbook diff --git a/content/learning-paths/servers-and-cloud-computing/processwatch/_index.md b/content/learning-paths/servers-and-cloud-computing/processwatch/_index.md index 224d9cc524..342e128a4c 100644 --- a/content/learning-paths/servers-and-cloud-computing/processwatch/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/processwatch/_index.md @@ -25,7 +25,8 @@ tools_software_languages: - bpftool - libbpf - Capstone - - C/C++ + - C + - C++ - Runbook operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/pytorch-llama/_index.md 
b/content/learning-paths/servers-and-cloud-computing/pytorch-llama/_index.md
index 59ac9e3129..364004c9e8 100644
--- a/content/learning-paths/servers-and-cloud-computing/pytorch-llama/_index.md
+++ b/content/learning-paths/servers-and-cloud-computing/pytorch-llama/_index.md
@@ -29,7 +29,7 @@ operatingsystems:
   - Linux
 tools_software_languages:
   - LLM
-  - GenAI
+  - Generative AI
   - Python
   - PyTorch
   - Hugging Face
diff --git a/content/learning-paths/servers-and-cloud-computing/rtp-llm/_index.md b/content/learning-paths/servers-and-cloud-computing/rtp-llm/_index.md
index ebd2ade135..5dae64ab7b 100644
--- a/content/learning-paths/servers-and-cloud-computing/rtp-llm/_index.md
+++ b/content/learning-paths/servers-and-cloud-computing/rtp-llm/_index.md
@@ -25,7 +25,7 @@ operatingsystems:
   - Linux
 tools_software_languages:
   - LLM
-  - GenAI
+  - Generative AI
   - Python
   - Hugging Face
diff --git a/content/learning-paths/servers-and-cloud-computing/sve/_index.md b/content/learning-paths/servers-and-cloud-computing/sve/_index.md
index 5fb8a7512c..020420362a 100644
--- a/content/learning-paths/servers-and-cloud-computing/sve/_index.md
+++ b/content/learning-paths/servers-and-cloud-computing/sve/_index.md
@@ -26,8 +26,7 @@ operatingsystems:
   - Linux
 tools_software_languages:
   - SVE
-  - Neon
-  - Coding
+  - NEON
   - armie
   - GCC
   - armclang
diff --git a/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/_index.md b/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/_index.md
index af33b1c966..902cefcd4f 100644
--- a/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/_index.md
+++ b/content/learning-paths/servers-and-cloud-computing/tune-network-workloads-on-bare-metal/_index.md
@@ -25,7 +25,7 @@ subjects: Performance and Architecture
 armips:
   - Neoverse
 tools_software_languages:
-  - Tomcat
+  - Apache Tomcat
   - wrk2
   - OpenJDK 21
 operatingsystems:
diff --git a/content/learning-paths/servers-and-cloud-computing/vLLM-quant/_index.md b/content/learning-paths/servers-and-cloud-computing/vLLM-quant/_index.md
index d1f6393013..4078ce4a98 100644
--- a/content/learning-paths/servers-and-cloud-computing/vLLM-quant/_index.md
+++ b/content/learning-paths/servers-and-cloud-computing/vLLM-quant/_index.md
@@ -36,7 +36,7 @@ operatingsystems:
 tools_software_languages:
   - vLLM
   - LLM
-  - GenAI
+  - Generative AI
   - Python
   - PyTorch
   - OpenBLAS
diff --git a/content/learning-paths/servers-and-cloud-computing/vllm/_index.md b/content/learning-paths/servers-and-cloud-computing/vllm/_index.md
index 4ab33e47ca..622d248dcf 100644
--- a/content/learning-paths/servers-and-cloud-computing/vllm/_index.md
+++ b/content/learning-paths/servers-and-cloud-computing/vllm/_index.md
@@ -26,7 +26,7 @@ operatingsystems:
 tools_software_languages:
   - vLLM
   - LLM
-  - GenAI
+  - Generative AI
   - Python
   - Hugging Face
diff --git a/tools/check_open_category.py b/tools/check_open_category.py
new file mode 100644
index 0000000000..ac0b784db6
--- /dev/null
+++ b/tools/check_open_category.py
@@ -0,0 +1,349 @@
+#!/usr/bin/env python3
+"""
+Simple checker for tools_software_languages.
+
+Usage:
+    python tools/check_open_category.py /abs/or/rel/path/to/content/
<category>/<subcategory>/<learning path>/_index.md
+
+Behavior:
+    - Fails LOUDLY if OPENAI_API_KEY is not set or OpenAI call cannot be made.
+    - Prints each step and the key variables/paths.
+    - Exits 0 if nothing to change; exits 1 if suggestions/replacements are recommended; exits 2 for usage/path errors; exits 3 for OpenAI errors.
+
+Requires:
+    pip install pyyaml openai
+    export OPENAI_API_KEY=sk-...
+"""
+
+from __future__ import annotations
+import os
+import re
+import sys
+import json
+import difflib
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import yaml
+from openai import OpenAI
+
+
+# ----------------------------
+# 1) File helpers (front matter)
+# ----------------------------
+
+FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n?", re.DOTALL)
+
+def read_front_matter(md_path: Path) -> Tuple[Dict, str]:
+    text = md_path.read_text(encoding="utf-8")
+    m = FRONTMATTER_RE.search(text)
+    if not m:
+        print(f"[ERROR] No YAML front matter found in {md_path}")
+        sys.exit(2)
+    front = yaml.safe_load(m.group(1)) or {}
+    body = text[m.end():]
+    return front, body
+
+
+# ----------------------------
+# 2) Path resolution
+# ----------------------------
+
+def resolve_category_index(lp_index_path: Path) -> Path:
+    """
+    Works for:
+        .../content/
<category>/<subcategory>/<learning path>/_index.md
+    or
+        .../content/<category>/<learning path>/_index.md
+
+    The category file is the _index.md one level above the LP folder.
+    """
+    category_index = lp_index_path.parent.parent / "_index.md"
+    print(f"[step] Resolved category index -> {category_index}")
+    if not category_index.exists():
+        print(f"[ERROR] Category _index.md not found at {category_index}")
+        sys.exit(2)
+    return category_index
+
+
+# ----------------------------
+# 3) Canonical list loading
+# ----------------------------
+
+def load_canonical_map(category_index_path: Path) -> Dict[str, int]:
+    """
+    Expects in category front matter:
+        tools_software_languages_filter:
+        - Label A: 3
+        - Label B: 1
+        ...
+    Returns dict {label: count}
+    """
+    front, _ = read_front_matter(category_index_path)
+    raw = front.get("tools_software_languages_filter")
+    if raw is None:
+        print("[WARN] 'tools_software_languages_filter' not found in category front matter. Using empty list.")
+        return {}
+
+    canonical: Dict[str, int] = {}
+    if isinstance(raw, list):
+        for item in raw:
+            if isinstance(item, dict):
+                for k, v in item.items():
+                    canonical[str(k)] = int(v)
+            elif isinstance(item, str):
+                if ":" in item:
+                    k, v = item.split(":", 1)
+                    canonical[k.strip()] = int(v.strip())
+                else:
+                    canonical[item.strip()] = 0
+    elif isinstance(raw, dict):
+        canonical = {str(k): int(v) for k, v in raw.items()}
+    else:
+        print("[WARN] Unexpected format for tools_software_languages_filter; treating as empty.")
+    print(f"[step] Loaded {len(canonical)} canonical labels from category.")
+    return canonical
+
+
+# ----------------------------
+# 4) Normalization & similarity
+# ----------------------------
+
+def normalize_label(s: str) -> str:
+    s = s.strip().lower()
+    s = re.sub(r"\s*&\s*|\s*and\s*", "/", s)   # use slash for combined items
+    s = re.sub(r"[–—-]", "-", s)               # normalize dashes
+    s = re.sub(r"\s*[\/|]\s*", "/", s)         # normalize separators
+    s = re.sub(r"\s+", " ", s)
+    return s
+
+def shortlist_similar(entry: str, canon_labels: List[str], n: int = 5) -> List[str]:
+    return difflib.get_close_matches(entry, canon_labels, n=n, cutoff=0.6)
+
+def title_like(label: str) -> str:
+    out = []
+    for tok in re.split(r"(\s+)", label):
+        if not tok.strip():
+            out.append(tok)
+            continue
+        if re.search(r"[^A-Za-z]", tok) or tok.isupper():
+            out.append(tok)
+        else:
+            out.append(tok.capitalize())
+    return "".join(out).strip()
+
+
+# ----------------------------
+# 5) OpenAI call (FAILS if key missing or call fails)
+# ----------------------------
+
+def require_openai_client(model: str = "gpt-4o-2024-08-06") -> OpenAI:
+    key = os.environ.get("OPENAI_API_KEY")
+    if not key:
+        print("ERROR: OPENAI_API_KEY is not set. This checker requires AI. Exiting.")
+        sys.exit(3)
+    try:
+        client = OpenAI(api_key=key)
+        # quick no-op to surface auth issues early (optional)
+        # (we rely on the first call below; keeping it simple)
+        print(f"[step] OpenAI client ready. 
Model = {model}")
+        return client
+    except Exception as e:
+        print(f"ERROR: Failed to init OpenAI client: {e}")
+        sys.exit(3)
+
+def ai_decide_label(client: OpenAI, entry: str, candidates: List[str], examples: List[str],
+                    model: str = "gpt-4o-2024-08-06") -> Dict:
+    """
+    Structured decision:
+        action: use_existing | improve_new | ok
+        suggested_label: string
+        confidence: 0..1
+        reason: string
+    """
+    schema = {
+        "name": "ToolsLabelDecision",
+        "schema": {
+            "type": "object",
+            "properties": {
+                "action": {"type": "string", "enum": ["use_existing", "improve_new", "ok"]},
+                "suggested_label": {"type": "string"},
+                "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
+                "reason": {"type": "string"}
+            },
+            "required": ["action", "suggested_label", "confidence", "reason"],
+            "additionalProperties": False,
+        },
+        "strict": True,
+    }
+
+    system = (
+        "You standardize taxonomy labels for 'tools_software_languages' on a public site. "
+        "Prefer an existing canonical label if the user's entry is a near-duplicate. "
+        "Flag entries that are not tools, software, or languages (such as hardware, devices, or generic labels like 'mobile' or 'coding'). "
+        "Otherwise, minimally rewrite to match brand-correctness (NGINX not ngnix; AWS Lambda not Lambda; Apache Tomcat not Tomcat; Visual Studio Code not VS Code)."
+    )
+
+    payload = {
+        "entry": entry,
+        "candidate_existing_labels": candidates,
+        "canonical_examples": examples[:40]
+    }
+
+    print(f"[ai] entry={entry!r}, candidates={candidates}")
+
+    try:
+        resp = client.responses.create(
+            model=model,
+            instructions=system,
+            input=[{"role": "user", "content": [{"type": "input_text", "text": json.dumps(payload, ensure_ascii=False)}]}],
+            # Structured Outputs on the Responses API are requested via text.format
+            text={"format": {"type": "json_schema", **schema}},
+            temperature=0.0,
+        )
+        out = json.loads(resp.output_text)
+        print(f"[ai] decision={out}")
+        return out
+    except Exception as e:
+        print(f"ERROR: OpenAI call failed: {e}")
+        sys.exit(3)
+
+
+# ----------------------------
+# 6) Core check logic
+# ----------------------------
+
+def check_entries(lp_index_path: Path, model: str = "gpt-4o-2024-08-06") -> int:
+    """
+    Returns number of issues (suggestions or replacements).
+    """
+    print(f"[step] LP index path -> {lp_index_path}")
+    front, _ = read_front_matter(lp_index_path)
+
+    # Read LP entries
+    entries = front.get("tools_software_languages")
+    if entries is None:
+        print("[WARN] No 'tools_software_languages' field found in LP. 
Treating as empty.") + entries = [] + if not isinstance(entries, list): + print("[ERROR] 'tools_software_languages' must be a list.") + sys.exit(2) + print(f"[step] Found {len(entries)} LP entries.") + + # Resolve category and load canonicals + category_index = resolve_category_index(lp_index_path) + canonical_map = load_canonical_map(category_index) + canonical_labels = sorted(canonical_map.keys()) + canonical_norm = {normalize_label(k): k for k in canonical_labels} + print(f"[step] Canonical labels (print up to 10): {canonical_labels[:10]}") + + # Require OpenAI (fail loudly if missing) + client = require_openai_client(model=model) + + # Analyze entries + issues = [] + for raw in entries: + if not isinstance(raw, str): + issues.append({"entry": raw, "status": "error", "message": "Non-string value."}) + print(f"[entry] {raw!r} -> ERROR non-string") + continue + + entry = raw.strip() + if not entry: + issues.append({"entry": raw, "status": "error", "message": "Empty string."}) + print(f"[entry] {raw!r} -> ERROR empty") + continue + + print(f"[entry] Checking: {entry!r}") + + # Exact match + if entry in canonical_labels: + print(" - exact canonical: OK") + continue + + # Normalized match + norm = normalize_label(entry) + if norm in canonical_norm: + suggested = canonical_norm[norm] + print(f" - normalized match -> suggest canonical '{suggested}'") + issues.append({"entry": entry, "status": "replace", "suggested": suggested, "why": "Normalization match."}) + continue + + # Similar candidates + AI decision + sims = shortlist_similar(entry, canonical_labels, n=7) + decision = ai_decide_label(client, entry, sims, canonical_labels, model=model) + + action = decision.get("action") + suggested = decision.get("suggested_label", entry) + conf = decision.get("confidence", 0.0) + reason = decision.get("reason", "") + + if action == "use_existing": + print(f" - AI: use existing -> '{suggested}' (conf={conf:.2f})") + issues.append({"entry": entry, "status": "replace", "suggested": suggested, + "why": f"Near-duplicate per AI (conf {conf:.2f})", "reason": reason}) + elif action == "improve_new": + # If the rewrite equals a canonical, treat as replace; else rewrite + if suggested in canonical_labels: + print(f" - AI: rewrite equals canonical -> replace with '{suggested}' (conf={conf:.2f})") + issues.append({"entry": entry, "status": "replace", "suggested": suggested, + "why": f"AI rewrite aligns with canonical (conf {conf:.2f})", "reason": reason}) + else: + print(f" - AI: suggest rewrite -> '{suggested}' (conf={conf:.2f})") + issues.append({"entry": entry, "status": "rewrite", "suggested": suggested, + "why": f"Style-aligned rewrite (conf {conf:.2f})", "reason": reason}) + else: + # ok; minor style nits? + improved = title_like(entry).replace(" & ", "/").replace("&", "/") + improved = re.sub(r"\s*/\s*", "/", improved) + if improved != entry: + print(f" - OK but style tweak -> '{improved}'") + issues.append({"entry": entry, "status": "rewrite", "suggested": improved, + "why": "Minor style tweak."}) + else: + print(" - OK as-is.") + + # Report + print("\n[report]") + if not issues: + print("✓ tools_software_languages: all entries look good.") + return 0 + + for i, it in enumerate(issues, 1): + print(f"{i}. 
Entry: {it['entry']!r}")
+        print(f"   Status: {it['status']}")
+        if "suggested" in it:
+            print(f"   Suggested: {it['suggested']}")
+        if "why" in it:
+            print(f"   Why: {it['why']}")
+        if "reason" in it and it["reason"]:
+            print(f"   Model: {it['reason']}")
+        print()
+
+    print("[json]")
+    print(json.dumps({"file": str(lp_index_path), "issues": issues}, ensure_ascii=False, indent=2))
+    return len(issues)
+
+
+# ----------------------------
+# 7) main()
+# ----------------------------
+
+def main():
+    if len(sys.argv) != 2:
+        print("Usage: python tools/check_open_category.py /path/to/content/
<category>/<subcategory>/<learning path>/_index.md")
+        sys.exit(2)
+
+    lp_index_path = Path(sys.argv[1]).resolve()
+    if not lp_index_path.exists():
+        print(f"[ERROR] File not found: {lp_index_path}")
+        sys.exit(2)
+
+    # Single sequential flow so you can follow the variables:
+    problems = check_entries(lp_index_path, model="gpt-4o-2024-08-06")
+
+    # Exit codes: 0 ok, 1 issues found
+    sys.exit(1 if problems > 0 else 0)
+
+
+if __name__ == "__main__":
+    main()