diff --git a/.github/workflows/roadmap-update.yml b/.github/workflows/roadmap-update.yml index 979cfa6f87..e5f30bd448 100644 --- a/.github/workflows/roadmap-update.yml +++ b/.github/workflows/roadmap-update.yml @@ -171,13 +171,23 @@ jobs: console.log('Start Date field not found'); } } else if (labelName === 'publish') { - const endDateFieldId = await getFieldId(projectId, 'Publish Date'); - if (endDateFieldId) { - await updateDateField(projectId, itemId, endDateFieldId, today); + // Publish Date + const publishDateFieldId = await getFieldId(projectId, 'Publish Date'); + if (publishDateFieldId) { + await updateDateField(projectId, itemId, publishDateFieldId, today); console.log('Updated Publish Date to', today); } else { console.log('Publish Date field not found'); } + + // Last Reviewed Date (same as Publish Date) + const lastReviewedFieldId = await getFieldId(projectId, 'Last Reviewed Date'); + if (lastReviewedFieldId) { + await updateDateField(projectId, itemId, lastReviewedFieldId, today); + console.log('Updated Last Reviewed Date to', today); + } else { + console.log('Last Reviewed Date field not found'); + } } else { console.log('No action taken for label:', labelName); } @@ -187,4 +197,4 @@ jobs: } } - main(); \ No newline at end of file + main(); diff --git a/.wordlist.txt b/.wordlist.txt index 1718a909fc..96e6a2ba7a 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -5055,4 +5055,104 @@ Zynq ZYNQ zypper ZZa -ZZZZZ \ No newline at end of file +ZZZZZ +Abouelseoud +ActiveRecord +Axion's +Ayoub +BeagleBone +Bourjilat +Buildroot's +Bundler +CGO +ConfigMap +CubeProgrammer +DEBLOAT +DGX +DTC +Disaggregated +EP +Espressif +FPARMv +FRDM +FirewallD +Flink's +IDQ +Kustomize +LinkServer +MCXN +MIS +MVC +NVLink +OpenOCD +PRED +PackageCloud +Puncover +PyOCD +RemoteChannelThroughputBenchmark +Retbleed +SEGGER +SVD +Skaffold +Skylake +Spectre +Superchip +TARGETPLATFORM +TFLOPs +TMA +TMAM +UNHALTED +VPCs +Vmlinux +WordCount +YJIT +Yahya +backend's +btop +carkservice +cartservice +checkpointing +cours +dgx +dtlb +emailservice +firewalld +fopenmp +fs +htop +ident +insmod +itlb +ko +kube +loadgen +loadgenerator +microbenchmarks +microcoded +microoperation +mknod +mpki +mtune +mvn +mychardrv +nRF +narrowings +navpane +nginx's +nvcc +nvtop +oRT +petaFLOP +plaintext +rbenv +recommendationservice +remoteRebalance +residually +rubyonrails +sdcard +smi +spiko +transpile +tsc +typescriptlang +vmlinux diff --git a/assets/contributors.csv b/assets/contributors.csv index a9e1572658..fca2f42a34 100644 --- a/assets/contributors.csv +++ b/assets/contributors.csv @@ -104,4 +104,7 @@ Alejandro Martinez Vicente,Arm,,,, Mohamad Najem,Arm,,,, Ruifeng Wang,Arm,,,, Zenon Zhilong Xiu,Arm,,zenon-zhilong-xiu-491bb398,, -Zbynek Roubalik,Kedify,,,, \ No newline at end of file +Zbynek Roubalik,Kedify,,,, +Rani Chowdary Mandepudi, Arm,,,, +Ayoub Bourjilat,Ac6,Bourjilat,ayoub-bourjilat-a55b58165/,,https://www.ac6.fr/en/ +Yahya Abouelseoud,Arm,,,, diff --git a/content/install-guides/_images/xfce4.png b/content/install-guides/_images/xfce4.png index 396c78d33d..d512b1e4b7 100644 Binary files a/content/install-guides/_images/xfce4.png and b/content/install-guides/_images/xfce4.png differ diff --git a/content/install-guides/vnc.md b/content/install-guides/vnc.md index 3b8ec2ad02..ab59f55abe 100644 --- a/content/install-guides/vnc.md +++ b/content/install-guides/vnc.md @@ -9,7 +9,7 @@ additional_search_terms: - linux ### Estimated completion time in minutes (please use integer multiple of 5) -minutes_to_complete: 30 
+minutes_to_complete: 15 author: Jason Andrews @@ -29,31 +29,29 @@ multitool_install_part: false # Set to true if a sub-page of a multi-page arti layout: installtoolsall # DO NOT MODIFY. Always true for tool install articles --- -Virtual Network Computing (VNC) is one of the common tools used to connect to a remote Linux desktop. During development it may be useful to quickly create a remote desktop on an Arm server. +Virtual Network Computing (VNC) is one of the common tools used to connect to a remote Linux desktop. During development, it can be useful to quickly create a remote desktop on an Arm server. -This section provides info about how to setup VNC on a remote Arm Linux machine. +This guide provides information about how to set up VNC on a remote Arm Linux machine. -Feel free to seek out additional VNC tutorials or add more information to this page. - -This installation only works on newer versions of Ubuntu and Debian. It was successfully tested on **Ubuntu 22.04** and is known to fail on **Ubuntu 20.04**. +This installation only works on newer versions of Ubuntu and Debian. It was successfully tested on Ubuntu 22.04 and Ubuntu 24.04. ## What is VNC? -VNC is a client server application. A VNC server runs on a remote machine. A VNC client runs on the local machine and connects to the remote server. +VNC is a client-server application. A VNC server runs on a remote machine. A VNC client runs on the local machine and connects to the remote server. ### How do I install the VNC server and xfce4 desktop? -To use VNC, a VNC server needs to be installed. There are multiple VNC servers which can be used. This recipe uses [TigerVNC](https://tigervnc.org/). +To use VNC, you need to install a VNC server. There are multiple VNC servers you can use. This guide uses [TigerVNC](https://tigervnc.org/). -Desktop software is also needed. There are many options for this, but using [xfce4](https://www.xfce.org/) makes for a minimal install with good performance. +You also need desktop software. There are many options for this, but using [xfce4](https://www.xfce.org/) provides a minimal install with good performance. -Install the desktop software. +Install the desktop software: ```bash sudo apt-get install xfce4 xfce4-goodies xorg dbus-x11 x11-xserver-utils xfce4-terminal -y ``` -Install the VNC server. +Install the VNC server: ```bash sudo apt-get install tigervnc-standalone-server tigervnc-common -y @@ -61,17 +59,17 @@ sudo apt-get install tigervnc-standalone-server tigervnc-common -y ### How do I set a VNC password? -Run the password command to set a password for VNC. This is not the password for the user account, just for the VNC client to connect to the VNC server. +Run the `vncpasswd` command to set a password for VNC. This is not the password for your user account, but for the VNC client to connect to the VNC server. ```console vncpasswd ``` -Remember the password for later when the client is connected. +Remember this password for later when you connect the client. ### How do I configure the desktop startup for VNC? -Create a file at `$HOME/.vnc/xstartup` with the contents: +Create a file at `$HOME/.vnc/xstartup` with the following contents: ```console #!/bin/sh @@ -79,17 +77,21 @@ unset SESSION_MANAGER unset DBUS_SESSION_BUS_ADDRESS exec startxfce4 ``` -Make sure the `xstartup` file has executable permission. +Make sure the `xstartup` file has executable permissions: + ```console chmod +x $HOME/.vnc/xstartup ``` ### How do I set up a systemd service to manage VNC? 
-To create a systemd service to start the VNC server create the file `/etc/systemd/system/vncserver@.service` +To create a systemd service to start the VNC server, create the file `/etc/systemd/system/vncserver@.service`. -Use sudo or root as it is located in a read-only area. -```console +Use `sudo` or root privileges because this file is in a system directory. + +If your username is not `ubuntu` change the `User` value to match your username after you create the new file. + +```ini [Unit] Description=Remote desktop service (VNC) After=syslog.target network.target @@ -105,9 +107,9 @@ Use sudo or root as it is located in a read-only area. [Install] WantedBy=multi-user.target +``` - ``` -The commands below are for any Linux distribution using `systemd`. +The following commands are for any Linux distribution that uses `systemd`. To start the VNC service: @@ -129,24 +131,25 @@ sudo systemctl restart vncserver@1.service ### How do I use port forwarding via SSH to connect to VNC? -The default port for the first instance of VNC is `5901`. SSH port forwarding is the best solution for accessing the Linux desktop on a cloud machine. This way no additional ports need to be opened in the security group. +The default port for the first instance of VNC is `5901`. SSH port forwarding is the recommended solution for accessing the Linux desktop on a cloud machine. This way, no additional ports need to be opened in the security group. -SSH to your remote Linux machine. Refer to [SSH](/install-guides/ssh/) for additional details. +SSH to your remote Linux machine. See [SSH](/install-guides/ssh/) for additional details. -Substitute your private key file and public IP address of the remote machine. +Substitute your private key file and the public IP address of the remote machine in the following command: ```console ssh -i -L 5901:localhost:5901 ubuntu@ ``` -Once connected via SSH, use a VNC client to connect. [Download](https://sourceforge.net/projects/tigervnc/files/stable/1.12.0/) an install a TigerVNC client for your computer. +Once connected via SSH, use a VNC client to connect. [Download](https://sourceforge.net/projects/tigervnc/files/stable/1.12.0/) and install a TigerVNC client for your computer. + +Open the VNC client and enter the following for the VNC server: -Open the VNC client and enter the following for the VNC server. ```console localhost:5901 ``` -You will be prompted for the password created earlier with `vncpasswd`. +You will be prompted for the password you created earlier with `vncpasswd`. -A remote Linux Desktop should appear on your local computer. Make sure to close the VNC client first and then exit the SSH connection. +A remote Linux desktop should appear on your local computer. When you are finished, close the VNC client first and then exit the SSH connection. 
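+
+If you connect often, you can combine the tunnel and the viewer in a small helper script. The snippet below is only a sketch and is not part of the standard instructions: it assumes your private key is `key.pem`, the remote user is `ubuntu`, and the TigerVNC viewer is installed locally as `vncviewer`. Adjust these values for your setup.
+
+```bash
+#!/bin/bash
+# Open the SSH tunnel in the background (-f) with no remote command (-N),
+# then launch the TigerVNC viewer against the forwarded port.
+REMOTE_IP=203.0.113.10   # placeholder - substitute the public IP of your Arm server
+
+ssh -i key.pem -f -N -L 5901:localhost:5901 ubuntu@"$REMOTE_IP"
+vncviewer localhost:5901
+
+# The tunnel keeps running in the background; stop it when you are done, for example:
+# pkill -f "5901:localhost:5901"
+```
+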
![Linux desktop #center](/install-guides/_images/xfce4.png) diff --git a/content/learning-paths/automotive/zenacssdebug/_index.md b/content/learning-paths/automotive/zenacssdebug/_index.md index 9c57db90fa..89c7019b46 100644 --- a/content/learning-paths/automotive/zenacssdebug/_index.md +++ b/content/learning-paths/automotive/zenacssdebug/_index.md @@ -14,7 +14,7 @@ learning_objectives: prerequisites: - Ubuntu 22.04 host machine - - Arm Development Studio 2024.1 or later with a valid license - for support see the [Install Guide for ADS](/install-guides/armds) + - Arm Development Studio 2024.1 or later with a valid license - for support see the [Install Guide for Arm DS](/install-guides/armds) - Basic understanding of the Arm Zena CSS software stack, Armv8-A/Armv9-A cores, and Linux author: Ronan Synnott diff --git a/content/learning-paths/automotive/zenacssdebug/config.md b/content/learning-paths/automotive/zenacssdebug/config.md index b9d6dbf12f..4290df2611 100644 --- a/content/learning-paths/automotive/zenacssdebug/config.md +++ b/content/learning-paths/automotive/zenacssdebug/config.md @@ -46,19 +46,27 @@ Debug configurations are stored in a configuration database. Create a local data - Open the same wizard (**File > New > Other**), then choose **Configuration Database > Model Configuration**. - Click **Next**, select the **Configuration Database** you created, then click **Next**. - For **Model Interface**, choose **Iris**, then click **Next**. -- Choose **Browse for model running on local host**. The debugger detects and interrogates the FVP. +- Choose **Browse for model running on local host**. Select your FVP and click **Finish**. The debugger detects and interrogates the FVP. - If connecting remotely, choose **Connect to model running on either local or remote host** and provide the host and port. +{{% notice Tip %}} +The name of the FVP may be displayed as `RD_ASD` or other. + +If unsure, use the port number to identify the correct FVP. +{{% /notice %}} + Arm Development Studio generates a `model.mdf` file that enumerates all CPUs in the FVP. -Optionally, update **Manufacturer Name** (for example, `Arm`) and **Platform Name** (for example, `Zena_CSS_FVP`). Then **Save** and **Import** the model into the configuration database. +Optionally, update **Manufacturer Name** (for example, `Arm`) and **Platform Name** (for example, `Zena_CSS_FVP`). + +**Save** and **Import** the model into the configuration database. {{% notice Tip %}} -If the FVP is not detected, verify the Iris server is running on the expected port (`7100` by default) and that your firewall allows local connections. For remote connections, confirm the host is reachable and the port is open. +If the FVP is not detected, verify the Iris server is running on the expected port (`7100` by default) and that your firewall allows local connections. + +For remote connections, confirm the host is reachable and the port is open. {{% /notice %}} A `model.mdf` file will be created that identifies all CPUs within the FVP. -You can change the **Manufacturer Name** and **Platform Name** to something more meaningful (such as `Arm` and `Zena_CSS_FVP`), then **Save**, and **Import** into the configuration database. - The debugger is now aware of the FVP and you are ready to debug. 
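+
+If the FVP was not detected during the steps above, you can confirm from a host terminal that the Iris server is actually listening. This optional check is a sketch that assumes standard Linux networking tools (`ss` from iproute2 and, optionally, `nc`) and the default Iris port; change `7100` if you started the FVP on a different port.
+
+```bash
+# List listening TCP sockets and filter for the default Iris port
+ss -ltnp | grep 7100
+
+# Optionally probe the port directly (requires netcat)
+nc -vz localhost 7100
+```
+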
diff --git a/content/learning-paths/automotive/zenacssdebug/primarycompute.md b/content/learning-paths/automotive/zenacssdebug/primarycompute.md index c3c41e4ae3..be53e9e21f 100644 --- a/content/learning-paths/automotive/zenacssdebug/primarycompute.md +++ b/content/learning-paths/automotive/zenacssdebug/primarycompute.md @@ -25,8 +25,9 @@ For example the processors start in `EL3` and move to `EL2N` when the Linux kern ``` text stop -add-symbol-file /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-a/2.11.0+git/image/firmware/bl2.elf EL3:0x0 +add-symbol-file "/arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-a/2.13.0+git/image/firmware/bl2.elf" EL3:0x0 tbreak bl2_entrypoint + ``` {{% notice Note %}} @@ -41,12 +42,20 @@ Symbol loading is Exception Level–aware. If execution changes Exception Level, ## Debug the Linux kernel with OS awareness (symmetric multiprocessing) -Switch to the `Primary_Linux.launch` connection you created earlier to enable Arm Development Studio OS awareness for the Linux kernel. Load the kernel symbols and set source mapping if your kernel sources are located outside the default paths: +{{% notice Note %}} +OS awareness for Linux Kernel 6.12 (as used with Reference Software Stack 2.1) is not currently supported as of Arm Development Studio 2025.0. + +It will be supported in a future Development Studio version. +{{% /notice %}} + +Disconnect `Primary_init.launch` and use the `Primary_Linux.launch` connection you created earlier to enable Arm Development Studio OS awareness for the Linux kernel. + +Load the kernel symbols and set source mapping if your kernel sources are located outside the default paths: ```text stop -add-symbol-file /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/linux-yocto/6.6.54+git/linux-fvp_rd_aspen-standard-build/vmlinux EL2N:0x0 -set substitute-path /usr/src/kernel/ /arm-auto-solutions/build/tmp_baremetal/work-shared/fvp-rd-aspen/kernel-source/ +add-symbol-file "/arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/linux-yocto-rt/6.12.30+git/linux-fvp_rd_aspen-preempt-rt-build/vmlinux" EL2N:0x0 +set substitute-path "/usr/src/kernel/" "/arm-auto-solutions/build/tmp_baremetal/work-shared/fvp-rd-aspen/kernel-source/" ``` Run the FVP until the OS prompt appears. @@ -73,7 +82,7 @@ You might see a warning like: ```text WARNING(ROS60): Could not enable OS support as the OS does not appear to be initialized. This might be caused by a mismatch between the loaded symbols and the code on the target or because the OS is not up and running. Enabling OS support will be re-attempted when the target next stops. ``` -This occurs if the OS has not completed boot when you connect; it is safe to ignore and will clear after the next target stop. +This occurs if the OS has not completed boot when you connect; it is safe to ignore and will clear when stopping target after the OS has booted. {{% /notice %}} You have successfully learnt how to use Arm Development Studio to explore and debug the Arm Zena CSS Reference Software Stack. diff --git a/content/learning-paths/automotive/zenacssdebug/rse.md b/content/learning-paths/automotive/zenacssdebug/rse.md index 0580a209aa..e485b60dab 100644 --- a/content/learning-paths/automotive/zenacssdebug/rse.md +++ b/content/learning-paths/automotive/zenacssdebug/rse.md @@ -46,17 +46,17 @@ Load TF‑M symbols and map sources: - In **Debug Control**, open the pane menu and choose **Load...** - Select **Add Symbols file**. 
-- Choose the TF‑M image, for example: +- Locate the TF‑M image, for example: ```bash - /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/build/bin/bl1_1.axf + /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.2.1+git/build/bin/bl1_1.axf ``` When prompted for **substitute path**, map build-time paths to your local sources, for example: ```bash - /usr/src/debug/trusted-firmware-m/2.1.0/ - /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/git/tfm/ + /usr/src/debug/trusted-firmware-m/2.2.1+git/ + /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.2.1+git/git/tfm/ ``` -Step one instruction to fetch the reset handler and stop there: +Step one instruction to fetch the reset handler and stop there. In the debugger `Commands` pane enter: ```text stepi ``` @@ -78,10 +78,14 @@ Automate the connection steps by adding **Debugger Commands** to the `.launch` c ```text stop -add-symbol-file /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/build/bin/bl1_1.axf -set substitute-path /usr/src/debug/trusted-firmware-m/2.1.0/ /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/git/tfm/ +add-symbol-file "/arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.2.1+git/build/bin/bl1_1.axf" +set substitute-path "/usr/src/debug/trusted-firmware-m/2.2.1+git/" "/arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.2.1+git/git/tfm/" stepi ``` +{{% notice Note %}} +Use the `Commands` or `History` pane to see and copy the equivalent debugger commands of the GUI actions. +{{% /notice %}} + ![RSE.launch in Arm Development Studio showing Debugger pane with TF-M symbols loaded and path substitution mapping alt-text#center](debugger_commands.png "RSE Debugger pane with TF-M symbol loading and source path substitution") diff --git a/content/learning-paths/automotive/zenacssdebug/safetyisland.md b/content/learning-paths/automotive/zenacssdebug/safetyisland.md index 02663a4d34..f35ce2a853 100644 --- a/content/learning-paths/automotive/zenacssdebug/safetyisland.md +++ b/content/learning-paths/automotive/zenacssdebug/safetyisland.md @@ -29,8 +29,8 @@ Configure the **SI** model connection similarly to **RSE**. Add the following ** ```text stop -add-symbol-file /arm-auto-solutions/build/tmp_baremetal/deploy/images/fvp-rd-aspen/si0_ramfw.elf -set substitute-path /usr/src/debug/scp-firmware/2.14.0/ /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/scp-firmware/2.14.0/git/ +add-symbol-file "/arm-auto-solutions/build/tmp_baremetal/deploy/images/fvp-rd-aspen/si0_ramfw.elf" +set substitute-path "/usr/src/debug/scp-firmware/2.16.0+git/" "/arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/scp-firmware/2.16.0+git/git/" b arch_exception_reset ``` @@ -53,10 +53,7 @@ When RSE enables Safety Island, you will see a log message like: The full output log is shown here for your reference: ``` output -Trying ::1... -Trying 127.0.0.1... -Connected to localhost. -Escape character is '^]'. +[INF] [CC3XX] Init OK PIDR0: 0xc1 [INF] Starting TF-M BL1_1 [INF] Jumping to BL1_2 [INF] Starting TF-M BL1_2 @@ -65,12 +62,23 @@ Escape character is '^]'. 
[INF] BL2 image validated successfully [INF] Jumping to BL2 [INF] Starting bootloader +[INF] [CC3XX] Init OK PIDR0: 0xc1 [INF] PSA Crypto init done, sig_type: EC-P256 +[INF] BL2: SI CL1 not present, skip loading [INF] BL2: SI CL0 pre load start [INF] BL2: SI CL0 pre load complete [INF] Primary slot: version=0.0.7+0 -[INF] Secondary slot: version=0.0.7+0 +[INF] Image 3 Secondary slot: Image not found [INF] Image 3 RAM loading to 0x70083c00 is succeeded. +[INF] Key 0 hash found for image 3 [INF] Image 3 loaded from the primary slot [INF] BL2: SI CL0 post load start +[INF] BL2: SI ATU region 0: [0x80000000 - 0xbfffffff]->[0x1_00000000 - 0x1_3fffffff] +[INF] BL2: SI ATU region 1: [0xc0000000 - 0xcfffffff]->[0x1_40000000 - 0x1_4fffffff] +[INF] BL2: SI ATU region 2: [0xd0000000 - 0xd001ffff]->[0x20000_d8000000 - 0x20000_d801ffff] +[INF] BL2: SI ATU region 3: [0xd0020000 - 0xd002ffff]->[0x20000_d0200000 - 0x20000_d020ffff] +[INF] BL2: SI ATU region 4: [0xd0030000 - 0xd003ffff]->[0x20000_d0400000 - 0x20000_d040ffff] +[INF] BL2: SI ATU region 5: [0xd0040000 - 0xd006ffff]->[0x20000_d0100000 - 0x20000_d012ffff] +[INF] BL2: SI ATU region 6: [0xe0030000 - 0xe0031fff]->[0x0_00000000 - 0x0_00001fff] +[INF] BL2: SI ATU region 7: [0xe0130000 - 0xe0135fff]->[0x0_00100000 - 0x0_00105fff] ``` diff --git a/content/learning-paths/automotive/zenacssdebug/zena.md b/content/learning-paths/automotive/zenacssdebug/zena.md index a6a8edb1e9..f579010bca 100644 --- a/content/learning-paths/automotive/zenacssdebug/zena.md +++ b/content/learning-paths/automotive/zenacssdebug/zena.md @@ -20,12 +20,16 @@ For more information, see [Arm Zena Compute Subsystem (CSS)](https://developer.a ## Build the software stack -Follow the steps to download and build the software stack in the [Arm Zena CSS User Guide](https://arm-auto-solutions.docs.arm.com/en/v2.0/rd-aspen/user_guide/reproduce.html). +Follow the steps to download and build the software stack in the [Arm Automotive Solutions Software Reference Stack User Guide](https://arm-auto-solutions.docs.arm.com/en/latest/rd-aspen/user_guide/reproduce.html). -Here the default **Arm Automotive Solutions Demo** build is used. +The default **Cfg1, Arm Automotive Solutions Demo, Bare Metal** build is used in this learning path. + +Software build will usually take at least one hour to complete, depending on host machine. {{% notice Note %}} The primary focus of this Learning Path is to demonstrate how to debug the software stack. + +The latest version of software tested at time of writing is `2.1`. Screenshots show previous versions. {{% /notice %}} ## Verify the build and execution @@ -38,18 +42,18 @@ kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose" The system runs through the boot process until a Linux prompt is available (in `terminal_ns_uart0`). -Press **Ctrl+C** in the command terminal to terminate the process. +Press **Ctrl+C** in the command terminal (use `Ctrl+B` > `0` to swap to that terminal) to terminate the process. ## Install FVP (optional) The FVP is downloaded and installed as part of the build process. -You can also separately download Arm-Zena-CSS-FVP from the Arm Developer [website](https://developer.arm.com/Tools%20and%20Software/Fixed%20Virtual%20Platforms/Automotive%20FVPs). +You can also separately download either `Arm-Zena-CSS-FVP` (`Cfg1` or `Cfg2`) from the Arm Developer [website](https://developer.arm.com/Tools%20and%20Software/Fixed%20Virtual%20Platforms/Automotive%20FVPs). 
See also the Arm Ecosystem FVPs and Architecture Envelope Models [Install Guide](/install-guides/fm_fvp/eco_fvp/). {{% notice Note %}} -For legacy reasons, the FVP is named `FVP_RD_Aspen`. +For legacy reasons, the FVP was previously named `FVP_RD_Aspen`. {{% /notice %}} ## Arm Development Studio diff --git a/content/learning-paths/cross-platform/_example-learning-path/appendix-1-formatting.md b/content/learning-paths/cross-platform/_example-learning-path/appendix-1-formatting.md index 3a13d6d5c2..5b1a41506e 100644 --- a/content/learning-paths/cross-platform/_example-learning-path/appendix-1-formatting.md +++ b/content/learning-paths/cross-platform/_example-learning-path/appendix-1-formatting.md @@ -301,3 +301,4 @@ You can also give a custom title as specified here. {{% /notice %}} + diff --git a/content/learning-paths/cross-platform/_example-learning-path/questions.md b/content/learning-paths/cross-platform/_example-learning-path/questions.md index 8d36189628..27dac51177 100644 --- a/content/learning-paths/cross-platform/_example-learning-path/questions.md +++ b/content/learning-paths/cross-platform/_example-learning-path/questions.md @@ -30,3 +30,4 @@ Modify the weight values to be unique and the extra buttons will disappear. ### Why aren't my changes showing up under Learning Paths? There are various reasons this can happen. One being that the top links on the page will take you to the external site. Make sure that you are still viewing the Hugo server on `localhost`. + diff --git a/content/learning-paths/cross-platform/topdown-compare/1-top-down.md b/content/learning-paths/cross-platform/topdown-compare/1-top-down.md index cab6ed9313..4799fdd962 100644 --- a/content/learning-paths/cross-platform/topdown-compare/1-top-down.md +++ b/content/learning-paths/cross-platform/topdown-compare/1-top-down.md @@ -6,31 +6,46 @@ weight: 3 layout: learningpathall --- -## What are the differences between Arm and Intel x86 PMU counters? +## What are the differences between Arm Neoverse and Intel x86 PMU counters? -This is a common question from both software developers and performance engineers working across architectures. +This is a common question from both software developers and performance engineers working with multiple architectures. -Both Intel x86 and Arm Neoverse CPUs provide sophisticated Performance Monitoring Units (PMUs) with hundreds of hardware counters. Instead of trying to list all available counters and compare microarchitecture, it makes more sense to focus on the performance methodologies they enable and the calculations used for performance metrics. +Both Intel x86 and Arm Neoverse CPUs provide sophisticated Performance Monitoring Units (PMUs) with hundreds of hardware counters. Instead of trying to list all available counters and compare microarchitectures, it makes more sense to focus on the performance methodologies they enable and the calculations used for performance metrics. 
-While the specific counter names and formulas differ between architectures, both Intel x86 and Arm Neoverse have converged on top-down performance analysis methodologies that categorize performance bottlenecks into four key areas: +Although counter names and formulas differ, both Intel x86 and Arm Neoverse classify performance bottlenecks into the same four top-level categories: - Retiring - Bad Speculation - Frontend Bound - Backend Bound -This Learning Path provides a comparison of how x86 processors implement multi-level hierarchical top-down analysis compared to Arm Neoverse's methodology, highlighting the similarities in approach while explaining the architectural differences in PMU counter events and formulas. +The first step is to focus on the dominant top-level bucket. Then, on Intel x86 you descend through the formal sub-levels. On Arm, you derive similar insights using architecture-specific event groups and formulas that approximate those sub-divisions. + +This Learning Path compares Intel x86 Top-down Microarchitecture Analysis (a formal multi-level hierarchy) with Arm Neoverse top-down guidance (the same four level-1 buckets, but fewer standardized sub-levels). You will learn how the approaches align conceptually while noting differences in PMU event semantics and machine width. ## Introduction to top-down performance analysis -The top-down methodology makes performance analysis easier by shifting focus from individual PMU counters to pipeline slot utilization. Instead of trying to interpret dozens of seemingly unrelated metrics, you can systematically identify bottlenecks by attributing each CPU pipeline slot to one of the four categories. +The top-down methodology makes performance analysis easier by shifting focus from individual PMU counters to pipeline utilization. Instead of trying to interpret dozens of metrics, you can systematically identify bottlenecks by attributing CPU pipeline activity to one of the four categories. + +A slot represents one potential opportunity for a processor core to issue and execute a micro-operation (µop) during a single clock cycle. +The total slots = (machine width × cycles), where each slot can be used productively or wasted through speculation or stalls. + +**Retiring** represents slots that retire useful instructions (µops). + +**Bad Speculation** accounts for slots consumed by mispredicted branches, pipeline flushes, or other speculative work that does not retire. +On Intel x86, this includes machine clears and on Arm Neoverse it is modeled through misprediction and refill events. -**Retiring** represents pipeline slots that successfully complete useful work, while **Bad Speculation** accounts for slots wasted on mispredicted branches and pipeline flushes. **Frontend Bound** identifies slots stalled due to instruction fetch and decode limitations, whereas **Backend Bound** covers slots stalled by execution resource constraints such as cache misses or arithmetic unit availability. +**Frontend Bound** identifies slots lost because the core cannot supply enough decoded micro-ops. On Intel this subdivides into frontend latency (instruction cache, ITLB, branch predictor) versus frontend bandwidth (µop supply limits). On Arm Neoverse you approximate similar causes with instruction fetch, branch, and L1 I-cache events. -The methodology allows you to drill down only into the dominant bottleneck category, avoiding the complexity of analyzing all possible performance issues at the same time. 
+**Backend Bound** covers slots where issued micro-ops wait on data or execution resources. Intel x86 subdivides this into memory bound (cache / memory hierarchy latency or bandwidth) versus core bound (execution port pressure, scheduler or reorder buffer limits). Arm Neoverse guidance uses memory versus core style breakdown with different PMU event groupings and separates long-latency data access from execution resource contention. + +The methodology allows you to focus on the dominant bottleneck category, avoiding the complexity of analyzing all possible performance issues at the same time. The next sections compare the Intel x86 methodology with the Arm top-down methodology. -{{% notice Note %}} +{{% notice Notes %}} +This Learning Path uses the Arm Neoverse V2 when specific details are required, and some things will be different from other Neoverse N and Neoverse V processors. + AMD also has an equivalent top-down methodology which is similar to Intel, but uses different counters and calculations. {{% /notice %}} + diff --git a/content/learning-paths/cross-platform/topdown-compare/1a-intel.md b/content/learning-paths/cross-platform/topdown-compare/1a-intel.md index 317eb3f105..5845a11267 100644 --- a/content/learning-paths/cross-platform/topdown-compare/1a-intel.md +++ b/content/learning-paths/cross-platform/topdown-compare/1a-intel.md @@ -1,5 +1,5 @@ --- -title: "Understand Intel x86 multi-level hierarchical top-down analysis" +title: "Understand Intel x86 multilevel hierarchical top-down analysis" weight: 4 ### FIXED, DO NOT MODIFY @@ -8,62 +8,99 @@ layout: learningpathall ## Configure slot-based accounting with Intel x86 PMU counters -Intel uses a slot-based accounting model where each CPU cycle provides multiple issue slots. A slot is a hardware resource needed to process micro-operations (uops). More slots means more work can be done per cycle. The number of slots depends on the microarchitecture design, but current Intel processor designs typically have four issue slots per cycle. +Intel uses a slot-based accounting model, where each CPU cycle provides multiple issue slots. -Intel's methodology uses a multi-level hierarchy that typically extends to 3-4 levels of detail. Each level provides progressively more granular analysis, allowing you to drill down from high-level categories to specific microarchitecture events. +A slot is a hardware resource that represents one opportunity for a microoperation (μop) to issue for execution during a single clock cycle. -## Level 1: Identify top-level performance categories +Each cycle, the core exposes a fixed number of these issue opportunities, and this is known as the machine width in Intel’s Top-Down Microarchitecture Analysis Methodology (TMAM). You may also see the methodology referred to as TMA. -At Level 1, all pipeline slots are attributed to one of four categories, providing a high-level view of whether the CPU is doing useful work or stalling. +The total number of available slots is defined as: -- Retiring = `UOPS_RETIRED.RETIRE_SLOTS / SLOTS` -- Bad Speculation = `(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + N * RECOVERY_CYCLES) / SLOTS` -- Frontend Bound = `IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS` -- Backend Bound = `1 - (Frontend + Bad Spec + Retiring)` +`Total_SLOTS = machine_width × CPU_CLK_UNHALTED.THREAD` -Where `SLOTS = 4 * CPU_CLK_UNHALTED.THREAD` on most Intel cores. +The machine width corresponds to the maximum number of μops that a core can issue to execution pipelines per cycle. 
-## Level 2: Analyze broader bottleneck causes +- Intel cores such as Skylake and Cascade Lake are 4-wide. +- Newer server and client cores such as Sapphire Rapids, Emerald Rapids, Granite Rapids, and Meteor Lake P-cores are 6-wide. +- Future generations may widen further, but the slot-based framework remains the same. -Once you've identified the dominant Level 1 category, Level 2 drills into each area to identify broader causes. This level distinguishes between frontend latency and bandwidth limits, or between memory and core execution stalls in the backend. +Tools such as `perf topdown` automatically apply the correct machine width for the detected CPU. -- Frontend Bound covers frontend latency compared with frontend bandwidth -- Backend Bound covers memory bound compared with core bound -- Bad Speculation covers branch mispredicts compared with machine clears -- Retiring covers base compared with microcode sequencer +Intel’s methodology uses a multi-level hierarchy that typically extends to three or four levels of detail. Each level provides progressively finer analysis, allowing you to drill down from high-level categories to specific hardware events. -## Level 3: Target specific microarchitecture bottlenecks +### Level 1: Identify top-level performance categories -After identifying broader cause categories in Level 2, Level 3 provides fine-grained attribution that pinpoints specific bottlenecks like DRAM latency, cache misses, or port contention. This precision makes it possible to identify the exact root cause and apply targeted optimizations. +At Level 1, all pipeline slots are attributed to one of four categories, giving a high-level view of how the CPU’s issue capacity is being used: -Memory Bound expands into detailed cache hierarchy analysis including L1 Bound, L2 Bound, L3 Bound, DRAM Bound, and Store Bound categories. Core Bound breaks down into execution unit constraints such as Divider and Ports Utilization, along with many other specific microarchitecture-level categories that enable precise performance tuning. +- Retiring = UOPS_RETIRED.RETIRE_SLOTS / SLOTS +- Frontend Bound = IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS +- Bad Speculation = derived from speculative flush behavior (branch mispredictions and machine clears) or computed residually +- Backend Bound = 1 − (Retiring + Frontend Bound + Bad Speculation) -## Level 4: Access specific PMU counter events +Most workflows compute Backend Bound as the residual after Retiring, Frontend Bound, and Bad Speculation are accounted for. -Level 4 provides direct access to the specific microarchitecture events that cause the inefficiencies. At this level, you work directly with raw PMU counter values to understand the underlying hardware behavior causing performance bottlenecks. This enables precise tuning by identifying exactly which execution units, cache levels, or pipeline stages are limiting performance, allowing you to apply targeted code optimizations or hardware configuration changes. 
+### Level 2: Analyze broader bottleneck causes -## Apply essential Intel x86 PMU counters for analysis +Once the dominant Level 1 category is identified, Level 2 separates each category into groups: -Intel processors expose hundreds of performance events, but top-down analysis relies on a core set of counters that map directly to the four-level hierarchy: +| Category | Level 2 Sub-Categories | Purpose | +|-----------|------------------------|----------| +| Frontend Bound | Frontend Latency vs Frontend Bandwidth | Distinguish instruction-fetch delays from decode or μop cache throughput limits. | +| Backend Bound | Memory Bound vs Core Bound | Separate stalls caused by memory hierarchy latency/bandwidth from those caused by execution-unit contention or scheduler pressure. | +| Bad Speculation | Branch Mispredict vs Machine Clears | Identify speculation waste due to control-flow mispredictions or pipeline clears. | +| Retiring | Base vs Microcode Sequencer | Show the proportion of useful work from regular instructions versus microcoded sequences. | -| Event Name | Purpose | -| :---------------------------------------------- | :----------------------------------------------------------------------------------- | -| `UOPS_RETIRED.RETIRE_SLOTS` | Count retired micro-operations (Retiring) | -| `UOPS_ISSUED.ANY` | Count issued micro-operations (helps quantify Bad Speculation) | -| `IDQ_UOPS_NOT_DELIVERED.CORE` | Frontend delivery failures (Frontend Bound) | -| `CPU_CLK_UNHALTED.THREAD` | Core clock cycles (baseline for normalization) | -| `BR_MISP_RETIRED.ALL_BRANCHES` | Branch mispredictions (Bad Speculation detail) | -| `MACHINE_CLEARS.COUNT` | Pipeline clears due to memory ordering or faults (Bad Speculation detail) | -| `CYCLE_ACTIVITY.STALLS_TOTAL` | Total stall cycles (baseline for backend breakdown) | -| `CYCLE_ACTIVITY.STALLS_MEM_ANY` | Aggregate stalls from memory hierarchy misses (Backend → Memory Bound) | -| `CYCLE_ACTIVITY.STALLS_L1D_MISS` | Stalls due to L1 data cache misses | -| `CYCLE_ACTIVITY.STALLS_L2_MISS` | Stalls waiting on L2 cache misses | -| `CYCLE_ACTIVITY.STALLS_L3_MISS` | Stalls waiting on last-level cache misses | -| `MEM_LOAD_RETIRED.L1_HIT` / `L2_HIT` / `L3_HIT` | Track where loads are satisfied in the cache hierarchy | -| `MEM_LOAD_RETIRED.L3_MISS` | Loads missing LLC and going to memory | -| `MEM_LOAD_RETIRED.DRAM_HIT` | Loads serviced by DRAM (DRAM Bound detail) | -| `OFFCORE_RESPONSE.*` | Detailed classification of off-core responses (L3 vs. DRAM, local vs. remote socket) | - - -Using the above levels of metrics, you can determine which of the four top-level categories are causing bottlenecks. +### Level 3: Target specific microarchitecture bottlenecks + +Level 3 provides fine-grained attribution that pinpoints precise hardware limitations. + +Examples include: +- Memory Bound covers L1 Bound, L2 Bound, L3 Bound, DRAM Bound, Store Bound +- Core Bound covers execution-port pressure, divider utilization, scheduler or ROB occupancy +- Frontend latency covers instruction-cache misses, ITLB walks, branch-prediction misses +- Frontend bandwidth covers decode throughput or μop cache saturation + +At this level, you can determine whether workloads are limited by memory latency, cache hierarchy bandwidth, or execution-resource utilization. +### Level 4: Access specific PMU counter events + +Level 4 exposes the Performance Monitoring Unit (PMU) events that implement the hierarchy. + +Here you analyze raw event counts to understand detailed pipeline behavior. 
+Event names and availability vary by microarchitecture, but you can verify them with `perf list`. + +| Event Name | Purpose | +| :---------------------------------------------- | :----------------------------------------------------------------------------------- | +| `UOPS_RETIRED.RETIRE_SLOTS` | Counts retired μops | +| `UOPS_ISSUED.ANY` | Counts all issued μops (used in speculation analysis) | +| `IDQ_UOPS_NOT_DELIVERED.CORE` | Counts μops not delivered from frontend | +| `CPU_CLK_UNHALTED.THREAD` | Core clock cycles (baseline for normalization) | +| `BR_MISP_RETIRED.ALL_BRANCHES` | Branch mispredictions | +| `MACHINE_CLEARS.COUNT` | Pipeline clears due to faults or ordering | +| `CYCLE_ACTIVITY.STALLS_TOTAL` | Total stall cycles | +| `CYCLE_ACTIVITY.STALLS_MEM_ANY` | Stalls from memory hierarchy misses | +| `CYCLE_ACTIVITY.STALLS_L1D_MISS` | Stalls due to L1 data-cache misses | +| `CYCLE_ACTIVITY.STALLS_L2_MISS` | Stalls waiting on L2 cache misses | +| `CYCLE_ACTIVITY.STALLS_L3_MISS` | Stalls waiting on last-level cache misses | +| `MEM_LOAD_RETIRED.L1_HIT / L2_HIT / L3_HIT` | Track where loads are satisfied in the cache hierarchy | +| `MEM_LOAD_RETIRED.L3_MISS` | Loads missing the LLC and going to memory | +| `MEM_LOAD_RETIRED.DRAM_HIT` | Loads serviced by DRAM | +| `OFFCORE_RESPONSE.*` | Detailed classification of off-core responses (L3 vs DRAM, local vs remote socket) | + +Some events (for example, CYCLE_ACTIVITY.* and MEM_LOAD_RETIRED.*) vary across microarchitectures so you should confirm them on your CPU. + +### Practical guidance + +Here are some practical steps to keep in mind: + +- Normalize all metrics to total slots: machine_width × CPU_CLK_UNHALTED.THREAD. +- Start at Level 1 to identify the dominant bottleneck. +- Drill down progressively through Levels 2 and 3 to isolate the root cause. +- Use raw events (Level 4) for detailed validation or hardware-tuning analysis. +- Check event availability before configuring counters on different CPU generations. + +## Summary + +Intel's Top-Down methodology provides a structured, slot-based framework for understanding pipeline efficiency. Each slot represents a potential μop issue opportunity. + +By attributing every slot to one of the four categories you can measure how effectively a core executes useful work versus wasting cycles on stalls or speculation. diff --git a/content/learning-paths/cross-platform/topdown-compare/1b-arm.md b/content/learning-paths/cross-platform/topdown-compare/1b-arm.md index b328921f80..9104a25dc7 100644 --- a/content/learning-paths/cross-platform/topdown-compare/1b-arm.md +++ b/content/learning-paths/cross-platform/topdown-compare/1b-arm.md @@ -5,59 +5,123 @@ weight: 5 ### FIXED, DO NOT MODIFY layout: learningpathall --- + ## Explore Arm's approach to performance analysis -After understanding Intel's comprehensive 4-level hierarchy, you can explore how Arm approached the same performance analysis challenge with a different philosophy. Arm developed a complementary top-down methodology specifically for Neoverse server cores that prioritizes practical usability while maintaining analysis effectiveness. +After understanding the Intel x86 multi-level top-down hierarchy, you can look at how Arm Neoverse approaches the same challenge with a two-stage model designed for clarity and cross-architecture consistency. 
+ +Arm's methodology follows the same four top-level categories: Retiring, Bad Speculation, Frontend Bound, and Backend Bound, but uses Arm-specific Performance Monitoring Unit (PMU) events and formulas. + +Neoverse V2 is used below; the details will be different for other Neoverse processors. + +## Pipeline slot model in Arm Neoverse V2 + +Neoverse V2 uses an 8-slot rename unit to measure pipeline bandwidth. + +Each cycle provides up to eight micro-operation (μop) slots for issue and execution. -The Arm Neoverse architecture uses an 8-slot rename unit for pipeline bandwidth accounting, which differs from Intel's issue-slot model. Unlike Intel's hierarchical model, Arm employs a streamlined two-stage methodology that balances analysis depth with practical usability. +This forms the foundation of the Neoverse V2 slot-based top-down accounting. -### Execute Stage 1: Calculate top-down performance categories +`Total_SLOTS = CPU_CYCLES × 8` -Stage 1 identifies high-level bottlenecks using the same four categories as Intel, but with Arm-specific PMU events and formulas. This stage uses slot-based accounting similar to Intel's approach while employing Arm event names and calculations tailored to the Neoverse architecture. +Just like Intel's issue-slot model, Arm attributes every slot to one of the same four categories. -#### Configure Arm-specific PMU counter formulas +This allows percentage-based comparisons of pipeline efficiency across different cores, regardless of frontend or backend width. -Arm uses different top-down metrics based on different events, but the concept remains similar to Intel's approach. The key difference lies in the formula calculations and slot accounting methodology: +## Stage 1: Identify top-level performance categories -| Metric | Formula | Purpose | +Stage 1 corresponds to Level 1 in Intel's TMAM and determines where the processor spends its available slots. + +All formulas below are defined in the Arm Neoverse V2 Telemetry Specification. + +| Metric | Formula | Description | | :-- | :-- | :-- | -| Backend bound | `100 * (STALL_SLOT_BACKEND / (CPU_CYCLES * 8))` | Backend resource constraints | -| Frontend bound | `100 * ((STALL_SLOT_FRONTEND / (CPU_CYCLES * 8)) - (BR_MIS_PRED / (4 * CPU_CYCLES)))` | Frontend delivery issues | -| Bad speculation | `100 * (1 - (OP_RETIRED/OP_SPEC)) * (1 - (STALL_SLOT/(CPU_CYCLES * 8))) + (BR_MIS_PRED / (4 * CPU_CYCLES))` | Misprediction recovery | -| Retiring | `100 * (OP_RETIRED/OP_SPEC) * (1 - (STALL_SLOT/(CPU_CYCLES * 8)))` | Useful work completed | +| Backend Bound | 100 × (STALL_SLOT_BACKEND / (CPU_CYCLES × 8) − (BR_MIS_PRED × 3 / CPU_CYCLES)) | Percentage of total slots stalled by backend resource constraints | +| Frontend Bound | 100 × (STALL_SLOT_FRONTEND / (CPU_CYCLES × 8) − (BR_MIS_PRED / CPU_CYCLES)) | Slots lost because the frontend cannot supply μops (fetch, decode, or branch delays) | +| Bad Speculation | 100 × ((1 − OP_RETIRED / OP_SPEC) × (1 − STALL_SLOT / (CPU_CYCLES × 8)) + (BR_MIS_PRED × 4 / CPU_CYCLES)) | Slots used by operations that never retire due to mispredictions or pipeline flushes | +| Retiring | 100 × (OP_RETIRED / OP_SPEC × (1 − STALL_SLOT / (CPU_CYCLES × 8))) | Slots that retire valid μops (useful work) | + +Each metric is expressed as a percentage of total slots. + +All four categories sum to 100%, providing a complete top-level pipeline utilization view. 
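+
+Before moving on to Stage 2, you can collect the raw events used in these formulas with Linux perf. The command below is a sketch: the lowercase event names are the forms typically exposed by the kernel PMU driver on Neoverse cores, and `./your_app` is a placeholder for your own workload. Run `perf list` first to confirm the exact names on your system.
+
+```bash
+# Count the events that feed the Stage 1 formulas for one run of your workload
+perf stat -e cpu_cycles,stall_slot,stall_slot_frontend,stall_slot_backend,op_spec,op_retired,br_mis_pred ./your_app
+
+# Total slots for the run = cpu_cycles * 8 (Neoverse V2 rename width).
+# Substitute the printed counts into the formulas above to obtain the four percentages.
+```
+
+If perf reports that any counts were scaled (multiplexed), rerun with fewer events at a time so each formula uses counts from the same measurement window.
+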
+ +## Stage 2: Microarchitecture exploration and effectiveness groups + +Stage 2 expands on Stage 1 hotspots by examining detailed resource groups rather than a strict hierarchy. + +The Neoverse V2 Telemetry Specification organizes metrics into effectiveness groups that can be analyzed independently: + +| Metric Group | Example Metrics | Purpose | +| :-- | :-- | :-- | +| Cycle Accounting | frontend_stalled_cycles, backend_stalled_cycles | Percentage of cycles stalled in frontend vs. backend | +| Branch Effectiveness | branch_misprediction_ratio, branch_mpki | Branch prediction accuracy and misprediction rate | +| ITLB/DTLB Effectiveness | itlb_walk_ratio, dtlb_mpki | TLB efficiency and translation latency impact | +| Cache Effectiveness | l1i_cache_mpki, l1d_cache_miss_ratio, l2_cache_mpki, ll_cache_read_hit_ratio | Cache performance across all hierarchy levels | +| Operation Mix | integer_dp_percentage, load_percentage, simd_percentage, store_percentage, sve_all_percentage | Workload instruction composition and vector utilization | +| MPKI / Miss Ratio | Derived from cache and TLB refill events | Normalized misses per kilo instructions for cross-core comparisons | + +Unlike Intel's drill-down approach, Arm's groups can be explored in any order to focus on the dominant subsystem. + +## Key Arm Neoverse V2 PMU events for top-down analysis + +Neoverse V2 implements the Arm PMUv3.5 architecture and exposes about 155 events. + +The following subset is essential for top-down and resource-effectiveness analysis: + +| Event Name | Purpose / Usage | +| :-- | :-- | +| `CPU_CYCLES` | Core clock cycles – used as baseline for normalization | +| `OP_SPEC` | Speculatively executed μops – denominator for slot accounting | +| `OP_RETIRED` | Retired μops – measures useful work | +| `INST_RETIRED` | Retired instructions – used for IPC and MPKI metrics | +| `INST_SPEC` | Speculative instructions – required for Operation Mix | +| `STALL_SLOT` | All stalled slots (frontend + backend) | +| `STALL_SLOT_FRONTEND` | Stalled slots caused by frontend fetch/decode limitations | +| `STALL_SLOT_BACKEND` | Stalled slots caused by backend resource constraints | +| `BR_MIS_PRED` | Speculatively executed mispredicted branches (used in top-down formulas) | +| `BR_MIS_PRED_RETIRED` | Mispredicted branches retired – used in Branch Effectiveness | +| `BR_RETIRED` | Total branches retired – misprediction ratio denominator | +| `L1I_CACHE_REFILL` | Instruction cache refills – frontend latency indicator | +| `ITLB_WALK` | Instruction TLB walks – frontend translation stall indicator | +| `L1D_CACHE_REFILL` | Data cache refills – backend memory latency indicator | +| `L2D_CACHE_REFILL` | L2 cache refills – backend stall from L2 misses | +| `LL_CACHE_MISS_RD` | Last-level cache read misses – backend stalls from SLC or memory accesses | +| `DTLB_WALK` | Data TLB walks – backend stall due to address translation | +| `MEM_ACCESS` | All memory accesses – baseline for cache/TLB ratios | + +## Understanding MPKI metrics + +MPKI (Misses Per Kilo Instructions) is a normalized metric that measures cache or TLB misses per 1,000 retired instructions. + +The formula is: `MPKI = (Miss_Events / INST_RETIRED) × 1000` -### Execute Stage 2: Explore resource effectiveness groups +For example: +- L1D Cache MPKI = `(L1D_CACHE_REFILL / INST_RETIRED) × 1000` +- DTLB MPKI = `(DTLB_WALK / INST_RETIRED) × 1000` -Stage 2 focuses on resource-specific effectiveness metrics grouped by CPU component. 
This stage provides industry-standard metrics like MPKI (Misses Per Kilo Instructions) and offers detailed breakdown without the strict hierarchical drilling required by Intel's methodology. +MPKI provides several advantages: +- Workload comparison: Compare cache efficiency across different applications regardless of execution time +- Architecture comparison: Evaluate cache performance between different processor designs +- Optimization tracking: Measure improvement from code changes or compiler optimizations -#### Navigate resource groups without hierarchical constraints +## Practical guidance -Instead of Intel's hierarchical levels, Arm organizes detailed metrics into effectiveness groups that can be explored independently. +Here are some practical steps to keep in mind: -**Branch Effectiveness** provides misprediction rates and MPKI, while **ITLB/DTLB Effectiveness** measures translation lookaside buffer efficiency. **Cache Effectiveness** groups (L1I/L1D/L2/LL) deliver cache hit ratios and MPKI across the memory hierarchy. Additionally, **Operation Mix** breaks down instruction types (SIMD, integer, load/store), and **Cycle Accounting** tracks frontend versus backend stall percentages. +- Normalize all percentages to total slots (CPU_CYCLES × 8) +- Use Stage 1 to locate the dominant performance category +- Apply Stage 2 metric groups to isolate microarchitectural causes +- Compare frontend vs. backend stalls +- Evaluate branch predictor accuracy +- Use MPKI metrics (cache or TLB) = (refills / INST_RETIRED) × 1000 for workload comparisons +- For vectorized workloads, examine Operation Mix metrics (integer, SIMD, SVE percentages) -## Apply essential Arm Neoverse PMU counters for analysis +## Summary -Neoverse cores expose approximately 100 hardware events optimized for server workloads. The core set for top-down analysis includes: +Arm Neoverse V2 employs a concise, two-stage top-down methodology built around an 8-slot rename unit. -| Event Name | Purpose / Usage | -| :-------------------- | :--------------------------------------------------------------------------------------- | -| `CPU_CYCLES` | Core clock cycles (baseline for normalization). | -| `OP_SPEC` | Speculatively executed micro-operations (used as slot denominator). | -| `OP_RETIRED` | Retired micro-operations (used to measure useful work). | -| `INST_RETIRED` | Instructions retired (architectural measure; used for IPC, MPKI normalization). | -| `INST_SPEC` | Instructions speculatively executed (needed for operation mix and speculation analysis). | -| `STALL_SLOT` | Total stall slots (foundation for efficiency metrics). | -| `STALL_SLOT_FRONTEND` | Stall slots due to frontend resource constraints. | -| `STALL_SLOT_BACKEND` | Stall slots due to backend resource constraints. | -| `BR_RETIRED` | Branches retired (baseline for branch misprediction ratio). | -| `BR_MIS_PRED_RETIRED` | Mispredicted branches retired (branch effectiveness, speculation waste). | -| `L1I_CACHE_REFILL` | Instruction cache refills (frontend stalls due to I-cache misses). | -| `ITLB_WALK` | Instruction TLB walks (frontend stalls due to translation). | -| `L1D_CACHE_REFILL` | Data cache refills (backend stalls due to L1D misses). | -| `L2D_CACHE_REFILL` | Unified L2 cache refills (backend stalls from L2 misses). | -| `LL_CACHE_MISS_RD` | Last-level/system cache read misses (backend stalls from LLC/memory). | -| `DTLB_WALK` | Data TLB walks (backend stalls due to translation). 
| -| `MEM_ACCESS` | Total memory accesses (baseline for cache/TLB effectiveness ratios). | +Stage 1 classifies total slots into Retiring, Bad Speculation, Frontend Bound, and Backend Bound. +Stage 2 uses effectiveness groups to investigate specific subsystems such as branch prediction, cache, and memory. +This model mirrors Intel's top-down philosophy so you can compare the top-level categories. diff --git a/content/learning-paths/cross-platform/topdown-compare/1c-compare-arch.md b/content/learning-paths/cross-platform/topdown-compare/1c-compare-arch.md index f1541c9302..7ecc5310b8 100644 --- a/content/learning-paths/cross-platform/topdown-compare/1c-compare-arch.md +++ b/content/learning-paths/cross-platform/topdown-compare/1c-compare-arch.md @@ -5,37 +5,79 @@ weight: 6 ### FIXED, DO NOT MODIFY layout: learningpathall --- + ## Contrast Intel and Arm Neoverse implementation approaches -After understanding each architecture's methodology individually, you can now examine how they differ in implementation while achieving equivalent analysis capabilities. Both architectures implement the same fundamental approach with architecture-specific adaptations: +After examining each architecture individually, it's clear that Intel x86 and Arm Neoverse V2 share the same top-down philosophy but differ in their implementation, scope, and event model. + +Both use slot-based accounting to represent potential pipeline issue opportunities per cycle. However, Intel defines four issue slots per cycle (machine width = 4 to 6 µops per cycle, depending on microarchitecture), while Neoverse V2 defines eight rename slots per cycle for its top-down accounting. + +### Key shared principles + +Here are some concepts that are shared between Intel x86 and Arm: -- Slot-based accounting: pipeline utilization measured in issue or rename slots -- Hierarchical analysis: broad classification followed by drill-down into dominant bottlenecks -- Resource attribution: map performance issues to specific CPU micro-architectural components +- Slot-based utilization to measure pipeline efficiency in terms of µop issue or rename slots per cycle +- Four common categories at the top level: Retiring, Bad Speculation, Frontend Bound, Backend Bound +- Quantitative normalization for metrics that are expressed as a percentage of total available slots (width × cycles) +- Resource attribution to map inefficiencies to architectural subsystems such as frontend, backend, or memory -## Compare multi-level hierarchical and resource groups methodologies +### Philosophical differences -| Aspect | Intel x86 | Arm Neoverse | +Intel x86 favors a hierarchical multi-level drill-down, while Arm emphasizes two-stage flexibility. + +Intel's TMAM expands from high-level slots to detailed microarchitectural causes, and Arm's model classifies slot usage first, then groups detailed metrics into resource effectiveness groups including cache, TLB, branch, and operation mix that can be examined independently. 
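+
+To make the contrast concrete, the commands below sketch how each workflow typically starts. They assume Linux `perf` on the x86 machine, the Arm telemetry solution's `topdown-tool` on the Neoverse machine, and `./your_app` as a placeholder workload; metric group names and options vary by tool version and CPU, so check `perf list` and `topdown-tool --help` before relying on them.
+
+```bash
+# Intel x86: start at Level 1, then descend the hierarchy level by level
+perf stat --topdown -a -- sleep 10        # system-wide Level 1 snapshot
+perf stat -M TopdownL1 -- ./your_app      # per-process, where the metric group is available
+
+# Arm Neoverse: classify Stage 1 first, then jump to any effectiveness group
+topdown-tool -m Cycle_Accounting ./your_app
+```
+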
+ +## Compare hierarchical versus grouped methodologies + +| Aspect | Intel x86 | Arm Neoverse V2 | | :-- | :-- | :-- | -| Hierarchy Model | Multi-level tree (Level 1 → Level 2 → Level 3+) | Two-stage: Topdown Level 1 + Resource Groups | -| Slot Width | 4 issue slots per cycle (typical) | 8 rename slots per cycle (Neoverse V1) | -| Formula Basis | Micro-operation (uop) centric | Operation and cycle centric | -| Event Naming | Intel-specific mnemonics | Arm-specific mnemonics | -| Drill-down Strategy | Strict hierarchical descent | Exploration by resource groups | +| Analysis Model | Multi-level hierarchy (Levels 1 → 2 → 3 → 4) | Two-stage model: Stage 1 Top-Down L1 + Stage 2 Resource Groups | +| Machine / Slot Width | 4–6 issue slots per cycle (typically 4 for Skylake/Ice Lake, 6 for Sapphire Rapids and later) | 8 rename slots per cycle for Neoverse V2 | +| Measurement Basis | µops issued and retired per slot | µops speculatively executed and retired per slot | +| Formula Structure | Uses `UOPS_RETIRED.*`, `IDQ_UOPS_NOT_DELIVERED.*`, and derived ratios | Uses `STALL_SLOT_*`, `OP_SPEC`, `OP_RETIRED`, `BR_MIS_PRED` events | +| Hierarchy Depth | Four levels with formal sub-categories (Latency/Bandwidth, Memory/Core etc.) | Stage 1 Top-down L1 + Stage 2 effectiveness groups (Cycle, Branch, Cache) | +| Drill-Down Approach | Sequential hierarchical descent | Parallel exploration by resource group | +| Output Units | Percent of slots utilized per category | Percent of slots utilized per category (normalized to 8 slots/cycle) | ## Map equivalent PMU counters across architectures -| Performance Question | x86 Intel Events | Arm Neoverse Events | +The table below shows PMU events used to answer the analysis questions. + +| Performance Question | Intel x86 PMU Events | Arm Neoverse V2 PMU Events | Description | +| :-- | :-- | :-- | :-- | +| Is the frontend limiting µop delivery? | `IDQ_UOPS_NOT_DELIVERED.CORE` | `STALL_SLOT_FRONTEND` | Stalls due to instruction-fetch or decode limits | +| Is speculation causing waste? | `BR_MISP_RETIRED.*`, `MACHINE_CLEARS.*` | `BR_MIS_PRED`, `BR_MIS_PRED_RETIRED` | Lost slots/cycles from mispredicted or squashed µops | +| Is memory the bottleneck? | `CYCLE_ACTIVITY.STALLS_L3_MISS`, `CYCLE_ACTIVITY.STALLS_MEM_ANY` | `STALL_SLOT_BACKEND`, `L1D_CACHE_REFILL`, `L2D_CACHE_REFILL`, `LL_CACHE_MISS_RD` | Backend stalls waiting for cache/memory refills | +| How efficient is the cache hierarchy? | `MEM_LOAD_RETIRED.L1_HIT/L2_HIT/L3_HIT`, `MEM_LOAD_RETIRED.L3_MISS` | `L1D_CACHE_REFILL`, `L2D_CACHE_REFILL`, `LL_CACHE_MISS_RD` | Indicates cache locality and hierarchy effectiveness | +| Branch predictor accuracy? | `BR_MISP_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.ALL_BRANCHES` | `BR_MIS_PRED_RETIRED / BR_RETIRED` | Fraction of mispredicted branches affecting control-flow stalls | + +On Intel, `MACHINE_CLEARS.*` represent pipeline flushes caused by memory ordering violations, self-modifying code, or other speculation faults. + +On Arm Neoverse V2, equivalent lost work appears under Bad Speculation through `BR_MIS_PRED` and `STALL_SLOT` accounting, which include misprediction recovery and pipeline refill overhead. + +## Interpretation and cross-platform analysis + +| Focus | Intel x86 Approach | Arm Neoverse V2 Approach | | :-- | :-- | :-- | -| Frontend bound? | `IDQ_UOPS_NOT_DELIVERED.*` | `STALL_SLOT_FRONTEND` | -| Bad speculation? | `BR_MISP_RETIRED.*` | `BR_MIS_PRED_RETIRED` | -| Memory bound? 
| `CYCLE_ACTIVITY.STALLS_L3_MISS` | `L1D_CACHE_REFILL`, `L2D_CACHE_REFILL` | -| Cache effectiveness? | `MEM_LOAD_RETIRED.L3_MISS_PS` | Cache refill metrics / Cache access metrics | +| Frontend vs Backend Balance | Measured by slot distribution from `IDQ_UOPS_NOT_DELIVERED` and `CYCLE_ACTIVITY` counters | Measured using `STALL_SLOT_FRONTEND` and `STALL_SLOT_BACKEND` | +| Speculative Execution Impact | Explicit "Bad Speculation" slot fraction + `MACHINE_CLEARS.*` | Derived from `BR_MIS_PRED`, `BR_MIS_PRED_RETIRED`, and `OP_SPEC/OP_RETIRED` ratios | +| Cache and Memory Hierarchy | Layered: L1 → L2 → L3 → DRAM via `CYCLE_ACTIVITY` events | Effectiveness groups: L1I/L1D/L2/LL with MPKI and hit/miss ratios | +| Stall Accounting Granularity | Strict hierarchical attribution (single bottleneck per slot) | Flexible overlap across groups—multiple concurrent stall sources possible | +| Metric Normalization | All metrics normalized to total slots = machine_width × CPU cycles | All metrics normalized to total slots = CPU cycles × 8 (rename width) | + +## Practical cross-platform guidance + +- Normalize correctly: Intel uses width × CPU cycles (where width could be 4 or 6) and Arm Neoverse V2 uses 8 × CPU cycles +- Interpret "slots" consistently: both represent theoretical µop issue capacity per cycle +- Compare memory and cache behavior using MPKI and refill events rather than identical counter names +- Speculation loss differs as Intel isolates machine clears and Arm includes misprediction recovery directly in Bad Speculation + +## Summary + +Intel's TMAM and Arm Neoverse V2 top-down analysis both translate raw PMU data into actionable insights about how efficiently each core issues, executes, and retires µops. -While PMU counter names and calculation formulas differ significantly between Intel x86 and Arm Neoverse architectures, both provide equivalent top-down analysis capabilities. Understanding these methodological differences enables effective cross-platform performance optimization: +Intel x86 uses a deep, multi-level hierarchy emphasizing structured drill-down from slots to hardware events. -- **Intel x86**: Use `perf stat --topdown` for Level 1 analysis, then drill down through hierarchical levels -- **Arm Neoverse**: Use `topdown-tool -m Cycle_Accounting` for Stage 1, then explore resource effectiveness groups -- **Cross-platform strategy**: Focus on the four common categories while adapting tools and counter interpretations to each architecture +Arm Neoverse V2 uses a simplified, two-stage model with explicit slot-based formulas and resource-group flexibility. -Continue to the next step to see practical examples comparing both methodologies. \ No newline at end of file +By understanding these conceptual and measurement differences, you can interpret performance data consistently across architectures, enabling direct comparison of Retiring %, Frontend/Backend Bound %, and Bad Speculation % to optimize workloads for both x86 and Arm servers. 
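Before comparing numbers across machines, it is worth confirming which of these events your kernel and PMU driver actually expose. A quick check with `perf list` looks like this; exact event names vary by CPU generation and kernel version, so treat missing matches as a prompt to update the kernel or fall back to raw event codes:

```console
# On an Intel x86 system
perf list | grep -i -E 'topdown|idq_uops_not_delivered|uops_retired'

# On an Arm Neoverse V2 system
perf list | grep -i -E 'stall_slot|op_spec|op_retired|br_mis_pred'
```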
diff --git a/content/learning-paths/cross-platform/topdown-compare/2-code-examples.md b/content/learning-paths/cross-platform/topdown-compare/2-code-examples.md index 40e2e7152e..f1ce7511db 100644 --- a/content/learning-paths/cross-platform/topdown-compare/2-code-examples.md +++ b/content/learning-paths/cross-platform/topdown-compare/2-code-examples.md @@ -6,17 +6,21 @@ weight: 7 layout: learningpathall --- -## Cross-platform performance analysis example +## Cross-platform performance analysis example To compare x86 and Arm Neoverse top-down methodologies, you can run a backend-bound benchmark that demonstrates PMU counter differences between architectures. -You can prepare the application and test it on both x86 and Arm Neoverse Linux systems. You will need a C compiler installed, [GCC](/install-guides/gcc/native/) or Clang, and [Perf](/install-guides/perf/) installed on each system. For Arm systems, you'll also need [topdown-tool](/install-guides/topdown-tool/). Refer to the package manager for your Linux distribution for installation information. +You need a C compiler installed, [GCC](/install-guides/gcc/native/) or Clang, and [Perf](/install-guides/perf/) installed on each system. For Arm systems, you also need [topdown-tool](/install-guides/topdown-tool/). Refer to the package manager for your Linux distribution for installation information. -Use a text editor to copy the code below to a file named `test.c` +Use a text editor to copy the code below to a file named `core-bound-div-chain.c`: ```C +// Usage: ./core-bound-div-chain +// Intention: Backend/Core-bound via FP64 divide dependency chain. + #include #include +#include int main(int argc, char *argv[]) { if (argc != 2) { @@ -24,118 +28,148 @@ int main(int argc, char *argv[]) { return 1; } - long long num_iterations = atoll(argv[1]); - if (num_iterations <= 0) { + long long iters = atoll(argv[1]); + if (iters <= 0) { fprintf(stderr, "Number of iterations must be a positive integer.\n"); return 1; } - // Using volatile tells the compiler not to optimize this variable away. - // We initialize it to a non-trivial value. - volatile double result = 1.23456789; + volatile double result = 1.23456789; // keep it live between iterations + volatile double divisor = 1.00000001; // volatile thwarts constant folding & reciprocal - printf("Performing %lld dependent floating-point divisions...\n", num_iterations); + // Optional warmup: helps get steady-state frequency/thermal behavior + for (int w = 0; w < 1000000; ++w) { result /= divisor; } - // This loop creates a long dependency chain of floating-point divisions. - // Division is a high-latency operation. The dependency between iterations - // means the CPU backend will be stalled waiting for the result of the - // previous division before it can start the next one. This creates a - // classic backend-bound scenario, specifically core-bound. - for (long long i = 0; i < num_iterations; ++i) { - result /= 1.00000001; - } + printf("Running %lld dependent FP64 divisions...\n", iters); + for (long long i = 0; i < iters; ++i) { + // True dependency chain: next division waits for previous to complete + result /= divisor; - printf("Done. Final result: %f\n", (double)result); + // Tiny perturbation keeps the compiler cautious while having negligible impact + if ((i & 0x3FFFFF) == 0) { asm volatile("" ::: "memory"); } + } + // Print result to keep operations observable + printf("Done. 
Final result: %.9f\n", (double)result); return 0; } ``` -This program demonstrates a backend-bound workload that will show high `STALL_SLOT_BACKEND` on Arm Neoverse and high `Backend_Bound` percentage on x86. It takes a single command-line argument specifying the number of iterations to run. The sequential floating-point divisions create a dependency chain of high-latency operations, simulating a core-bound workload where each iteration must wait for the previous division to complete. +The example code performs one floating-point divide per iteration. The next divide depends on the previous result (result is reused). Divides are high-latency, low-throughput and generally take about 20–40 cycles on Intel x86 and Neoverse V2. Because each iteration depends on the last, the CPU can't overlap operations (no instruction-level parallelism). The backend's execution resources, specifically the FP divide unit, become the bottleneck. The frontend and speculation engines have no problem supplying work. + +This example is not a realistic application, but creates a controlled environment where the CPU's backend execution become the bottleneck. While such tight loops of dependent divides rarely exist in production code, similar patterns occur in scientific, financial, and engineering applications that perform iterative numerical calculations. In those cases, limited instruction-level parallelism and high operation latency lead to the same core-bound behavior. + +By isolating the pipeline dynamics the top-down performance metrics are easy to observe and interpret. By removing noise from memory access, control flow, and cache effects, the program highlights how the top-down methodology identifies backend stalls and distinguishes execution-resource bottlenecks from frontend or memory limitations. + +The program takes a single command-line argument specifying the number of iterations to run. The sequential floating-point divisions create a dependency chain of high-latency operations, simulating a core-bound workload where each iteration must wait for the previous division to complete. Build the application using GCC: ```console -gcc -O3 -march=native -o test test.c +gcc -O3 -march=native -o core-bound-div-chain core-bound-div-chain.c ``` You can also use Clang by substituting `clang` instead of `gcc` in the command above. -Run the application and pin it to one core to make the numbers more consistent: +{{% notice Note %}} +If you use GCC 13 and newer for Arm compilation, using `-march=native` is recommended. For older versions you should use `-mcpu=neoverse-v2`. +{{% /notice %}} + +Run the application using `taskset` to pin it to one core to make the numbers more consistent: ```console -taskset -c 1 ./test 1000000000 +taskset -c 1 ./core-bound-div-chain 1000000000 ``` -The output is similar to: +The output is: ```output -Performing 1000000000 dependent floating-point divisions... -Done. Final result: 0.000056 +Running 1000000000 dependent FP64 divisions... +Done. Final result: 0.000055492 ``` -## Collect x86 top-down Level 1 metrics with Perf +You can now try top-down on Intel x86 and Arm Neoverse V2. -Linux Perf computes 4-level hierarchical top-down breakdown using PMU counters like `UOPS_RETIRED.RETIRE_SLOTS`, `IDQ_UOPS_NOT_DELIVERED.CORE`, and `CPU_CLK_UNHALTED.THREAD` for the four categories: Retiring, Bad Speculation, Frontend Bound, and Backend Bound. 
+## Collect Intel x86 top-down Level 1 metrics with Perf + +Linux Perf collects level 1 top-down information using PMU counters like `UOPS_RETIRED.RETIRE_SLOTS`, `IDQ_UOPS_NOT_DELIVERED.CORE`, and `CPU_CLK_UNHALTED.THREAD` for the four categories: Retiring, Bad Speculation, Frontend Bound, and Backend Bound. Use `perf stat` on the pinned core to collect Level 1 metrics: ```console -taskset -c 1 perf stat -C 1 --topdown ./test 1000000000 +taskset -c 1 perf stat -C 1 --topdown ./core-bound-div-chain 1000000000 ``` -The output will be similar to: +The expected output is similar to: ```output -Performing 1000000000 dependent floating-point divisions... -Done. Final result: 0.000056 +Running 1000000000 dependent FP64 divisions... +Done. Final result: 0.000055492 Performance counter stats for 'CPU(s) 1': retiring bad speculation frontend bound backend bound -S0-D0-C1 1 8.5% 0.0% 0.1% 91.4% +S0-D0-C1 1 18.3% 0.0% 0.1% 81.6% + + 6.119817329 seconds time elapsed - 6.052117775 seconds time elapsed ``` -You see a very large `backend bound` component for this program. +Here's a summary of the results: + +| Metric | Result | Interpretation | +| :----------------------- | :---------------------------------------------------------------------------------------------- | :------------- | +| Retiring is 18.3% | Roughly one-fifth of available issue slots were used for useful µops that successfully retired. | | +| Bad Speculation is 0% | The program executed without branch mispredictions or pipeline flushes. | | +| Frontend Bound is 0.1% | The instruction fetch and decode stages easily kept the backend supplied with µops. | | +| Backend Bound is 81.6% | The vast majority of slots were stalled in the backend, waiting for execution resources. | | + +This pattern is expected from the dependent floating-point division chain: each divide must wait for the previous result, and the FP-divide unit has a latency of roughly 20–40 cycles. -You can also run with the `-M topdownl1` argument with Perf. +The high Backend Bound and low Frontend/Speculation percentages confirm that performance is limited by execution-unit latency, not instruction supply, memory access, or branching. + +In real applications, similar backend/core-bound behavior appears in compute-intensive numerical kernels that contain long dependency chains or high-latency math operations. + +You can also use the `-M topdownl1` argument with Perf: ```console -taskset -c 1 perf stat -C 1 -M topdownl1 ./test 1000000000 +taskset -c 1 perf stat -C 1 -M topdownl1 ./core-bound-div-chain 1000000000 ``` -The output is similar to: +The expected output is similar to: ```output -Performing 1000000000 dependent floating-point divisions... -Done. Final result: 0.000056 +Running 1000000000 dependent FP64 divisions... +Done. 
Final result: 0.000055492 Performance counter stats for 'CPU(s) 1': - 3,278,902,619 uops_issued.any # 0.00 Bad_Speculation (14.30%) - 19,185,808,092 cpu_clk_unhalted.thread # 0.04 Retiring (14.30%) - 3,275,536,897 uops_retired.retire_slots (14.30%) - 1,065,517 int_misc.recovery_cycles (14.30%) - 3,263,874,383 uops_issued.any # 0.96 Backend_Bound (14.33%) - 28,107,558 idq_uops_not_delivered.core (28.64%) - 631,768 int_misc.recovery_cycles (42.90%) - 19,173,526,414 cpu_clk_unhalted.thread (57.17%) - 19,176,373,078 cpu_clk_unhalted.thread # 0.00 Frontend_Bound (42.79%) - 25,090,380 idq_uops_not_delivered.core (42.79%) + 7,030,153,262 uops_issued.any # 0.00 Bad_Speculation (14.30%) + 19,206,823,557 cpu_clk_unhalted.thread # 0.09 Retiring (14.30%) + 7,026,625,438 uops_retired.retire_slots (14.30%) + 1,111,503 int_misc.recovery_cycles (14.30%) + 7,017,115,812 uops_issued.any # 0.91 Backend_Bound (14.33%) + 31,942,584 idq_uops_not_delivered.core (28.60%) + 704,834 int_misc.recovery_cycles (42.87%) + 19,201,754,818 cpu_clk_unhalted.thread (57.15%) + 19,203,260,401 cpu_clk_unhalted.thread # 0.00 Frontend_Bound (42.82%) + 26,956,040 idq_uops_not_delivered.core (42.82%) cpu_clk_unhalted.thread - 6.029283206 seconds time elapsed + 6.052936784 seconds time elapsed + ``` -Again, showing a `Backend_Bound` value that is very high (0.96). Notice the x86-specific PMU counters: -- `uops_issued.any` and `uops_retired.retire_slots` for micro-operation accounting -- `idq_uops_not_delivered.core` for frontend delivery failures -- `cpu_clk_unhalted.thread` for cycle normalization +This output provides the raw event counts and derived Top-Down Level 1 metrics for a single core running the dependent floating-point divide workload. -If you want to learn more, you can continue with the Level 2 and Level 3 hierarchical analysis. +| Metric |Interpretation | +| :------------------------ | :------------------------------------------------------------------------------------------------------------------------------------------------ | +| Retiring is 9% | This is expected because each iteration performs one divide dependent on the previous result. | +| Bad Speculation is 0% | No wasted work from mispredicted branches or pipeline flushes. | +| Frontend Bound is 0% | The frontend easily supplied µops; instruction delivery was not a limiting factor. | +| Backend Bound is 91% | The majority of slots were stalled waiting for execution resources, specifically the floating-point divide unit. | +You can continue with the Level 2 and Level 3 hierarchical analysis if you want to learn more. ## Use the Arm Neoverse top-down methodology @@ -146,147 +180,67 @@ Make sure you install the Arm topdown-tool using the [Telemetry Solution install Collect general metrics including Instructions Per Cycle (IPC): ```console -taskset -c 1 topdown-tool -m General ./test 1000000000 +taskset -c 1 topdown-tool -m General ./core-bound-div-chain 1000000000 ``` -The output is similar to: +The expected output is similar to: ```output -Performing 1000000000 dependent floating-point divisions... -Monitoring command: test. Hit Ctrl-C to stop. -Run 1 -Done. Final result: 0.000056 +Running 1000000000 dependent FP64 divisions... +Done. 
Final result: 0.000055492 CPU Neoverse V2 metrics └── Stage 2 (uarch metrics) └── General (General) └── ┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┓ ┃ Metric ┃ Value ┃ Unit ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━┩ - │ Instructions Per Cycle │ 0.324 │ per cycle │ + │ Instructions Per Cycle │ 0.485 │ per cycle │ └────────────────────────┴───────┴───────────┘ ``` -Collect the Stage 1 topdown metrics using Arm's cycle accounting: +The reported Instructions Per Cycle (IPC) of 0.485 means that, on average, the Neoverse V2 core retired about 0.5 instructions every clock cycle while running the workload. + +Neoverse V2 can theoretically retire up to eight µops per cycle, so achieving only 0.485 IPC indicates that the core was mostly waiting rather than issuing useful work each cycle. This aligns with expectations for the dependent floating-point division chain, where every iteration must wait for the previous division to finish. The long divide latency prevents instruction-level parallelism, causing the pipeline to spend most of its time stalled in the backend. + +Collect the Stage 1 top-down metrics using Arm's cycle accounting: ```console -taskset -c 1 topdown-tool -m Cycle_Accounting ./test 1000000000 +taskset -c 1 topdown-tool -m Cycle_Accounting ./core-bound-div-chain 1000000000 ``` -The output is similar to: +The expected output is similar to: ```output -Performing 1000000000 dependent floating-point divisions... -Monitoring command: test. Hit Ctrl-C to stop. -Run 1 -Done. Final result: 0.000056 +Running 1000000000 dependent FP64 divisions... +Done. Final result: 0.000055492 CPU Neoverse V2 metrics └── Stage 2 (uarch metrics) └── Cycle Accounting (Cycle_Accounting) └── ┏━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━┓ ┃ Metric ┃ Value ┃ Unit ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━┩ - │ Backend Stalled Cycles │ 93.22 │ % │ - │ Frontend Stalled Cycles │ 0.03 │ % │ + │ Backend Stalled Cycles │ 89.22 │ % │ + │ Frontend Stalled Cycles │ 0.02 │ % │ └─────────────────────────┴───────┴──────┘ ``` -This confirms the example has high backend stalls, equivalent to x86's Backend_Bound category. Notice how Arm's Stage 1 uses percentage of cycles rather than Intel's slot-based accounting. - -You can continue to use the `topdown-tool` for additional microarchitecture exploration. - -For L1 data cache: - -```console -taskset -c 1 topdown-tool -m L1D_Cache_Effectiveness ./test 1000000000 -``` - -The output is similar to: - -```output -Performing 1000000000 dependent floating-point divisions... -Monitoring command: test. Hit Ctrl-C to stop. -Run 1 -Done. Final result: 0.000056 -CPU Neoverse V2 metrics -└── Stage 2 (uarch metrics) - └── L1 Data Cache Effectiveness (L1D_Cache_Effectiveness) - ├── Follows - │ └── Backend Bound (backend_bound) - └── ┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ - ┃ Metric ┃ Value ┃ Unit ┃ - ┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ - │ L1D Cache Miss Ratio │ 0.000 │ per cache access │ - │ L1D Cache MPKI │ 0.129 │ misses per 1,000 instructions │ - └──────────────────────┴───────┴───────────────────────────────┘ -``` - -For L1 instruction cache effectiveness: - -```console -taskset -c 1 topdown-tool -m L1I_Cache_Effectiveness ./test 1000000000 -``` - -The output is similar to: - -```output -Performing 1000000000 dependent floating-point divisions... -Monitoring command: test. Hit Ctrl-C to stop. -Run 1 -Done. 
Final result: 0.000056 -CPU Neoverse V2 metrics -└── Stage 2 (uarch metrics) - └── L1 Instruction Cache Effectiveness (L1I_Cache_Effectiveness) - ├── Follows - │ └── Frontend Bound (frontend_bound) - └── ┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ - ┃ Metric ┃ Value ┃ Unit ┃ - ┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ - │ L1I Cache Miss Ratio │ 0.003 │ per cache access │ - │ L1I Cache MPKI │ 0.474 │ misses per 1,000 instructions │ - └──────────────────────┴───────┴───────────────────────────────┘ -``` - -For last level cache: - -```console -taskset -c 1 topdown-tool -m LL_Cache_Effectiveness ./test 1000000000 -``` +The Cycle Accounting metrics show that during execution, 89.22% of all cycles were backend-stalled, while only 0.02% were frontend-stalled. -The output is similar to: +This means the Neoverse V2 core spent nearly all of its time waiting for backend execution resources rather than for instructions to be fetched or decoded. -```output -Performing 1000000000 dependent floating-point divisions... -Monitoring command: test. Hit Ctrl-C to stop. -Run 1 -Done. Final result: 0.000056 -CPU Neoverse V2 metrics -└── Stage 2 (uarch metrics) - └── Last Level Cache Effectiveness (LL_Cache_Effectiveness) - ├── Follows - │ ├── Backend Bound (backend_bound) - │ └── Frontend Bound (frontend_bound) - └── ┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ - ┃ Metric ┃ Value ┃ Unit ┃ - ┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ - │ LL Cache Read Hit Ratio │ nan │ per cache access │ - │ LL Cache Read Miss Ratio │ nan │ per cache access │ - │ LL Cache Read MPKI │ 0.000 │ misses per 1,000 instructions │ - └──────────────────────────┴───────┴───────────────────────────────┘ -``` +The result confirms that the workload is backend/core-bound by arithmetic execution latency. The frontend and memory subsystems remained fully capable of feeding the pipeline. For operation mix: ```console -taskset -c 1 topdown-tool -m Operation_Mix ./test 1000000000 +taskset -c 1 topdown-tool -m Operation_Mix ./core-bound-div-chain 1000000000 ``` -The output is similar to: +The expected output is similar to: ```output -Performing 1000000000 dependent floating-point divisions... -Monitoring command: test. Hit Ctrl-C to stop. -Run 1 -Done. Final result: 0.000056 +Running 1000000000 dependent FP64 divisions... +Done. 
Final result: 0.000055492 CPU Neoverse V2 metrics └── Stage 2 (uarch metrics) └── Speculative Operation Mix (Operation_Mix) @@ -296,26 +250,49 @@ CPU Neoverse V2 metrics └── ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━┓ ┃ Metric ┃ Value ┃ Unit ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━┩ - │ Barrier Operations Percentage │ ❌ │ % │ - │ Branch Operations Percentage │ ❌ │ % │ + │ Barrier Operations Percentage │ 0.00 │ % │ + │ Branch Operations Percentage │ 22.16 │ % │ │ Crypto Operations Percentage │ 0.00 │ % │ │ Integer Operations Percentage │ 33.52 │ % │ - │ Load Operations Percentage │ 16.69 │ % │ - │ Floating Point Operations Percentage │ 16.51 │ % │ + │ Load Operations Percentage │ 22.19 │ % │ + │ Floating Point Operations Percentage │ 11.03 │ % │ │ Advanced SIMD Operations Percentage │ 0.00 │ % │ - │ Store Operations Percentage │ 16.58 │ % │ + │ Store Operations Percentage │ 11.11 │ % │ │ SVE Operations (Load/Store Inclusive) Percentage │ 0.00 │ % │ └──────────────────────────────────────────────────┴───────┴──────┘ ``` +The Operation Mix report shows the relative share of different instruction types that executed on the Neoverse V2 core. +Even though this benchmark performs only a single arithmetic operation in the loop, the compiler and runtime add supporting instructions for loop control, memory access, and branching. + +Key observations: + +| Metric | Interpretation | +| :-------------------------------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Floating Point Operations is 11% | These are the FP64 divide instructions that dominate runtime latency. Each divide is expensive (≈ 25–35 cycles) and fully serialized, explaining the high backend stall percentage seen earlier. | +| Integer Operations is 33% | Loop-index increments, comparisons, and address arithmetic generated by the compiler. They retire quickly and contribute little to total time. | +| Load ≈ 22 % / Store is 11% | Reading and writing the `result` variable in memory or registers each iteration. They show the loop’s basic data movement overhead. | +| Branch is 22% | The loop’s conditional branch that repeats until the iteration limit. Although frequent, branches are correctly predicted, so they don’t cause speculation stalls. | +| SIMD / SVE / Crypto / Barrier is 0% | None of these specialized instruction classes are used, as expected for a scalar FP benchmark. | + +Overall, the mix confirms that this is a scalar floating-point workload with a high fraction of control and integer bookkeeping instructions. + +The small proportion of FP operations but their long latency explains why the backend-bound stalls dominate performance: a few slow FP divides hold up many lightweight integer and branch instructions waiting behind them. + ## Cross-architecture performance analysis summary -Both Arm Neoverse and modern x86 cores expose hardware PMU events that enable equivalent top-down analysis, despite different counter names and calculation methods. +Both Arm Neoverse V2 and Intel x86 cores expose rich hardware Performance Monitoring Unit (PMU) events that enable Top-Down analysis. Although the counter names, formulas, and tools differ, both methodologies can identify pipeline efficiency and identify where bottlenecks occur. 
+ +Intel x86 processors implement a multi-level hierarchical model known as the Top-Down Microarchitecture Analysis Methodology (TMAM). This approach uses slot-based pipeline accounting and PMU events such as UOPS_RETIRED.RETIRE_SLOTS, IDQ_UOPS_NOT_DELIVERED.CORE, and CPU_CLK_UNHALTED.THREAD to divide execution time into four categories: Retiring, Bad Speculation, Frontend Bound, and Backend Bound. + +Linux Perf provides a standard interface for this analysis through commands like `perf stat --topdown` and metric groups such as `perf stat -M topdownl1`. -Intel x86 processors use a four-level hierarchical methodology based on slot-based pipeline accounting, relying on PMU counters such as `UOPS_RETIRED.RETIRE_SLOTS`, `IDQ_UOPS_NOT_DELIVERED.CORE`, and `CPU_CLK_UNHALTED.THREAD` to break down performance into retiring, bad speculation, frontend bound, and backend bound categories. Linux Perf serves as the standard collection tool, using commands like `perf stat --topdown` and the `-M topdownl1` option for detailed breakdowns. +Arm Neoverse V2 offers a two-stage methodology implemented through the Arm Telemetry Solution and its `topdown-tool`. +Stage 1 measures the same four top-level categories using PMU events such as STALL_SLOT_BACKEND, STALL_SLOT_FRONTEND, OP_RETIRED, and OP_SPEC. -Arm Neoverse platforms implement a complementary two-stage methodology where Stage 1 focuses on topdown categories using counters such as `STALL_SLOT_BACKEND`, `STALL_SLOT_FRONTEND`, `OP_RETIRED`, and `OP_SPEC` to analyze pipeline stalls and instruction retirement. Stage 2 evaluates resource effectiveness, including cache and operation mix metrics through `topdown-tool`, which accepts the desired metric group via the `-m` argument. +Stage 2 expands the analysis into resource-effectiveness groups including Cycle Accounting, Cache Effectiveness, Branch Effectiveness, and Operation Mix. This modular structure enables flexible exploration of specific pipeline subsystems without requiring a strict hierarchy. -Both architectures identify the same performance bottleneck categories, enabling similar optimization strategies across Intel and Arm platforms while accounting for methodological differences in measurement depth and analysis approach. +When applied to the same floating-point division workload, both frameworks produced the same conclusion: +the program was Backend/Core-Bound, limited by execution-unit latency rather than instruction fetch, speculation, or memory access. 
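As an optional cross-check, you can read the underlying Arm events directly with `perf stat` and compare them with the `topdown-tool` output. The event names below are the standard Neoverse PMU names exposed by recent Linux kernels; availability depends on your kernel version and PMU driver:

```console
taskset -c 1 perf stat -e cycles,op_spec,op_retired,stall_slot_frontend,stall_slot_backend \
  ./core-bound-div-chain 1000000000
```

Dividing `stall_slot_backend` by cycles × 8 gives the slot-based Backend Bound fraction, which should tell the same story as the cycle-based percentages reported by `topdown-tool`.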
diff --git a/content/learning-paths/cross-platform/topdown-compare/_index.md b/content/learning-paths/cross-platform/topdown-compare/_index.md index 0acdd66b2e..bf454668bb 100644 --- a/content/learning-paths/cross-platform/topdown-compare/_index.md +++ b/content/learning-paths/cross-platform/topdown-compare/_index.md @@ -12,7 +12,7 @@ learning_objectives: prerequisites: - Familiarity with performance analysis on Linux systems using Perf and PMU counters - - Access to Arm Neoverse and Intel x86 Linux systems for hands-on examples + - Access to Arm Neoverse V2 and Intel x86 Linux systems to run the code example - Basic understanding of CPU pipeline concepts and performance bottlenecks author: @@ -38,9 +38,13 @@ shared_between: further_reading: - resource: - title: Arm Neoverse V1 Top-down Methodology for Performance Analysis & Telemetry Specification - link: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/arm-neoverse-v1-top-down-methodology - type: blog + title: Arm Neoverse V2 Core Telemetry Specification + link: https://developer.arm.com/documentation/109528/0200/?lang=en + type: documentation + - resource: + title: Arm Neoverse V2 Software Optimization Guide + link: https://developer.arm.com/documentation/109898/latest/ + type: documentation - resource: title: Performance Analysis and Tuning on Modern CPUs link: https://www.amazon.com/Performance-Analysis-Tuning-Modern-CPUs/dp/B0DNQZJ92S diff --git a/content/learning-paths/embedded-and-microcontrollers/_index.md b/content/learning-paths/embedded-and-microcontrollers/_index.md index 175ba5cb29..813c94fc0e 100644 --- a/content/learning-paths/embedded-and-microcontrollers/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/_index.md @@ -13,7 +13,7 @@ operatingsystems_filter: - Baremetal: 30 - Linux: 31 - macOS: 7 -- RTOS: 9 +- RTOS: 10 - Windows: 4 subjects_filter: - CI-CD: 5 @@ -22,7 +22,7 @@ subjects_filter: - Libraries: 3 - ML: 17 - Performance and Architecture: 21 -- RTOS Fundamentals: 4 +- RTOS Fundamentals: 5 - Security: 2 - Virtual Hardware: 2 subtitle: Learn best practices for microcontroller development @@ -37,7 +37,7 @@ tools_software_languages_filter: - Arm Fast Models: 4 - Arm Virtual Hardware: 12 - Assembly: 1 -- C: 4 +- C: 5 - C++: 1 - ChatGPT: 1 - Clang: 1 @@ -96,6 +96,6 @@ tools_software_languages_filter: - TVMC: 1 - vcpkg: 1 - Yocto Project: 1 -- Zephyr: 1 +- Zephyr: 2 weight: 5 --- diff --git a/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/customcomponentdeployment.md b/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/customcomponentdeployment.md index 2496172830..f28ccb6345 100644 --- a/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/customcomponentdeployment.md +++ b/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/customcomponentdeployment.md @@ -12,7 +12,7 @@ In this section, we will create an AWS IoT Greengrass deployment that will downl ### 0. (Non-Camera Edge Devices Only): Additional Custom Component -If your edge device does not contain a camera (i.e. EC2 edge device), you will need to deploy an additional custom component. Please follow [these steps](./NonCameraCustomComponent.md) to get the additional component created. You will be selecting this component in addition to the custom component we created for the Edge Impulse "Runner" service. +If your edge device does not contain a camera (i.e. 
EC2 edge device), you will need to deploy an additional custom component. Please follow [these steps](/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/noncameracustomcomponent/) to get the additional component created. You will be selecting this component in addition to the custom component we created for the Edge Impulse "Runner" service. ### 1. Deploy the custom component to a selected Greengrass edge device or group of edge devices. diff --git a/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardwaresetup.md b/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardwaresetup.md index dafaace9c6..d8fb82612b 100644 --- a/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardwaresetup.md +++ b/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardwaresetup.md @@ -8,17 +8,15 @@ layout: learningpathall ## Edge Device Hardware Setup -First, an edge device must be setup. In the following sections, Linux-compatible edge devices are detailed to enable them to receive and run as a AWS IoT Greengrass edge device. The list of supported devices will grow over time. +##### First, an edge device must be setup. In the following sections, Linux-compatible edge devices are detailed to enable them to receive and run as a AWS IoT Greengrass edge device. The list of supported devices will grow over time. Please select one of the following and follow the "Setup" link: -Please select one of the following and follow the "Setup" link... +### Option 1: Ubuntu EC2 Instance [Setup](/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/HardwareSetupEC2/) -### Option 1: Ubuntu EC2 Instance [Setup](/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/HardwareSetupEC2/) +### Option 2: Qualcomm QC6490 Platforms with Ubuntu [Setup](/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/HardwareSetupQC6490Ubuntu/) -### Option 2: Qualcomm QC6490 Platforms with Ubuntu [Setup](/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/HardwareSetupQC6490Ubuntu/) +### Option 3: Nvidia Jetson Platforms with Jetpack 5.x/6.0 [Setup](/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/HardwareSetupNvidiaJetson/) -### Option 3: Nvidia Jetson Platforms with Jetpack 5.x/6.0 [Setup](/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/HardwareSetupNvidiaJetson/) - -### Option 4: Raspberry Pi 5 with RaspberryPi OS [Setup](/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/HardwareSetupRPi5/) +### Option 4: Raspberry Pi 5 with RaspberryPi OS [Setup](/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/HardwareSetupRPi5/) #### (More exciting device options will be added soon. Stay tuned!) 
\ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/hardwaresetupec2.md b/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardwaresetupec2.md similarity index 84% rename from content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/hardwaresetupec2.md rename to content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardwaresetupec2.md index f8aed88de1..c621196700 100644 --- a/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/hardwaresetupec2.md +++ b/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardwaresetupec2.md @@ -1,3 +1,10 @@ +--- +hide_from_navpane: true + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + ## Setup and Configuration for Ubuntu-based EC2 instance ### Create Ubuntu EC2 Instance @@ -6,42 +13,42 @@ AWS EC2 instances can be used to simulate edge devices when edge device hardware We'll start by opening our AWS Console and search for EC2: -![AWS Console](../images/EC2_Setup_1.png) +![AWS Console](./images/EC2_Setup_1.png) We'll now open the EC2 console page: -![AWS EC2 Console](../images/EC2_Setup_2.png) +![AWS EC2 Console](./images/EC2_Setup_2.png) Select "Launch instance". Provide a Name for the EC2 instance and select the "Ubuntu" Quick Start option. Additionally, select "64-bit(Arm)" as the architecture type and select "t4g.large" as the Instance type: -![Create EC2 Instance](../images/EC2_Setup_3.png) +![Create EC2 Instance](./images/EC2_Setup_3.png) Additionally, please click on "Create new Key Pair" and provide a name for a new SSH key pair that will be used to SSH into our EC2 instance. Press "Create key pair": -![Create EC2 Keypair](../images/EC2_Setup_4.png) +![Create EC2 Keypair](./images/EC2_Setup_4.png) >**_NOTE:_** >You will notice that a download will occur with your browser. Save off this key (a .pem file) as we'll use it shortly. Next, we need to edit our "Network Settings" for our EC2 instance... scroll down to "Network Settings" and press "Edit": -![Security Group](../images/EC2_Setup_4_ns.png) +![Security Group](./images/EC2_Setup_4_ns.png) Press "Add security group rule" and lets allow port tcp/4912: -![Security Group](../images/EC2_Setup_4_4912.png) +![Security Group](./images/EC2_Setup_4_4912.png) Lets also give the EC2 instance a bit more disk space. Please change the "8" to "28" here: -![Increase disk space](../images/EC2_Setup_5.png) +![Increase disk space](./images/EC2_Setup_5.png) Finally, press "Launch instance". You should see your EC2 instance getting created: -![Launch Instance](../images/EC2_Setup_6.png) +![Launch Instance](./images/EC2_Setup_6.png) Now, press "View all instances" and press the refresh button... you should see your new EC2 instance in the "Running" state: -![Running Instance](../images/EC2_Setup_7.png) +![Running Instance](./images/EC2_Setup_7.png) You can scroll over and save off your Public IPv4 IP Address. You'll need this to SSH into your EC2 instance. @@ -55,7 +62,7 @@ Lets now confirm that we can SSH into our EC2 instance. With the saved off pem f You should see a login shell now for your EC2 instance! -![Login Shell](../images/EC2_Setup_8.png) +![Login Shell](./images/EC2_Setup_8.png) Excellent! You can keep that shell open as we'll make use of it when we start installing Greengrass a bit later. 
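If you haven't connected to EC2 over SSH before, the commands typically look like the following. The key file name and address are placeholders; substitute the `.pem` file you downloaded and your instance's public IPv4 address (the default user on Ubuntu AMIs is `ubuntu`):

```console
chmod 400 my-ec2-key.pem
ssh -i my-ec2-key.pem ubuntu@<public-ipv4-address>
```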
@@ -105,6 +112,6 @@ Before we go to the next section, lets also save off this JSON - it will be used } } -OK, Lets proceed to the next step and get our Edge Impulse environment setup! +OK, Lets proceed to the next step and get our Edge Impulse environment setup! Press "Next" to continue: -[Next](../../edgeimpulseprojectbuild/) +### [Next](/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/edgeimpulseprojectbuild/) diff --git a/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/hardwaresetupnvidiajetson.md b/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardwaresetupnvidiajetson.md similarity index 94% rename from content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/hardwaresetupnvidiajetson.md rename to content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardwaresetupnvidiajetson.md index 436328a2a1..a8f8705829 100644 --- a/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/hardwaresetupnvidiajetson.md +++ b/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardwaresetupnvidiajetson.md @@ -1,3 +1,10 @@ +--- +hide_from_navpane: true + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + ## Install/Configure Nvidia Jetpack (Jetson devices) The workshop will assume that the Nvidia Jetson edge device has been loaded with Jetpack 5.x and/or Jetpack 6.0 per flashing instructions located at this [Nvidia website](https://docs.nvidia.com/jetson/archives/r34.1/DeveloperGuide/index.html#page/Tegra%20Linux%20Driver%20Package%20Development%20Guide/flashing.html). @@ -87,6 +94,6 @@ We are now setup! Before we continue, please save off the following JSONs. Thes } } -OK! Lets continue by getting our Edge Impulse project setup! Let's go! +OK! Lets continue by getting our Edge Impulse project setup! Let's go! Press "Next" to continue: -[Next](../../edgeimpulseprojectbuild/) \ No newline at end of file +### [Next](/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/edgeimpulseprojectbuild/) \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/hardwaresetupqc6490ubuntu.md b/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardwaresetupqc6490ubuntu.md similarity index 95% rename from content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/hardwaresetupqc6490ubuntu.md rename to content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardwaresetupqc6490ubuntu.md index 8be5496230..b4d65ba220 100644 --- a/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/hardwaresetupqc6490ubuntu.md +++ b/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardwaresetupqc6490ubuntu.md @@ -1,3 +1,10 @@ +--- +hide_from_navpane: true + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + ## Ubuntu-based QC6490 platforms First, please flash your QC6490 device per your manufacturers instructions to load up Ubuntu onto the device. @@ -120,6 +127,6 @@ We are now setup! Before we continue, please save off the following JSONs. Thes } } -OK! Lets continue by getting our Edge Impulse project setup! Let's go! +OK! Lets continue by getting our Edge Impulse project setup! Let's go! 
Press "Next" to continue: -[Next](../../edgeimpulseprojectbuild/) \ No newline at end of file +### [Next](/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/edgeimpulseprojectbuild/) \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/hardwaresetuprpi5.md b/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardwaresetuprpi5.md similarity index 93% rename from content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/hardwaresetuprpi5.md rename to content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardwaresetuprpi5.md index af9477a537..38c40e0a9d 100644 --- a/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/hardwaresetuprpi5.md +++ b/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardwaresetuprpi5.md @@ -1,3 +1,10 @@ +--- +hide_from_navpane: true + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + ## Setup and configuration of Raspberry Pi 5 with Raspberry Pi OS ### Install RaspberryPi OS @@ -8,7 +15,7 @@ First step in this exercise is to install the latest version of the Raspberry Pi The easiest way to setup Raspberry Pi OS is to follow the instructions here after downloading and installing the Raspberry Pi Imager application: -![Raspberry Pi Imager](../images/RPi_Imager.png) +![Raspberry Pi Imager](./images/RPi_Imager.png) Instructions: [Install Raspberry Pi Imager](https://www.raspberrypi.com/software/) @@ -101,6 +108,6 @@ Lastly, please safe off these JSONs. These will be used to customize our AWS Gr } } -Alright! Lets continue by getting our Edge Impulse project setup! Let's go! +Alright! Lets continue by getting our Edge Impulse project setup! Let's go! Press "Next" to continue: -[Next](../../edgeimpulseprojectbuild/) \ No newline at end of file +### [Next](/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/edgeimpulseprojectbuild/) \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/noncameracustomcomponent.md b/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/noncameracustomcomponent.md similarity index 85% rename from content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/noncameracustomcomponent.md rename to content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/noncameracustomcomponent.md index 6be44f2a4a..b818d04e49 100644 --- a/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/hardware/noncameracustomcomponent.md +++ b/content/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/noncameracustomcomponent.md @@ -1,3 +1,10 @@ +--- +hide_from_navpane: true + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + ## Non-Camera Custom Component For those edge devices that do not contain a camera, the following component will prepare the edge device with some sample images that can be referenced by the Edge Impulse "Runner" component's JSON configuration (via "gst\_args" settings) to direct the running model to pull its image data from the file (vs. camera). @@ -31,16 +38,16 @@ Next, we need to edit the EdgeImpulseRunnerRuntimeInstallerComponent.yaml and ch Within the AWS dashboard, go to the IoTCore dashboard, then navigate to "Components" under the "Greengrass devices" drop-down on the left hand side. 
-![CreateComponent](GG_Create_NC_Component_1.png) +![CreateComponent](./images/GG_Create_NC_Component_1.png) Press "Create Component" and select "YAML" as the recipe format type. Copy and paste the contents of your updated/modified file EdgeImpulseRunnerRuntimeInstallerComponent.yaml into the text window after clearing the initial contents: -![CreateComponent](GG_Create_NC_Component_2.png) +![CreateComponent](./images/GG_Create_NC_Component_2.png) Finally, press "Create Component" and you should now have 2 custom components registered: -![CreateComponent](GG_Create_NC_Component_3.png) +![CreateComponent](./images/GG_Create_NC_Component_3.png) Awesome! Now that the non-camera support component is created, we can go back and continue with the deployment of these components to your edge device via the AWS IoT Greengrass deployment mechanism. Press "Return to Deployment Steps" below and continue! -[Return to Deployment Steps](../6_CustomComponentDeployment/CustomComponentDeployment.md) \ No newline at end of file +### [Return to Deployment Steps](/learning-paths/embedded-and-microcontrollers/edge_impulse_greengrass/customcomponentdeployment/) \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/2-env-setup.md b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/2-env-setup.md index 55b214dac7..0c24e883ad 100644 --- a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/2-env-setup.md +++ b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/2-env-setup.md @@ -41,7 +41,7 @@ From within the Python virtual environment, run the commands below to download t cd $HOME git clone https://github.com/pytorch/executorch.git cd executorch -git checkout 188312844ebfb499f92ab5a02137ed1a4abca782 +git checkout release/1.0 ``` Run the commands below to set up the ExecuTorch internal dependencies: diff --git a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/3-env-setup-fvp.md b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/3-env-setup-fvp.md index 5a601ffb89..61786ed292 100644 --- a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/3-env-setup-fvp.md +++ b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/3-env-setup-fvp.md @@ -39,4 +39,4 @@ Test that the setup was successful by running the `run.sh` script for Ethos-U85, You will see a number of examples run on the FVP. -This confirms the installation, so you can now proceed to the Learning Path [Build a Simple PyTorch Model](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/4-build-model/). \ No newline at end of file +This confirms the installation, so you can now proceed to the Learning Path [Build a Simple PyTorch Model](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/4-build-model/). 
diff --git a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/4-build-model.md b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/4-build-model.md index 597fc6e65b..a976b0063a 100644 --- a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/4-build-model.md +++ b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/4-build-model.md @@ -68,23 +68,28 @@ cd $ET_HOME python -m examples.arm.aot_arm_compiler --model_name=examples/arm/simple_nn.py --delegate --quantize --target=ethos-u85-256 --system_config=Ethos_U85_SYS_DRAM_Mid --memory_mode=Sram_Only ``` -From the Arm Examples directory, you can build an embedded Arm runner with the `.pte` included. This allows you to optimize the performance of your model, and ensures compatibility with the CPU kernels on the FVP. Finally, generate the executable `arm_executor_runner`. +From the Arm Examples directory, you can build an embedded Arm runner with the `.pte` included. This allows you to optimize the performance of your model, and ensures compatibility with the CPU kernels on the FVP. Finally, build the ExecuTorch libraries and generate the executable `arm_executor_runner`. ```bash -cd $HOME/executorch/examples/arm/executor_runner +cmake -S "${ET_HOME}" \ + -B "${executorch_DIR}" \ + --preset arm-baremetal \ + -DCMAKE_BUILD_TYPE=Release + +cmake --build "$executorch_DIR" --target install --parallel +cd $HOME/executorch/examples/arm/executor_runner -cmake -DCMAKE_BUILD_TYPE=Release \ --DCMAKE_TOOLCHAIN_FILE=$ET_HOME/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake \ --DTARGET_CPU=cortex-m85 \ --DET_DIR_PATH:PATH=$ET_HOME/ \ --DET_BUILD_DIR_PATH:PATH=$ET_HOME/cmake-out \ --DET_PTE_FILE_PATH:PATH=$ET_HOME/simple_nn_arm_delegate_ethos-u85-256.pte \ --DETHOS_SDK_PATH:PATH=$ET_HOME/examples/arm/ethos-u-scratch/ethos-u \ --DETHOSU_TARGET_NPU_CONFIG=ethos-u85-256 \ --DPYTHON_EXECUTABLE=$HOME/executorch-venv/bin/python3 \ --DSYSTEM_CONFIG=Ethos_U85_SYS_DRAM_Mid \ --B $ET_HOME/examples/arm/executor_runner/cmake-out +cmake -S "${ET_HOME}/examples/arm/executor_runner" \ + -B "${ET_HOME}/examples/arm/executor_runner/cmake-out" \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=$ET_HOME/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake \ + -DTARGET_CPU=cortex-m85 \ + -DET_PTE_FILE_PATH:PATH=$ET_HOME/simple_nn_arm_delegate_ethos-u85-256.pte \ + -DETHOS_SDK_PATH:PATH=$ET_HOME/examples/arm/ethos-u-scratch/ethos-u \ + -DETHOSU_TARGET_NPU_CONFIG=ethos-u85-256 \ + -DPYTHON_EXECUTABLE=$HOME/executorch-venv/bin/python3 \ + -DSYSTEM_CONFIG=Ethos_U85_SYS_DRAM_Mid \ cmake --build $ET_HOME/examples/arm/executor_runner/cmake-out --parallel -- arm_executor_runner diff --git a/content/learning-paths/embedded-and-microcontrollers/rpi-llama3/executorch.md b/content/learning-paths/embedded-and-microcontrollers/rpi-llama3/executorch.md index e623ead0c8..d0fe999c43 100644 --- a/content/learning-paths/embedded-and-microcontrollers/rpi-llama3/executorch.md +++ b/content/learning-paths/embedded-and-microcontrollers/rpi-llama3/executorch.md @@ -59,6 +59,7 @@ After cloning the repository, the project's submodules are updated, and two scri ``` bash git clone https://github.com/pytorch/executorch.git cd executorch +git checkout release/1.0 git submodule sync git submodule update --init --recursive ./install_executorch.sh diff --git a/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/4-env-setup-execut.md 
b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/4-env-setup-execut.md index fa02f06a35..598f785873 100644 --- a/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/4-env-setup-execut.md +++ b/content/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/4-env-setup-execut.md @@ -47,6 +47,7 @@ Clone the ExecuTorch repository and install dependencies: cd $HOME git clone https://github.com/pytorch/executorch.git cd executorch +git checkout release/1.0 ``` Set up internal submodules: diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/1_installation.md b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/1_installation.md new file mode 100644 index 0000000000..55907d401b --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/1_installation.md @@ -0,0 +1,101 @@ +--- +title: Install and configure Zephyr Workbench in VS Code +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Set up your Zephyr development environment + +Setting up a [Zephyr](https://zephyrproject.org/) RTOS development environment from scratch can be challenging, requiring you to manually install SDKs, configure toolchains, and initialize workspace directories. These steps often vary across operating systems and board vendors, leading to a fragmented and error-prone setup process. + +[Zephyr Workbench](https://zephyr-workbench.com/) is an open-source Visual Studio Code extension that transforms Zephyr RTOS development into a streamlined IDE experience. Created by [Ac6](https://www.ac6.fr/en/), it automates toolchain setup, project management, and debugging, making Zephyr projects faster to start and easier to scale. + +In this Learning Path, you'll learn the essential steps to install Zephyr Workbench and configure a complete development environment on your local machine. Once complete, you'll be ready to create, build, and debug applications for Arm Cortex-M platforms using Zephyr RTOS. + +Zephyr Workbench provides one-click environment setup that automatically installs the required tools including Python, CMake, Ninja, and Git. It supports importing and managing Zephyr SDKs with version and architecture selection, while initializing west workspaces and creating board-specific applications from samples. The extension builds Zephyr applications and flashes hardware directly from the VS Code interface. It also provides breakpoint debugging and memory usage insights with hardware probe support. + +## What you need before installing Zephyr Workbench + +To get started with Zephyr Workbench you need to have Visual Studio Code downloaded, installed, and running on your computer. + +For Windows, you need version 10 or later (64-bit), along with administrator privileges for installing tools and drivers. + +On macOS, the Homebrew package manager is required. To install Homebrew, run the following command: + +```bash +/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" +``` + +Zephyr Workbench supports STM32 development boards (STM32 Discovery, Nucleo series), Nordic Semiconductor boards (nRF52, nRF53, nRF91 series), NXP development boards (FRDM, LPCXpresso series), Espressif boards (ESP32-based boards), and many other Zephyr-supported platforms. You need a development board to try out the code examples. 
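If you want to double-check the prerequisites from a terminal, the commands below report the installed versions. The `code` launcher ships with VS Code, but on macOS you may first need to run the **Shell Command: Install 'code' command in PATH** action from the Command Palette:

```console
code --version
brew --version   # macOS only
```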
+ +## Configure the Zephyr Workbench extension in Visual Studio Code + +This section covers installing the Zephyr Workbench extension and configuring your Arm development environment. + +### Install the extension + +To install the Zephyr Workbench extension, open Visual Studio Code and navigate to the Extensions view by selecting the Extensions icon in the Activity Bar. + +You can also use the keyboard shortcut `Ctrl+Shift+X` (Windows/Linux) or `Cmd+Shift+X` (macOS). + +In the search box, type "Zephyr Workbench" and locate the official "Zephyr Workbench" extension by Ac6. Select **Install** to add the extension to VS Code. + +The extension icon appears in the Activity Bar, and a welcome message may appear confirming successful installation. + +Once installed, the Zephyr Workbench icon appears in the sidebar with a welcome screen. + +### Install the required host tools + +In the Zephyr Workbench panel, select **Install Host Tools** to automatically install the required dependencies. + +This process installs Python 3.x, CMake, the Ninja build system, Git, Device Tree Compiler (DTC), and the West meta-tool. + +![Install Host Tools #center](images/install_host_tools.png) + +{{% notice Note %}} +On Windows, you may be prompted for permission when tools are executed. Select "Allow" when requested. +{{% /notice %}} + +When the installation completes, select **Verify Host Tools** to check the version of each installed package. + +### Import and configure the toolchain + +Next, download and configure the toolchain by selecting **Import Toolchain** in the Zephyr Workbench panel. Select the toolchain family (*Zephyr SDK*) and configure the SDK Type by choosing *Minimal* for basic functionality. + +Select your desired version (such as v0.17.0 or v0.17.3) and choose the target architectures. For this Learning Path, you only need to select *arm*. + +Specify the parent directory for SDK installation and select **Import** to download and install the SDK. + +![Import Toolchain #center](images/import_toolchain.png) + + +### Initialize the Zephyr project workspace + +Zephyr uses a Git-based workspace manager called West to organize its source code, modules, and samples. Use Zephyr Workbench to initialize your first West workspace. + +In the Zephyr Workbench panel, select **Initialize Workspace** to set up your project environment. Configure the workspace settings by selecting "Minimal from template" for the source location and using the default path `https://github.com/zephyrproject-rtos/zephyr`. + +Choose a target-specific template (such as STM32 or NXP) and select your Zephyr version (such as v3.7.0 or v4.1.0). Specify the directory for your workspace, keeping in mind that initialization takes approximately 10 minutes to complete. + +Select **Import** to create and update the workspace. + +![Initialize West Workspace #center](images/initialize_workspace.png) + +{{% notice Note %}} +The workspace initialization downloads the Zephyr source code and dependencies. This process may take several minutes depending on your internet connection speed. +{{% /notice %}} + +### Verify setup + +Test your setup by confirming that the Zephyr Workbench panel shows all components as installed successfully. Verify the host tools are installed, the SDK is imported and detected, and the West workspace is initialized. Ensure no error messages appear in the VS Code output panel. 
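You can also verify the host tools and workspace from a terminal. Assuming the tools were added to your `PATH` and you change into the workspace directory you created, commands similar to these should complete without errors:

```console
cmake --version
ninja --version
west --version

# From inside the initialized west workspace
west list
```

`west list` prints the Zephyr repository and modules registered in the workspace, which confirms that initialization finished.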
+ +{{% notice Note %}} +**Troubleshooting tips:** +- Run VS Code as Administrator if host tool installation fails on Windows +- Ensure internet access is allowed through your firewall +- Check for minimum 2 GB free disk space before importing SDK +{{% /notice %}} + +You're ready to create and build your first Zephyr application targeting an Arm Cortex-M board. diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/2_development.md b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/2_development.md new file mode 100644 index 0000000000..2bda1ffe4d --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/2_development.md @@ -0,0 +1,86 @@ +--- +title: Build Zephyr applications in VS Code +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Create and build your first Zephyr application + +In this session, you'll learn how to create and build your first Zephyr application using Zephyr Workbench. This step prepares you to customize, test, and expand real firmware projects on Arm Cortex-M boards. + +For demonstration, you'll use an [NXP FRDM-MCXN947](https://www.nxp.com/design/design-center/development-boards-and-designs/FRDM-MCXN947) development board as the target device. However, the same steps apply to any Zephyr-supported Arm Cortex-M board. +You can find the full list of supported boards in the [Supported Boards](https://docs.zephyrproject.org/latest/boards/#). + +Depending on your board, you might need to install a different debug tool. The next module covers this setup. + +### Create application + +In the Zephyr Workbench panel: + +1. Select **Create New Application** +2. Configure your project: + - Select workspace and SDK + - Choose your target board (for example, NXP FRDM-MCXN947) + - Select a sample app (for example, `hello_world`) + - Provide a project name + +![Create App](images/create_app.png) + +### Build the application + +Select the **Build** button in Zephyr Workbench or press `Ctrl+Shift+B`. + +The build system compiles your application and links it against the Zephyr kernel and board-specific drivers. + +![Build Application](images/build_application.png) + +### Install board-specific debug utilities + +To enable debugging on your target hardware, you might need to install additional tools based on the board vendor. + +For the NXP FRDM-MCXN947, download and install the LinkServer debug utility: +- LinkServer for Microcontrollers: [NXP LinkServer Download Page](https://www.nxp.com/design/design-center/software/development-software/mcuxpresso-software-and-tools-/linkserver-for-microcontrollers:LINKERSERVER) + +Once installed, Zephyr Workbench attempts to detect it automatically during a debug session. +If you're using a different board, see your vendor's documentation to install the appropriate debug utility. + +{{% notice Note %}} +If Zephyr Workbench doesn't automatically detect the installed debug runner, you can manually configure it. +Open the **Debug Manager** from the Zephyr sidebar, and enter the full path to the runner executable. +{{% /notice %}} + +### Review output + +Check the build output at the bottom panel of VS Code. Make sure there are no errors or warnings. 
A successful build displays:
+
+```output
+Building 'hello_world' for frdm_mcxn947
+Memory region Used Size Region Size % Used
+ FLASH: 19844 B 1 MB 1.9%
+ SRAM: 4048 B 256 KB 1.5%
+```
+
+### Code walkthrough: hello_world
+
+The following code shows a basic Zephyr application that prints a message to the console:
+
+```c
+#include <zephyr/kernel.h>
+#include <zephyr/sys/printk.h>
+
+int main(void)
+{
+    printk("Hello World! %s\n", CONFIG_BOARD); // Prints board name to serial console
+    return 0;
+}
+```
+
+`CONFIG_BOARD` expands to your target board name. You'll modify this app in the next module!
+
+### Try this: modify and rebuild
+
+Now that the app works, try editing the message in `printk()` or changing the board target in the application settings. Then rebuild and observe the output. This helps verify that your toolchain and workspace respond correctly to code and config changes.
+
+With your first Zephyr application successfully built, you're ready to take the next step: debugging. In the next module, you'll launch a debug session, set breakpoints, and perform memory analysis using Zephyr Workbench. These skills help you validate and optimize applications running on real Arm Cortex-M hardware.
diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/3_debug.md b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/3_debug.md
new file mode 100644
index 0000000000..475055798c
--- /dev/null
+++ b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/3_debug.md
@@ -0,0 +1,225 @@
+---
+title: Analyze and debug Zephyr applications in VS Code
+weight: 4
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## Analyze and debug Zephyr applications in VS Code
+
+In this module, you'll learn how to inspect memory usage and perform live debugging on your Zephyr applications using Zephyr Workbench. These capabilities are essential for diagnosing bugs and optimizing embedded firmware performance on Arm Cortex-M platforms.
+
+## Analyze memory usage
+
+Understanding how your application uses memory is crucial for optimizing embedded firmware on resource-constrained Arm Cortex-M systems. Zephyr Workbench provides built-in tools to generate detailed memory usage reports after a successful build, helping you identify ROM and RAM consumption hotspots early in development.
+
+### Generate memory reports
+
+After building your Zephyr application, analyze how memory is allocated and used. Zephyr Workbench offers built-in memory reporting tools that help you visualize RAM and ROM usage, identify inefficient memory patterns, and guide optimization efforts. These insights are especially useful when working with constrained Arm Cortex-M platforms.
+
+To generate memory reports, open the **Zephyr Workbench** panel and select **Memory Analysis** after a successful build. The tool generates detailed reports showing RAM usage (stack, heap, static variables), ROM usage (code size, constants), and **Puncover** binary analysis covering function size, call graphs, and timing on Arm Cortex-M processors.
+ +The following steps show how to generate and review memory reports: + +- Open the **Zephyr Workbench** panel +- Select **Memory Analysis** after a successful build +- Review detailed memory reports: + - **RAM usage**: stack, heap, static variables + - **ROM usage**: code size, constants + - **Puncover**: binary analysis for function size, call graphs, and timing on Arm Cortex-M + +![Memory Analysis](images/memory_analysis.png) + +The RAM Report displays detailed memory allocation information: + +``` +Path Size % Address Section +=================================================================================================================================== +Root 4323 100.00% - +├── (hidden) 4 0.09% - +├── (no paths) 3492 80.78% - +│ ├── SystemCoreClock 4 0.09% 0x300000f4 datas +│ ├── _kernel 32 0.74% 0x3000036c bss +│ ├── _thread_dummy 128 2.96% 0x30000240 bss +│ ├── z_idle_threads 128 2.96% 0x30000140 bss +│ ├── z_interrupt_stacks 2048 47.37% 0x300003a0 noinit +│ ├── z_main_stack 1024 23.69% 0x30000ce0 noinit +│ └── z_main_thread 128 2.96% 0x300001c0 bss +├── WORKSPACE 8 0.19% - +│ └── deps 8 0.19% - +│ └── modules 8 0.19% - +│ └── hal 8 0.19% - +│ └── nxp 8 0.19% - +│ └── mcux 8 0.19% - +│ └── mcux-sdk-ng 8 0.19% - +│ └── devices 8 0.19% - +│ └── MCX 8 0.19% - +│ └── MCXN 8 0.19% - +│ └── MCXN947 8 0.19% - +│ └── drivers 8 0.19% - +│ └── fsl_clock.c 8 0.19% - +│ ├── s_Ext_Clk_Freq 4 0.09% 0x300000fc datas +│ └── s_Xtal32_Freq 4 0.09% 0x300000f8 datas +└── ZEPHYR_BASE 819 18.95% - + ├── arch 25 0.58% - + │ └── arm 25 0.58% - + │ └── core 25 0.58% - + │ ├── mpu 21 0.49% - + │ │ ├── arm_mpu.c 1 0.02% - + │ │ │ └── static_regions_num 1 0.02% 0x30000398 bss + │ │ └── arm_mpu_v8_internal.h 20 0.46% - + │ │ └── dyn_reg_info 20 0.46% 0x300002e4 bss + │ └── tls.c 4 0.09% - + │ └── z_arm_tls_ptr 4 0.09% 0x300002e0 bss + ├── drivers 376 8.70% - + │ ├── clock_control 2 0.05% - + │ │ └── clock_control_mcux_syscon.c 2 0.05% - + │ │ └── __devstate_dts_ord_10 2 0.05% 0x3000010c device_states + │ ├── gpio 70 1.62% - + │ │ └── gpio_mcux.c 70 1.62% - + │ │ ├── __devstate_dts_ord_12 2 0.05% 0x30000116 device_states + │ │ ├── __devstate_dts_ord_14 2 0.05% 0x30000114 device_states + │ │ ├── __devstate_dts_ord_157 2 0.05% 0x3000010e device_states + │ │ ├── __devstate_dts_ord_176 2 0.05% 0x30000112 device_states + │ │ ├── __devstate_dts_ord_19 2 0.05% 0x30000110 device_states + │ │ ├── gpio_mcux_port0_data 12 0.28% 0x30000338 bss + │ │ ├── gpio_mcux_port1_data 12 0.28% 0x3000032c bss + │ │ ├── gpio_mcux_port2_data 12 0.28% 0x30000320 bss + │ │ ├── gpio_mcux_port3_data 12 0.28% 0x30000314 bss + │ │ └── gpio_mcux_port4_data 12 0.28% 0x30000308 bss + │ ├── mfd 232 5.37% - + │ │ └── mfd_nxp_lp_flexcomm.c 232 5.37% - + │ │ ├── __devstate_dts_ord_119 2 0.05% 0x3000011e device_states + │ │ ├── __devstate_dts_ord_123 2 0.05% 0x3000011c device_states + │ │ ├── __devstate_dts_ord_127 2 0.05% 0x3000011a device_states + │ │ ├── __devstate_dts_ord_131 2 0.05% 0x30000118 device_states + │ │ ├── nxp_lp_flexcomm_children_0 48 1.11% 0x300000c4 datas + │ │ ├── nxp_lp_flexcomm_children_1 48 1.11% 0x3000008c datas + │ │ ├── nxp_lp_flexcomm_children_2 48 1.11% 0x30000054 datas + │ │ ├── nxp_lp_flexcomm_children_3 48 1.11% 0x3000001c datas + │ │ ├── nxp_lp_flexcomm_data_0 8 0.19% 0x300000bc datas + │ │ ├── nxp_lp_flexcomm_data_1 8 0.19% 0x30000084 datas + │ │ ├── nxp_lp_flexcomm_data_2 8 0.19% 0x3000004c datas + │ │ └── nxp_lp_flexcomm_data_3 8 0.19% 0x30000014 datas + │ ├── pinctrl 12 0.28% - + │ │ └── pinctrl_nxp_port.c 12 
0.28% - + │ │ ├── __devstate_dts_ord_11 2 0.05% 0x3000012a device_states + │ │ ├── __devstate_dts_ord_13 2 0.05% 0x30000128 device_states + │ │ ├── __devstate_dts_ord_156 2 0.05% 0x30000122 device_states + │ │ ├── __devstate_dts_ord_175 2 0.05% 0x30000126 device_states + │ │ ├── __devstate_dts_ord_18 2 0.05% 0x30000124 device_states + │ │ └── __devstate_dts_ord_83 2 0.05% 0x30000120 device_states + │ ├── serial 36 0.83% - + │ │ └── uart_mcux_lpuart.c 36 0.83% - + │ │ ├── __devstate_dts_ord_125 2 0.05% 0x3000012e device_states + │ │ ├── __devstate_dts_ord_133 2 0.05% 0x3000012c device_states + │ │ ├── mcux_lpuart_0_data 16 0.37% 0x30000354 bss + │ │ └── mcux_lpuart_1_data 16 0.37% 0x30000344 bss + │ └── timer 24 0.56% - + │ └── cortex_m_systick.c 24 0.56% - + │ ├── announced_cycles 8 0.19% 0x30000130 bss + │ ├── cycle_count 8 0.19% 0x30000138 bss + │ ├── last_load 4 0.09% 0x30000368 bss + │ └── overflow_cyc 4 0.09% 0x30000364 bss + ├── kernel 378 8.74% - + │ ├── init.c 321 7.43% - + │ │ ├── z_idle_stacks 320 7.40% 0x30000ba0 noinit + │ │ └── z_sys_post_kernel 1 0.02% 0x30000399 bss + │ ├── timeout.c 20 0.46% - + │ │ ├── announce_remaining 4 0.09% 0x30000394 bss + │ │ ├── curr_tick 8 0.19% 0x300002d8 bss + │ │ └── timeout_list 8 0.19% 0x30000104 datas + │ └── timeslicing.c 37 0.86% - + │ ├── pending_current 4 0.09% 0x3000038c bss + │ ├── slice_expired 1 0.02% 0x3000039a bss + │ ├── slice_max_prio 4 0.09% 0x30000390 bss + │ ├── slice_ticks 4 0.09% 0x30000100 datas + │ └── slice_timeouts 24 0.56% 0x300002c0 bss + └── lib 40 0.93% - + ├── libc 32 0.74% - + │ ├── common 12 0.28% - + │ │ └── source 12 0.28% - + │ │ └── stdlib 12 0.28% - + │ │ └── malloc.c 12 0.28% - + │ │ └── z_malloc_heap 12 0.28% 0x300002fc bss + │ └── picolibc 20 0.46% - + │ └── stdio.c 20 0.46% - + │ ├── __stdout 16 0.37% 0x30000004 datas + │ └── _stdout_hook 4 0.09% 0x300002f8 bss + ├── os 4 0.09% - + │ └── printk.c 4 0.09% - + │ └── _char_out 4 0.09% 0x30000000 datas + └── utils 4 0.09% - + └── last_section_id.c 4 0.09% - + └── last_id 4 0.09% 0x10005f00 .last_section +========================================================================================================================================== + 4323 + +``` + + +## Install and configure debug tools + +Depending on your board, different debug utilities might be required. Zephyr Workbench integrates several common runners: + +Go to **Host Tools > Install Debug Tools** in Zephyr Workbench. Debug tools vary depending on your target board. + +- **OpenOCD**: Generic open-source debugger +- **LinkServer**: For NXP targets +- **STM32CubeProgrammer**: For STM32 boards +- **J-Link**: For SEGGER debug probes + +### Install debug utilities + +To install debug tools for your specific board, go to **Host Tools > Install Debug Tools** in the Zephyr Workbench panel and select the tools applicable to your board. + +![Debug Tools](images/install_debug_tools.png) + +## Configure debug settings + +Before starting a debug session, make sure your settings match your application and board configuration. + +### Application configuration +Select your application and build config (for example, "primary"), then wait for values to load or build the project if needed. + +### Program settings +The ELF executable path is auto-filled after build. You can optionally add a **CMSIS-SVD** file to enable register-level view. + +### Debug server +Choose the runner from OpenOCD, J-Link, LinkServer, or PyOCD. If the system doesn't detect your runner automatically, enter the runner path manually. 
Select **Apply** to save your settings or launch debug directly. + +![Debug Manager](images/debug_manager.png) + +### Manual debug runner configuration + +If Zephyr Workbench doesn't automatically detect the installed debug runner, open the **Debug Manager** from the sidebar and locate your board profile to enter the path to the runner executable manually. + +{{% notice Note %}} +Manual configuration might be required on first-time setups or if using custom runner versions. +{{% /notice %}} + +## Launch and use the debugger + +You can start debugging from Zephyr Workbench by selecting **Debug**, or from VS Code by going to **Run and Debug** (`Ctrl+Shift+D`), selecting the debug config, and selecting **Run**. + +![Debug Application](images/debug_app.png) + +### Debug toolbar controls + +The debug toolbar provides the following controls for stepping through your code: + +- **Continue/Pause (F5)** +- **Step Over (F10)** +- **Step Into (F11)** +- **Step Out (Shift+F11)** +- **Restart (Ctrl+Shift+F5)** +- **Stop (Shift+F5)** + +### Debug features + +The debugger provides comprehensive inspection capabilities including breakpoints and variable watches, **Register view** for Arm CPU states, **Call stack navigation**, and **Memory view** of address space. + +If using `pyocd`, target support might take a few seconds to initialize. + +In this Learning Path, you explored how to analyze memory usage and debug Zephyr applications using Zephyr Workbench in VS Code. You learned to generate memory reports, install and configure debug tools, and launch interactive debug sessions. These steps help you troubleshoot and optimize embedded applications for Arm Cortex-M boards. diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/_index.md b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/_index.md new file mode 100644 index 0000000000..2b964723b0 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/_index.md @@ -0,0 +1,56 @@ +--- +title: Build Zephyr projects with Zephyr Workbench in VS Code + +draft: true +cascade: + draft: true + +minutes_to_complete: 30 + +who_is_this_for: This is an introductory topic for embedded developers targeting Arm-based platforms with the Zephyr RTOS using the Zephyr Workbench extension for VS Code. 
+ +learning_objectives: + - Install and configure the Zephyr Workbench extension in VS Code + - Set up a complete Zephyr development environment including the SDK and toolchain + - Create, build, and debug Zephyr applications using hands-on examples + - Perform memory usage analysis and apply basic optimization techniques + - Apply essential debugging workflows for embedded systems + +prerequisites: + - Basic familiarity with embedded C programming + - Visual Studio Code installed and running + - A Cortex-M development board + +author: + - Ayoub Bourjilat + - Odin Shen + +skilllevels: Introductory +subjects: RTOS Fundamentals +armips: + - Cortex-M +operatingsystems: + - RTOS +tools_software_languages: + - Zephyr + - C + +further_reading: + - resource: + title: Zephyr Project Documentation + link: https://docs.zephyrproject.org/latest/index.html + type: documentation + - resource: + title: Zephyr Workbench Official Website + link: https://zephyr-workbench.com/ + type: website + - resource: + title: AC6 Zephyr Training + link: https://www.ac6-training.com/en/cours.php/cat_oRT/ref_oRT5/zephyr-rtos-programming + type: website + +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/_next-steps.md b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/build_application.png b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/build_application.png new file mode 100644 index 0000000000..53cf2449e8 Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/build_application.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/create_app.png b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/create_app.png new file mode 100644 index 0000000000..6a129d7292 Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/create_app.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/debug_app.png b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/debug_app.png new file mode 100644 index 0000000000..338ea7a7c0 Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/debug_app.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/debug_manager.png b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/debug_manager.png new file mode 100644 index 0000000000..b2276d64a2 Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/debug_manager.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/import_toolchain.png b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/import_toolchain.png new file mode 100644 index 0000000000..8fadbc01ae Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/import_toolchain.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/initialize_workspace.png b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/initialize_workspace.png new file mode 100644 index 0000000000..ea0cd693e9 Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/initialize_workspace.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/install_debug_tools.png b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/install_debug_tools.png new file mode 100644 index 0000000000..63b23b78e4 Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/install_debug_tools.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/install_host_tools.png b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/install_host_tools.png new file mode 100644 index 0000000000..672c9f0976 Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/install_host_tools.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/memory_analysis.png b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/memory_analysis.png new file mode 100644 index 0000000000..68464cfb73 Binary files /dev/null and 
b/content/learning-paths/embedded-and-microcontrollers/zephyr_vsworkbench/images/memory_analysis.png differ diff --git a/content/learning-paths/laptops-and-desktops/_index.md b/content/learning-paths/laptops-and-desktops/_index.md index aa493bf3dd..ef48d314d8 100644 --- a/content/learning-paths/laptops-and-desktops/_index.md +++ b/content/learning-paths/laptops-and-desktops/_index.md @@ -9,14 +9,14 @@ maintopic: true operatingsystems_filter: - Android: 2 - ChromeOS: 2 -- Linux: 34 +- Linux: 35 - macOS: 9 - Windows: 46 subjects_filter: - CI-CD: 5 - Containers and Virtualization: 7 - Migration to Arm: 30 -- ML: 2 +- ML: 3 - Performance and Architecture: 27 subtitle: Create and migrate apps for power efficient performance title: Laptops and Desktops @@ -28,8 +28,8 @@ tools_software_languages_filter: - Arm Performance Libraries: 2 - Arm64EC: 1 - Assembly: 1 -- Bash: 1 -- C: 8 +- Bash: 2 +- C: 9 - C#: 6 - C++: 11 - CCA: 1 @@ -52,6 +52,7 @@ tools_software_languages_filter: - Kubernetes: 1 - KVM: 1 - Linux: 1 +- llama.cpp: 1 - LLM: 1 - LLVM: 2 - llvm-mca: 1 @@ -64,7 +65,7 @@ tools_software_languages_filter: - OpenCV: 1 - perf: 4 - PowerShell: 1 -- Python: 6 +- Python: 7 - QEMU: 1 - Qt: 2 - RDP: 1 diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/1_gb10_introduction.md b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/1_gb10_introduction.md new file mode 100644 index 0000000000..e3902ca04b --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/1_gb10_introduction.md @@ -0,0 +1,49 @@ +--- +title: Explore Grace Blackwell architecture for efficient quantized LLM inference +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this Learning Path you will explore the architecture and system design of NVIDIA DGX Spark, a next-generation Arm-based CPU-GPU hybrid for large-scale AI workloads. The NVIDIA DGX Spark is a personal AI supercomputer that brings data center-class AI computing directly to the developer desktop. The NVIDIA GB10 Grace Blackwell Superchip fuses CPU and GPU into a single unified compute engine. + +The GB10 platform combines: +- The NVIDIA Grace CPU, featuring 10 Arm [Cortex-X925](https://www.arm.com/products/cortex-x) and 10 [Cortex-A725](https://www.arm.com/products/silicon-ip-cpu/cortex-a/cortex-a725) cores built on the Armv9 architecture, offering exceptional single-thread performance and power efficiency +- The NVIDIA Blackwell GPU, equipped with next-generation CUDA cores and 5th-generation Tensor Cores, optimized for FP8 and FP4 precision workloads +- A 128 GB unified memory subsystem, enabling both CPU and GPU to share the same address space with NVLink-C2C, eliminating data-transfer bottlenecks + +This GB10 platform design delivers up to 1 petaFLOP (1,000 TFLOPs) of AI performance at FP4 precision. DGX Spark is a compact yet powerful development platform that lets you build and test AI models locally before scaling them to larger systems. + +You can find out more about Nvidia DGX Spark on the [NVIDIA website](https://www.nvidia.com/en-gb/products/workstations/dgx-spark/). + +## Benefits of Grace Blackwell for quantized LLM inference + +Quantized Large Language Models (LLMs), such as those using Q4, Q5, or Q8 precision, benefit from the hybrid architecture of the Grace Blackwell Superchip which brings several key advantages to quantized LLM workloads. 
The unified CPU-GPU design eliminates traditional bottlenecks while providing specialized compute capabilities for different aspects of inference. + +On Arm-based systems, quantized LLM inference is especially efficient because the Grace CPU delivers high single-thread performance and energy efficiency, while the Blackwell GPU accelerates matrix operations using Arm-optimized CUDA libraries. The unified memory architecture means you don't need to manually manage data movement between CPU and GPU, which is a common challenge on traditional x86-based platforms. This is particularly valuable when working with large models or running multiple inference tasks in parallel, as it reduces latency and simplifies development. + +## Grace Blackwell features and their impact on quantized LLMs + +The table below shows how specific hardware features enable efficient quantized model inference: + +| **Feature** | **Impact on quantized LLMs** | +|--------------|------------------------------| +| Grace CPU (Arm Cortex-X925 / A725) | Handles token orchestration, memory paging, and lightweight inference efficiently with high instructions per cycle (IPC) | +| Blackwell GPU (CUDA 13, FP4/FP8 Tensor Cores) | Provides massive parallelism and precision flexibility, ideal for accelerating 4-bit or 8-bit quantized transformer layers | +| High bandwidth and low latency | NVLink-C2C delivers 900 GB/s of bidirectional bandwidth, enabling synchronized CPU-GPU workloads | +| Unified 128 GB memory (NVLink-C2C) | CPU and GPU share the same memory space, allowing quantized model weights to be accessed without explicit data transfer | +| Energy-efficient Arm design | Armv9 cores maintain strong performance-per-watt, enabling sustained inference for extended workloads | + +## Overview of a typical quantized LLM workflow + +In a typical quantized LLM workflow: +- The Grace CPU orchestrates text tokenization, prompt scheduling, and system-level tasks +- The Blackwell GPU executes the transformer layers using quantized matrix multiplications for optimal throughput +- Unified memory allows models like Qwen2-7B or LLaMA3-8B (Q4_K_M) to fit directly into the shared memory space, reducing copy overhead and enabling near-real-time inference + +Together, these features make the GB10 a developer-grade AI laboratory for running, profiling, and scaling quantized LLMs efficiently in a desktop form factor. + + diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/1a_gb10_setup.md b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/1a_gb10_setup.md new file mode 100644 index 0000000000..337ada2531 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/1a_gb10_setup.md @@ -0,0 +1,223 @@ +--- +title: Verify your Grace Blackwell system readiness for AI inference +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Set up your Grace Blackwell environment + +Before building and running quantized LLMs on your DGX Spark, you need to verify that your system is fully prepared for AI workloads. This includes checking your Arm-based Grace CPU configuration, confirming your operating system, ensuring the Blackwell GPU and CUDA drivers are active, and validating that the CUDA toolkit is installed. These steps provide a solid foundation for efficient LLM inference and development on Arm. + +This section is organized into three main steps: verifying your CPU, checking your operating system, and confirming your GPU and CUDA toolkit setup. 
You'll also find additional context and technical details throughout, should you wish to explore the platform's capabilities more deeply. + +## Step 1: Verify your CPU configuration + +Before running LLM workloads, it's helpful to understand more about the CPU you're working with. The DGX Spark uses Arm-based Grace processors, which bring some unique advantages for AI inference. + +Start by checking your system's CPU configuration: + +```bash +lscpu +``` + +The output is similar to: + +```output +Architecture: aarch64 + CPU op-mode(s): 64-bit + Byte Order: Little Endian +CPU(s): 20 + On-line CPU(s) list: 0-19 +Vendor ID: ARM + Model name: Cortex-X925 + Model: 1 + Thread(s) per core: 1 + Core(s) per socket: 10 + Socket(s): 1 + Stepping: r0p1 + CPU(s) scaling MHz: 89% + CPU max MHz: 4004.0000 + CPU min MHz: 1378.0000 + BogoMIPS: 2000.00 + Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm jscvt fcma lrcpc dcpop sha3 sm3 sm4 as + imddp sha512 sve asimdfhm dit uscat ilrcpc flagm sb paca pacg dcpodp sve2 sveaes svepmull svebitperm svesha3 svesm4 f + lagm2 frint svei8mm svebf16 i8mm bf16 dgh bti ecv afp wfxt + Model name: Cortex-A725 + Model: 1 + Thread(s) per core: 1 + Core(s) per socket: 10 + Socket(s): 1 + Stepping: r0p1 + CPU(s) scaling MHz: 99% + CPU max MHz: 2860.0000 + CPU min MHz: 338.0000 + BogoMIPS: 2000.00 + Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm jscvt fcma lrcpc dcpop sha3 sm3 sm4 as + imddp sha512 sve asimdfhm dit uscat ilrcpc flagm sb paca pacg dcpodp sve2 sveaes svepmull svebitperm svesha3 svesm4 f + lagm2 frint svei8mm svebf16 i8mm bf16 dgh bti ecv afp wfxt +Caches (sum of all): + L1d: 1.3 MiB (20 instances) + L1i: 1.3 MiB (20 instances) + L2: 25 MiB (20 instances) + L3: 24 MiB (2 instances) +NUMA: + NUMA node(s): 1 + NUMA node0 CPU(s): 0-19 +Vulnerabilities: + Gather data sampling: Not affected + Itlb multihit: Not affected + L1tf: Not affected + Mds: Not affected + Meltdown: Not affected + Mmio stale data: Not affected + Reg file data sampling: Not affected + Retbleed: Not affected + Spec rstack overflow: Not affected + Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl + Spectre v1: Mitigation; __user pointer sanitization + Spectre v2: Not affected + Srbds: Not affected + Tsx async abort: Not affected +``` + +If you have seen this message your system is using Armv9 cores, great! These are ideal for quantized LLM workloads. The Grace CPU implements the Armv9-A instruction set and supports advanced vector extensions, making it ideal for quantized LLM inference and tensor operations. + +### Grace CPU specification + +The following table provides more information about the key specifications of the Grace CPU and explains their relevance to quantized LLM inference: + +| **Category** | **Specification** | **Description/Impact for LLM Inference** | +|---------------|-------------------|---------------------------------------------| +| Architecture | Armv9-A (64-bit, aarch64) | Modern Arm architecture supporting advanced vector and AI extensions| +| Core Configuration | 20 cores total - 10× Cortex-X925 (performance) + 10× Cortex-A725 (efficiency) | Heterogeneous CPU design balancing high performance and power efficiency | +| Threads per Core | 1 | Optimized for deterministic scheduling and predictable latency | +| Clock Frequency | Up to **4.0 GHz** (Cortex-X925)
<br>Up to **2.86 GHz** (Cortex-A725) | High per-core speed ensures strong single-thread inference for token orchestration. |
+| Cache Hierarchy | L1: 1.3 MiB × 20<br>L2: 25 MiB × 20<br>
L3: 24 MiB × 2 | Large shared L3 cache enhances data locality for multi-threaded inference workloads | +| Instruction Set Features** | SVE / SVE2, BF16, I8MM, AES, SHA3, SM4, CRC32 | Vector and mixed-precision instructions accelerate quantized (Q4/Q8) math operations | +| NUMA Topology | Single NUMA node (node0: 0–19) | Simplifies memory access pattern for unified memory workloads | +| Security and Reliability | Not affected by Meltdown, Spectre, Retbleed, or similar vulnerabilities | Ensures stable and secure operation for long-running inference tasks | + +Its SVE2, BF16, and INT8 matrix multiplication (I8MM) capabilities are what make it ideal for quantized LLM workloads, as these provide a power-efficient foundation for both CPU-only inference and CPU-GPU hybrid processing. + +### Verify OS + +You can also verify the operating system running on your DGX Spark by using the following command: + +```bash +lsb_release -a +``` + +The expected output is something similar to: + +```log +No LSB modules are available. +Distributor ID: Ubuntu +Description: Ubuntu 24.04.3 LTS +Release: 24.04 +Codename: noble +``` +This shows you that your DGX Spark runs on Ubuntu 24.04 LTS, a developer-friendly Linux distribution that provides excellent compatibility with AI frameworks, compiler toolchains, and system utilities. This makes it an ideal environment for building and deploying quantized LLM workloads. + +Nice work! You've confirmed your operating system is Ubuntu 24.04 LTS, so you can move on to the next step. + +## Step 2: Verify the Blackwell GPU and driver + +After confirming your CPU configuration, verify that the Blackwell GPU inside the GB10 Grace Blackwell Superchip is available and ready for CUDA workloads by using the following: + +```bash +nvidia-smi +``` + +You will see output similar to: + +```output +Wed Oct 22 09:26:54 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 | ++-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA GB10 On | 0000000F:01:00.0 Off | N/A | +| N/A 32C P8 4W / N/A | Not Supported | 0% Default | +| | | N/A | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| +| 0 N/A N/A 3094 G /usr/lib/xorg/Xorg 43MiB | +| 0 N/A N/A 3172 G /usr/bin/gnome-shell 16MiB | ++-----------------------------------------------------------------------------------------+ +``` + +The `nvidia-smi` tool reports GPU hardware specifications and provides valuable runtime information, including driver status, temperature, power usage, and GPU utilization. This information helps verify that the system is ready for AI workloads. 
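+
+If you want a more compact, script-friendly view, `nvidia-smi` also accepts query flags. The example below is a sketch; on DGX Spark some fields, such as memory totals, can be reported as not supported because the GPU shares unified memory with the CPU.
+
+```bash
+nvidia-smi --query-gpu=name,driver_version,temperature.gpu,utilization.gpu --format=csv
+```
+
+This prints the selected fields in CSV form, which is convenient to log or parse when automating system checks.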
+ +### Further information about the output from the nvidia-smi tool + +The table below provides more explanation of the `nvidia-smi` output: +| **Category** | **Specification (from nvidia-smi)** | **Description / impact for LLM inference** | +|---------------|--------------------------------------|---------------------------------------------| +| GPU name | NVIDIA GB10 | Confirms the system recognizes the Blackwell GPU integrated into the Grace Blackwell Superchip | +| Driver version | 580.95.05 | Indicates that the system is running the latest driver package required for CUDA 13 compatibility | +| CUDA version | 13.0 | Confirms that the CUDA runtime supports GB10 (sm_121) and is ready for accelerated quantized LLM workloads | +| Architecture / Compute capability | Blackwell (sm_121) | Supports FP4, FP8, and BF16 Tensor Core operations optimized for LLMs | +| Memory | Unified 128 GB LPDDR5X (shared with CPU via NVLink-C2C) | Enables zero-copy data access between Grace CPU and GPU for unified inference memory space | +| Power & Thermal status | ~4W at idle, 32°C temperature | Confirms the GPU is powered on and thermally stable while idle | +| GPU-utilization | 0% (Idle) | Indicates no active compute workloads; GPU is ready for new inference jobs | +| Memory usage | Not Supported (headless GPU configuration) | DGX Spark operates in headless compute mode; display memory metrics may not be exposed | +| Persistence mode | On | Ensures the GPU remains initialized and ready for rapid inference startup | + +Excellent! Your Blackwell GPU is recognized and ready for CUDA workloads. This means your system is set up for GPU-accelerated LLM inference. + + +## Step 3: Check the CUDA toolkit + +To build the CUDA version of llama.cpp, the system must have a CUDA toolkit installed. + +The `nvcc --version` command confirms that the CUDA compiler is available and compatible with CUDA 13. +This ensures that CMake can correctly detect and compile the GPU-accelerated components. + +```bash +nvcc --version +``` +You're almost ready! Verifying the CUDA toolkit ensures you can build GPU-enabled versions of llama.cpp for maximum performance. + +You will see output similar to: + +```output +nvcc: NVIDIA (R) Cuda compiler driver +Copyright (c) 2005-2025 NVIDIA Corporation +Built on Wed_Aug_20_01:57:39_PM_PDT_2025 +Cuda compilation tools, release 13.0, V13.0.88 +Build cuda_13.0.r13.0/compiler.36424714_0 +``` + +{{% notice Note %}} +The nvcc compiler is required only during the CUDA-enabled build process; it is not needed at runtime for inference. +{{% /notice %}} + +This confirms that the CUDA 13 toolkit is installed and ready for GPU compilation. +If the command is missing or reports an older version (e.g., 12.x), you should update to CUDA 13.0 or later to ensure compatibility with the Blackwell GPU (sm_121). + +At this point, you have verified that: +- The Grace CPU (Arm Cortex-X925 / A725) is correctly recognized and supports Armv9 extensions +- The Blackwell GPU is active with driver 580.95.05 and CUDA 13 runtime +- The CUDA toolkit 13.0 is available for building the GPU-enabled version of llama.cpp + +Your DGX Spark environment is now fully prepared for the next section, where you will build and configure both CPU and GPU versions of llama.cpp, laying the foundation for running quantized LLMs efficiently on the Grace Blackwell platform. 
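+
+If you want to re-run these readiness checks later, for example after a driver or toolkit update, you can combine them into one short snippet. This is a convenience sketch that only reuses the commands introduced above:
+
+```bash
+echo "CPU  : $(lscpu | grep -m1 'Model name' | awk -F: '{print $2}' | xargs)"
+echo "GPU  : $(nvidia-smi --query-gpu=name,driver_version --format=csv,noheader)"
+echo "CUDA : $(nvcc --version | grep release)"
+```
+
+The three lines should report the Cortex-X925 CPU, the NVIDIA GB10 with its driver version, and the CUDA 13.x release string respectively.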
+ +## What you have accomplished + +In this entire setup section, you have achieved the following: + +- Verified your Arm-based Grace CPU and its capabilities by confirming that your system is running Armv9 cores with SVE2, BF16, and INT8 matrix multiplication support, which are perfect for quantized LLM inference +- Confirmed your Blackwell GPU and CUDA driver are ready by seeing that the GB10 GPU is active, properly recognized, and set up with CUDA 13, so you're all set for GPU-accelerated workloads +- Checked your operating system and CUDA toolkit - Ubuntu 24.04 LTS provides a solid foundation, and the CUDA compiler is installed and ready for building GPU-enabled inference tools + +You're now ready to move on to building and running quantized LLMs on your DGX Spark. The next section walks you through compiling llama.cpp for both CPU and GPU, so you can start running AI inference on this platform. diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/2_gb10_llamacpp_gpu.md b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/2_gb10_llamacpp_gpu.md new file mode 100644 index 0000000000..41b854a087 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/2_gb10_llamacpp_gpu.md @@ -0,0 +1,204 @@ +--- +title: Build the GPU version of llama.cpp on GB10 +weight: 4 +layout: "learningpathall" +--- + +## How do I build the GPU version of llama.cpp on GB10? + +In the previous section, you verified that your DGX Spark system is correctly configured with the Grace CPU, Blackwell GPU, and CUDA 13 environment. Now that your hardware and drivers are ready, this section focuses on building the GPU-enabled version of llama.cpp, which is a lightweight, portable inference engine optimized for quantized LLM workloads on NVIDIA Blackwell GPUs. Llama.cpp is an open-source project by Georgi Gerganov that provides efficient and dependency-free large language model inference on both CPUs and GPUs. + +## Step 1: Install dependencies + +In this step, you will install the necessary build tools and download a small quantized model for validation: + +```bash +sudo apt update +sudo apt install -y git cmake build-essential nvtop htop +``` + +These packages provide the C/C++ compiler toolchain, CMake build system, and GPU monitoring utility (nvtop) required to compile and test llama.cpp. + +## Download a test model + +To test your GPU build, you'll need a quantized model. In this section, you'll download a lightweight model that's perfect for validation. + +First, ensure that you have the latest Hugging Face Hub CLI installed and download models: + +```bash +mkdir ~/models +cd ~/models +python3 -m venv venv +source venv/bin/activate +pip install -U huggingface_hub +hf download TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF --local-dir TinyLlama-1.1B +``` + +{{% notice Note %}} +After the download completes, you'll find the models in the `~/models` directory. + +**Tip:** Always activate your Python virtual environment with `source venv/bin/activate` before installing packages or running Python-based tools. This ensures dependencies are isolated and prevents conflicts with system-wide packages. +{{% /notice %}} + +Great! You’ve installed all the required build tools and downloaded a quantized model for validation. Your environment is ready for source code setup. 
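+
+The `hf download` command above fetches every quantization variant in the repository, which can take several gigabytes of disk space. If your CLI version supports download filters (this is a sketch; check `hf download --help` for the exact options), you can restrict the download to the Q8_0 file used later in this Learning Path and then confirm it is present:
+
+```bash
+hf download TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF \
+  --include "*Q8_0.gguf" \
+  --local-dir TinyLlama-1.1B
+ls -lh TinyLlama-1.1B/*.gguf
+```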
+ +## Step 2: Clone the llama.cpp repository + +Use the commands below to download the source code for llama.cpp from GitHub: + +```bash +cd ~ +git clone https://github.com/ggerganov/llama.cpp.git +cd ~/llama.cpp +``` + +Nice work! You now have the latest llama.cpp source code on your DGX Spark system. + +## Step 3: Configure and build the CUDA-enabled version (GPU Mode) + +Run the following `cmake` command to configure the build system for GPU acceleration: + + +```bash +mkdir -p build-gpu +cd build-gpu +cmake .. \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_CUDA=ON \ + -DGGML_CUDA_F16=ON \ + -DCMAKE_CUDA_ARCHITECTURES=121 \ + -DCMAKE_C_COMPILER=gcc \ + -DCMAKE_CXX_COMPILER=g++ \ + -DCMAKE_CUDA_COMPILER=nvcc +``` + +This command enables CUDA support and prepares llama.cpp for compiling GPU-optimized kernels. + +### Explanation of key flags: + +Here's what each configuration flag does: + +| **Feature** | **Description/Impact** | +|--------------|------------------------------| +| -DGGML_CUDA=ON | Enables the CUDA backend in llama.cpp, allowing matrix operations and transformer layers to be offloaded to the GPU for acceleration| +| -DGGML_CUDA_F16=ON | Enables FP16 (half-precision) CUDA kernels, reducing memory usage and increasing throughput — especially effective for quantized models (for example, Q4, Q5) | +| -DCMAKE_CUDA_ARCHITECTURES=121 | Specifies the compute capability for the NVIDIA Blackwell GPU (GB10 = sm_121), ensuring the CUDA compiler (nvcc) generates optimized GPU kernels| + +When the configuration process completes successfully, the terminal should display output similar to the following: + +```output +-- Configuring done (2.0s) +-- Generating done (0.1s) +-- Build files have been written to: /home/nvidia/llama.cpp/build-gpu +``` + +{{% notice Note %}} +- For systems with multiple CUDA versions installed, explicitly specifying the compilers (`-DCMAKE_C_COMPILER`, `-DCMAKE_CXX_COMPILER`, `-DCMAKE_CUDA_COMPILER`) ensures that CMake uses the correct CUDA 13.0 toolchain. +- If you encounter configuration errors, return to the previous section and confirm that your CUDA toolkit and driver versions are correctly installed and compatible with Blackwell (sm_121).{{% /notice %}} + +Once CMake configuration succeeds, start the compilation process: + +```bash +make -j"$(nproc)" +``` +This command compiles all CUDA and C++ source files in parallel, using all available CPU cores. On the Grace CPU, the build typically finishes in 2–4 minutes. + +The build output is shown below: + +```output +[ 0%] Building C object examples/gguf-hash/CMakeFiles/sha1.dir/deps/sha1/sha1.c.o +[ 15%] Building CUDA object ggml/src/ggml-cuda/CMakeFiles/ggml-cuda.dir/cpy.cu.o +[ 50%] Building CXX object src/CMakeFiles/llama.dir/llama-sampling.cpp.o +[100%] Built target test-backend-ops +[100%] Linking CXX executable ../../bin/llama-server +[100%] Built target llama-server +``` + +After the build completes, you'll find the GPU-accelerated binaries located under `~/llama.cpp/build-gpu/bin/`. + +These binaries provide all necessary tools for quantized model inference (llama-cli) and for serving GPU inference using HTTP API (llama-server). + +Excellent! The CUDA-enabled build is complete. Your binaries are optimized for the Blackwell GPU and ready for validation. Together, these options ensure that the build targets the Grace Blackwell GPU with full CUDA 13 compatibility. You are now ready to test quantized LLMs with full GPU acceleration in the next step. 
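+
+Before moving on to validation, you can quickly confirm that the main build artifacts exist. This is an optional check; the exact set of binaries can vary between llama.cpp releases:
+
+```bash
+ls -lh bin/llama-cli bin/llama-server bin/libggml-cuda.so
+```
+
+Seeing `libggml-cuda.so` alongside the executables confirms that the CUDA backend was built rather than silently skipped.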
+ +## Step 4: Validate the CUDA-enabled build + +After the build completes successfully, verify that the GPU-enabled binary of llama.cpp is correctly linked to the NVIDIA CUDA runtime. + +To verify CUDA linkage, run the following command: + +```bash +ldd bin/llama-cli | grep cuda +``` + +The output is similar to: + +```output + libggml-cuda.so => /home/nvidia/llama.cpp/build-gpu/bin/libggml-cuda.so (0x0000eee1e8e30000) + libcudart.so.13 => /usr/local/cuda/targets/sbsa-linux/lib/libcudart.so.13 (0x0000eee1e83b0000) + libcublas.so.13 => /usr/local/cuda/targets/sbsa-linux/lib/libcublas.so.13 (0x0000eee1e4860000) + libcuda.so.1 => /lib/aarch64-linux-gnu/libcuda.so.1 (0x0000eee1debd0000) + libcublasLt.so.13 => /usr/local/cuda/targets/sbsa-linux/lib/libcublasLt.so.13 (0x0000eee1b36c0000) +``` + +If the CUDA library is correctly linked, it confirms that the binary can access the GPU through the system driver. + +Next, confirm that the binary initializes the GPU correctly by checking device detection and compute capability: + +```bash +./bin/llama-server --version +``` + +The expected output is: + +```output +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 CUDA devices: + Device 0: NVIDIA GB10, compute capability 12.1, VMM: yes +version: 6819 (19a5a3ed) +built with gcc (Ubuntu 12.4.0-2ubuntu1~24.04) 12.4.0 for aarch64-linux-gnu +``` + +The message "compute capability 12.1" confirms that the build was compiled specifically for the Blackwell GPU (sm_121) and that CUDA 13.0 is functioning correctly. + +Next, use the downloaded quantized model (for example, TinyLlama-1.1B) to verify that inference executes successfully on the GPU: + +```bash +./bin/llama-cli \ + -m ~/models/TinyLlama-1.1B/tinyllama-1.1b-chat-v1.0.Q8_0.gguf \ + -ngl 32 \ + -t 16 \ + -p "Explain the advantages of the Armv9 architecture." +``` + +If the build is successful, you will see text generation begin within a few seconds. + +To monitor GPU utilization during inference, use `nvtop` to view real-time performance metrics: + +```bash +nvtop +``` + +This command displays GPU utilization, memory usage, temperature, and power consumption. You can use this to verify that CUDA kernels are active during model inference. + +The following screenshot shows GPU utilization during TinyLlama inference on DGX Spark: + +![nvtop terminal interface displaying real-time GPU metrics, including GPU utilization, memory usage, temperature, power consumption, and active processes for the NVIDIA GB10 GPU during model inference on DGX Spark. alt-text#center](nvtop.png "TinyLlama GPU Utilization") + +The nvtop interface shows: + + - GPU Utilization (%): Confirms CUDA kernels are active. + - Memory Usage (VRAM): Shows model loading and runtime footprint. + - Temperature / Power Draw: Monitors thermal stability under sustained workloads. + +You have now successfully built and validated the CUDA-enabled version of llama.cpp on DGX Spark. + +## What you have accomplished + +You have: +- Installed all required tools and dependencies +- Downloaded a quantized model for testing +- Built the CUDA-enabled version of llama.cpp +- Verified GPU linkage and successful inference + +You’re ready to move on to building and testing the CPU-only version. You will build the optimized CPU-only version of llama.cpp and explore how the Grace CPU executes Armv9 vector instructions during inference. 
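+
+As an optional extra before switching to the CPU build, you can record a GPU throughput baseline with the bundled `llama-bench` tool. This is a sketch using standard llama.cpp options; absolute numbers depend on the model, quantization format, and driver version:
+
+```bash
+cd ~/llama.cpp/build-gpu
+./bin/llama-bench \
+  -m ~/models/TinyLlama-1.1B/tinyllama-1.1b-chat-v1.0.Q8_0.gguf \
+  -ngl 99 -p 512 -n 128
+```
+
+The report shows prompt processing and token generation rates in tokens per second, which you can compare against the CPU-only build in the next section.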
diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/3_gb10_llamacpp_cpu.md b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/3_gb10_llamacpp_cpu.md new file mode 100644 index 0000000000..b7c87b35ed --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/3_gb10_llamacpp_cpu.md @@ -0,0 +1,145 @@ +--- +title: Build the CPU version of llama.cpp on GB10 +weight: 5 +layout: "learningpathall" +--- + +## Overview +In this section, you'll build and test the CPU-only version of llama.cpp, optimized specifically for the Grace CPU's advanced Armv9 capabilities. + +The Grace CPU features Arm Cortex-X925 and Cortex-A725 cores with advanced vector extensions including SVE2, BFloat16, and I8MM. These extensions make the CPU highly efficient for quantized inference workloads, even without GPU acceleration. + +## Configure and build the CPU-only version + +This build runs entirely on the Grace CPU (Arm Cortex-X925 and Cortex-A725), which supports advanced Armv9 vector extensions including SVE2, BFloat16, and I8MM, making it highly efficient for quantized inference workloads even without GPU acceleration. +To ensure a clean separation from the GPU build artifacts, start from a clean directory. + +Configure the build system for the CPU-only version of llama.cpp: + +```bash +cd ~/llama.cpp +mkdir -p build-cpu +cd build-cpu + +cmake .. \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_SYSTEM_PROCESSOR=aarch64 \ + -DLLAMA_ACCELERATE=ON \ + -DLLAMA_BLAS=OFF \ + -DCMAKE_C_COMPILER=gcc \ + -DCMAKE_CXX_COMPILER=g++ \ + -DCMAKE_C_FLAGS="-O3 -march=armv9-a+sve2+bf16+i8mm -mtune=native -fopenmp" \ + -DCMAKE_CXX_FLAGS="-O3 -march=armv9-a+sve2+bf16+i8mm -mtune=native -fopenmp" +``` + +Explanation of key flags: + +| **Feature** | **Description / Impact** | +|--------------|------------------------------| +| -march=armv9-a | Targets the Armv9-A architecture used by the Grace CPU and enables advanced vector extensions | +| +sve2+bf16+i8mm | Activates Scalable Vector Extensions (SVE2), INT8 matrix multiply (I8MM), and BFloat16 operations for quantized inference | +| -fopenmp | Enables multi-threaded execution via OpenMP, allowing all 20 Grace cores to be utilized | +| -mtune=native | Optimizes code generation for the local Grace CPU microarchitecture | +| -DLLAMA_ACCELERATE=ON | Enables llama.cpp's internal Arm acceleration path (Neon/SVE optimized kernels) | + +When the configuration process completes successfully, the terminal should display output similar to the following: + +```output +-- Configuring done (1.1s) +-- Generating done (0.1s) +-- Build files have been written to: /home/nvidia/llama.cpp/build-cpu +``` + +Once you see this, you can now move on to start the compilation process: + +```bash +make -j"$(nproc)" +``` + +{{% notice Note %}} +If the build fails after modifying optimization flags, it is likely due to a stale CMake cache. +Run the following commands to perform a clean reconfiguration: + +```bash +cmake --fresh . +make -j"$(nproc)" +``` +{{% /notice %}} + + +The CPU build on the DGX Spark completes in about 20 seconds, even faster than the GPU build. 
+ +The build output is shown below: + +```output +[ 25%] Building CXX object src/CMakeFiles/llama.dir/llama-model-loader.cpp.o +[ 50%] Linking CXX executable ../bin/test-tokenizer-0 +[ 75%] Linking CXX executable ../bin/test-alloc +[100%] Linking CXX executable ../../bin/llama-server +[100%] Built target llama-server +``` + +After the build finishes, you'll find the CPU-optimized binaries at `~/llama.cpp/build-cpu/bin/` +## Validate the CPU-enabled build (CPU mode) + +First, validate that the binary was compiled in CPU-only mode and runs correctly on the Grace CPU: + +```bash +./bin/llama-server --version +``` + +The output confirms the build configuration: + +```output +version: 6819 (19a5a3ed) +built with gcc (Ubuntu 12.4.0-2ubuntu1~24.04) 12.4.0 for aarch64-linux-gnu +``` + +The message indicates the build is a CPU-only binary optimized for the Grace CPU. + +Next, use the downloaded quantized model (for example, TinyLlama-1.1B) to verify that inference executes successfully on the CPU: + +```bash +./bin/llama-cli \ + -m ~/models/TinyLlama-1.1B/tinyllama-1.1b-chat-v1.0.Q8_0.gguf \ + -ngl 0 \ + -t 20 \ + -p "Explain the advantages of the Armv9 architecture." +``` + +Here is an explanation of the key flags: + +- `-ngl 0` disables GPU offloading (CPU-only execution) +- `-t 20` uses 20 threads (1 per Grace CPU core) + +If the build is successful, you will see smooth model initialization and token generation, with CPU utilization increasing across all cores. + +To monitor live CPU utilization and power metrics during inference, use `htop`: + +```bash +htop +``` + +The following screenshot shows CPU utilization and thread activity during TinyLlama inference on DGX Spark, confirming full multi-core engagement: +![htop display showing 20 Grace CPU cores at 75-85% utilization during TinyLlama inference with OpenMP threading alt-text#center](htop.png "TinyLlama CPU utilization") + +The `htop` interface shows: + +- CPU Utilization: all 20 cores operate between 75–85%, confirming efficient multi-thread scaling +- Load Average: around 5.0, indicating balanced workload distribution +- Memory Usage: approximately 4.5 GB total for the TinyLlama Q8_0 model +- Process List: displays multiple `llama-cli` threads (each 7–9% CPU), confirming OpenMP parallelism + +{{% notice Note %}} +In htop, press F6 to sort by CPU% and verify load distribution, or press `t` to toggle the tree view, which shows the `llama-cli` main process and its worker threads. +{{% /notice %}} + +## What you have accomplished + +In this section you have: +- Built and validated the CPU-only version of llama.cpp. +- Optimized the Grace CPU build using Armv9 vector extensions (SVE2, BF16, I8MM). +- Tested quantized model inference using the TinyLlama Q8_0 model. +- Used monitoring tools (htop) to confirm efficient CPU utilization. + +You have now successfully built and validated the CPU-only version of llama.cpp on the Grace CPU. In the next section, you will learn how to use the Process Watch tool to visualize instruction-level execution and better understand how Armv9 vectorization (SVE2 and NEON) accelerates quantized LLM inference on the Grace CPU. 
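+
+If you recorded a GPU baseline with `llama-bench` earlier, you can repeat the measurement on the CPU-only build for a like-for-like comparison. This is an optional sketch; keep the `-p` and `-n` values identical so the results are comparable:
+
+```bash
+cd ~/llama.cpp/build-cpu
+./bin/llama-bench \
+  -m ~/models/TinyLlama-1.1B/tinyllama-1.1b-chat-v1.0.Q8_0.gguf \
+  -t 20 -p 512 -n 128
+```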
diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/4_gb10_processwatch.md b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/4_gb10_processwatch.md new file mode 100644 index 0000000000..d57eb5d3d0 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/4_gb10_processwatch.md @@ -0,0 +1,213 @@ +--- +title: Analyze CPU instruction mix using Process Watch +weight: 6 +layout: "learningpathall" +--- + +## How can I analyze the instruction mix on the CPU using Process Watch? + +In this section, you'll explore how the Grace CPU executes Armv9 vector instructions during quantized LLM inference. + +Process Watch helps you observe Neon SIMD instruction execution on the Grace CPU and understand why SVE and SVE2 remain inactive under the current kernel configuration. This demonstrates how Armv9 vector execution works in AI workloads and shows the evolution from traditional SIMD pipelines to scalable vector computation. + +## Install and configure Process Watch + +First, install the required packages: + +```bash +sudo apt update +sudo apt install -y git cmake build-essential libncurses-dev libtinfo-dev +``` + +Now clone and build Process Watch: + +```bash +cd ~ +git clone --recursive https://github.com/intel/processwatch.git +cd processwatch +./build.sh +sudo ln -s ~/processwatch/processwatch /usr/local/bin/processwatch +``` + +Process Watch requires elevated privileges to access kernel performance counters and eBPF features. + +Run the following commands to enable the required permissions: + +```bash +sudo setcap CAP_PERFMON,CAP_BPF=+ep ./processwatch +sudo sysctl -w kernel.perf_event_paranoid=-1 +sudo sysctl kernel.unprivileged_bpf_disabled=0 +``` + +These commands grant Process Watch access to performance monitoring and eBPF tracing capabilities. + +Verify the installation: + +```bash +./processwatch --help +``` + +You should see a usage summary similar to: + +```output +usage: processwatch [options] +options: + -h Displays this help message. + -v Displays the version. + -i Prints results every seconds. + -n Prints results for intervals. + -c Prints all results in CSV format to stdout. + -p Only profiles . + -m Displays instruction mnemonics, instead of categories. + -s Profiles instructions with a sampling period of . Defaults to 100000 instructions (1 in 100000 instructions). + -f Can be used multiple times. Defines filters for columns. Defaults to 'FPARMv8', 'NEON', 'SVE' and 'SVE2'. + -a Displays a column for each category, mnemonic, or extension. This is a lot of output! + -l Prints a list of all available categories, mnemonics, or extensions. + -d Prints only debug information. +``` + +You can run a quantized TinyLlama model on the Grace CPU to generate the instruction activity. + +Use the same CPU-only llama.cpp build created in the previous section: + +```bash +cd ~/llama.cpp/build-cpu/bin +./llama-cli \ + -m ~/models/TinyLlama-1.1B/tinyllama-1.1b-chat-v1.0.Q8_0.gguf \ + -ngl 0 \ + -t 20 \ + -p "Explain the benefits of vector processing in modern Arm CPUs." +``` + +Keep this terminal running while the model generates text output. You can now attach Process Watch to this active process. Once the llama.cpp process is running on the Grace CPU, attach Process Watch to observe its live instruction activity. 
+ +If only one `llama-cli` process is running, you can directly launch Process Watch without manually checking its PID: + +```bash +sudo processwatch --pid $(pgrep llama-cli) +``` + +If multiple processes are running, first identify the correct process ID: + +```bash +pgrep llama-cli +``` + +Then attach Process Watch to monitor the instruction mix of this process: + +```bash +sudo processwatch --pid +``` + +Replace `` with the actual process ID from the previous command. + +{{% notice Note %}} +`processwatch --list` does not display all system processes. +It is intended for internal use and may not list user-level tasks like llama-cli. +Use `pgrep` or `ps -ef | grep llama` or `htop` to identify process IDs before attaching. +{{% /notice %}} + +Process Watch displays a live instruction breakdown similar to the following: + +```output +PID NAME FPARMv8 NEON SVE SVE2 %TOTAL TOTAL +ALL ALL 5.07 15.23 0.00 0.00 100.00 29272 +72930 llama-cli 5.07 15.23 0.00 0.00 100.00 29272 + +PID NAME FPARMv8 NEON SVE SVE2 %TOTAL TOTAL +ALL ALL 2.57 9.95 0.00 0.00 100.00 69765 +72930 llama-cli 2.57 9.95 0.00 0.00 100.00 69765 + +PID NAME FPARMv8 NEON SVE SVE2 %TOTAL TOTAL +ALL ALL 1.90 6.61 0.00 0.00 100.00 44249 +72930 llama-cli 1.90 6.61 0.00 0.00 100.00 44249 + +PID NAME FPARMv8 NEON SVE SVE2 %TOTAL TOTAL +ALL ALL 2.60 10.16 0.00 0.00 100.00 71049 +72930 llama-cli 2.60 10.16 0.00 0.00 100.00 71049 + +PID NAME FPARMv8 NEON SVE SVE2 %TOTAL TOTAL +ALL ALL 2.12 7.56 0.00 0.00 100.00 68553 +72930 llama-cli 2.12 7.56 0.00 0.00 100.00 68553 + +PID NAME FPARMv8 NEON SVE SVE2 %TOTAL TOTAL +ALL ALL 2.52 9.40 0.00 0.00 100.00 65339 +72930 llama-cli 2.52 9.40 0.00 0.00 100.00 65339 + +PID NAME FPARMv8 NEON SVE SVE2 %TOTAL TOTAL +ALL ALL 2.34 7.76 0.00 0.00 100.00 42015 +72930 llama-cli 2.34 7.76 0.00 0.00 100.00 42015 + +PID NAME FPARMv8 NEON SVE SVE2 %TOTAL TOTAL +ALL ALL 2.66 9.77 0.00 0.00 100.00 74616 +72930 llama-cli 2.66 9.77 0.00 0.00 100.00 74616 + +PID NAME FPARMv8 NEON SVE SVE2 %TOTAL TOTAL +ALL ALL 2.15 7.06 0.00 0.00 100.00 58496 +72930 llama-cli 2.15 7.06 0.00 0.00 100.00 58496 + +PID NAME FPARMv8 NEON SVE SVE2 %TOTAL TOTAL +ALL ALL 2.61 9.34 0.00 0.00 100.00 73365 +72930 llama-cli 2.61 9.34 0.00 0.00 100.00 73365 + +PID NAME FPARMv8 NEON SVE SVE2 %TOTAL TOTAL +ALL ALL 2.52 8.37 0.00 0.00 100.00 26566 +72930 llama-cli 2.52 8.37 0.00 0.00 100.00 26566 +``` + +Here is an interpretation of the values: +- NEON: 7–15% for SIMD integer and floating-point operations +- FPARMv8: 2-5% for scalar FP operations such as activation and normalization +- SVE/SVE2: 0%, the kernel does not issue SVE instructions + +This confirms that the Grace CPU performs quantized inference primarily using NEON. + +## Why are SVE and SVE2 inactive? + +Although the Grace CPU supports SVE and SVE2, the vector length is 16 bytes (128-bit). + +Verify the current setting: + +```bash +cat /proc/sys/abi/sve_default_vector_length +``` + +The output is similar to: + +```output +16 +``` + +Even if you try to increase the length it cannot be changed. + +```bash +echo 256 | sudo tee /proc/sys/abi/sve_default_vector_length +``` + +This behavior is expected because SVE is available but fixed at 128 bits. + +{{% notice Note %}} +Future kernel updates might introduce SVE2 instructions. +{{% /notice %}} + +## What you've accomplished and what's next + +You have completed the Learning Path for analyzing large language model inference on the DGX Spark platform with Arm-based Grace CPUs and Blackwell GPUs. 
+ +Throughout this Learning Path, you have learned how to: + +- Set up your DGX Spark system with the required Arm software stack and CUDA 13 environment +- Build and validate both GPU-accelerated and CPU-only versions of llama.cpp for quantized LLM inference +- Download and run quantized TinyLlama models for efficient testing and benchmarking +- Monitor GPU utilization and performance using tools like nvtop +- Analyze CPU instruction mix with Process Watch to understand how Armv9 vector instructions are used during inference +- Interpret the impact of NEON, SVE, and SVE2 on AI workloads, and recognize current kernel limitations for vector execution + +By completing these steps, you are now equipped to: + +- Profile and optimize LLM workloads on Arm-based systems +- Identify performance bottlenecks and opportunities for acceleration on both CPU and GPU +- Prepare for future enhancements in Armv9 vector processing and software support +- Confidently deploy and monitor AI inference on modern Arm server platforms +For additional learning, see the resources in the Further Reading section. You can continue experimenting with different models and monitoring tools as new kernel updates become available. + diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/_index.md b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/_index.md new file mode 100644 index 0000000000..5d5a37757f --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/_index.md @@ -0,0 +1,61 @@ +--- +title: Unlock quantized LLM performance on Arm-based NVIDIA DGX Spark + +minutes_to_complete: 60 + +who_is_this_for: This is an introductory topic for AI practitioners, performance engineers, and system architects who want to learn how to deploy and optimize quantized large language models (LLMs) on NVIDIA DGX Spark systems powered by the Grace-Blackwell (GB10) architecture. 
+ +learning_objectives: + - Describe the Grace–Blackwell (GB10) architecture and its support for efficient AI inference + - Build CUDA-enabled and CPU-only versions of llama.cpp for flexible deployment + - Validate the functionality of both builds on the DGX Spark platform + - Analyze how Armv9 SIMD instructions accelerate quantized LLM inference on the Grace CPU + +prerequisites: + - Access to an NVIDIA DGX Spark system with at least 15 GB of available disk space + - Familiarity with command-line interfaces and basic Linux operations + - Understanding of CUDA programming basics and GPU/CPU compute concepts + - Basic knowledge of quantized large language models (LLMs) and machine learning inference + - Experience building software from source using CMake and make + + +author: Odin Shen + +### Tags +skilllevels: Introductory +subjects: ML +armips: + - Cortex-A + - Cortex-X +operatingsystems: + - Linux +tools_software_languages: + - Python + - C + - Bash + - llama.cpp + +further_reading: + - resource: + title: NVIDIA DGX Spark website + link: https://www.nvidia.com/en-gb/products/workstations/dgx-spark/ + type: website + - resource: + title: NVIDIA DGX Spark Playbooks GitHub repository + link: https://github.com/NVIDIA/dgx-spark-playbooks + type: documentation + - resource: + title: Profile llama.cpp performance with Arm Streamline and KleidiAI LLM kernels Learning Path + link: https://learn.arm.com/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/ + type: blog + - resource: + title: Arm-Powered NVIDIA DGX Spark Workstations to Redefine AI + link: https://newsroom.arm.com/blog/arm-powered-nvidia-dgx-spark-ai-workstations + type: blog + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/_next-steps.md b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/htop.png b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/htop.png new file mode 100644 index 0000000000..0bcd461ce8 Binary files /dev/null and b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/htop.png differ diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/nvtop.png b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/nvtop.png new file mode 100644 index 0000000000..dbdb78ef15 Binary files /dev/null and b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/nvtop.png differ diff --git a/content/learning-paths/laptops-and-desktops/win11-vm-automation/prerequisites-1.md b/content/learning-paths/laptops-and-desktops/win11-vm-automation/prerequisites-1.md index 3261fc7983..598c23fee6 100644 --- a/content/learning-paths/laptops-and-desktops/win11-vm-automation/prerequisites-1.md +++ b/content/learning-paths/laptops-and-desktops/win11-vm-automation/prerequisites-1.md @@ -44,7 +44,7 @@ Verify your system supports KVM by running: ```console sudo apt install cpu-checker -y -sudo kvm-ok +kvm-ok ``` If KVM is available, you will see the messages: @@ -59,6 +59,13 @@ This confirms that: - The KVM kernel module is loaded - The `/dev/kvm` device exists +Add your user account to the KVM group: + +```console +sudo usermod -a -G kvm $USER +newgrp kvm +``` + ## Install required software The scripts require several software packages. @@ -67,7 +74,7 @@ Install the packages using the Linux package manager. ```console sudo apt update -sudo apt install qemu-system-arm qemu-utils genisoimage wget curl jq uuid-runtime -y +sudo apt install qemu-system-arm qemu-utils genisoimage wget curl jq uuid-runtime seabios -y ``` If needed, the [Remmina](https://remmina.org/) remote desktop (RDP) client is automatically installed by the run script so you don't need to install it now, but you can install it using this command: @@ -76,7 +83,4 @@ If needed, the [Remmina](https://remmina.org/) remote desktop (RDP) client is au sudo apt install remmina remmina-plugin-rdp -y ``` - You’ve verified your system requirements and you’re now ready to move on and start working with Windows on Arm virtual machines. - - diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/processing-workflow.md b/content/learning-paths/mobile-graphics-and-gaming/android_halide/processing-workflow.md index 9536a75bfc..2a661b771d 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/android_halide/processing-workflow.md +++ b/content/learning-paths/mobile-graphics-and-gaming/android_halide/processing-workflow.md @@ -8,7 +8,7 @@ layout: "learningpathall" --- ## Objective -In this section, you will build a real-time camera processing pipeline using Halide. First, you capture video frames from a webcam using OpenCV, then implement a Gaussian (binomial) blur to smooth the captured images, followed by thresholding to create a clear binary output highlighting prominent image features. After establishing this pipeline, you will measure performance and then explore Halide's scheduling options—parallelization and tiling—to understand when they help and when they don’t. +In this section, you will build a real-time camera processing pipeline using Halide. 
First, you capture video frames from a webcam using OpenCV, then implement a Gaussian (binomial) blur to smooth the captured images, followed by thresholding to create a clear binary output highlighting prominent image features. After establishing this pipeline, you will measure performance and then explore Halide's scheduling options—parallelization and tiling—to understand when they help and when they don’t. ## Gaussian blur and thresholding Create a new `camera-capture.cpp` file and modify it as follows: @@ -58,7 +58,7 @@ int main() { input.dim(2).set_stride(1); input.dim(2).set_bounds(0, 3); - // Clamp borders + // Clamp borders Func inputClamped = BoundaryConditions::repeat_edge(input); // Grayscale conversion (Rec.601 weights) @@ -68,7 +68,7 @@ int main() { 0.587f * inputClamped(x, y, 1) + 0.299f * inputClamped(x, y, 2)); - // 3×3 binomial blur + // 3×3 binomial blur Func blur("blur"); const uint16_t k[3][3] = {{1,2,1},{2,4,2},{1,2,1}}; Expr sum = cast(0); @@ -77,11 +77,11 @@ int main() { sum += cast(gray(x + i - 1, y + j - 1)) * k[j][i]; blur(x, y) = cast(sum / 16); - // Threshold fused with blur + // Threshold fused with blur Func output("output"); Expr T = cast(128); output(x, y) = select(blur(x, y) > T, cast(255), cast(0)); - + // Allocate output buffer once Buffer outBuf(width, height); @@ -212,7 +212,7 @@ The output should look as in the figure below: ## Parallelization and Tiling In this section, you will explore two complementary scheduling optimizations provided by Halide: Parallelization and Tiling. Both techniques help enhance performance but achieve it through different mechanisms—parallelization leverages multiple CPU cores, whereas tiling improves cache efficiency by optimizing data locality. -Now you will learn how to use each technique separately for clarity and to emphasize their distinct benefits. +Now you will learn how to use each technique separately for clarity and to emphasize their distinct benefits. Let’s first lock in a measurable baseline before we start changing the schedule. You will create a second file, `camera-capture-perf-measurement.cpp`, that runs the same grayscale → blur → threshold pipeline but prints per-frame timing, FPS, and MPix/s around the Halide realize() call. This lets you quantify each optimization you will add next (parallelization, tiling, caching). @@ -254,7 +254,7 @@ int main() { const int width = frame.cols; const int height = frame.rows; - const int ch = frame.channels(); + const int ch = frame.channels(); // Build the pipeline once (outside the capture loop) ImageParam input(UInt(8), 3, "input"); @@ -286,8 +286,11 @@ int main() { Expr T = cast(128); output(x, y) = select(blur(x, y) > T, cast(255), cast(0)); - // Baseline schedule: materialize gray; fuse blur+threshold into output - gray.compute_root(); + // Scheduling + { + // Baseline schedule: materialize gray; fuse blur+threshold into output + gray.compute_root(); + } // Allocate output buffer once & JIT once Buffer outBuf(width, height); @@ -336,7 +339,7 @@ int main() { return 0; } ``` - + * The console prints ms, FPS, and MPix/s per frame, measured strictly around realize() (camera capture and UI are excluded). * The first frame is labeled [warm-up] because it includes Halide's JIT compilation. You can ignore it when comparing schedules. * MPix/s = (width*height)/seconds is a good resolution-agnostic metric to compare schedule variants. @@ -344,128 +347,90 @@ int main() { Build and run the application. 
Here is the sample output:
```console
-% ./camera-capture-perf-measurement
-realize: 4.84 ms | 206.53 FPS | 428.25 MPix/s
+% ./camera-capture-perf-measurement
+realize: 3.98 ms | 251.51 FPS | 521.52 MPix/s
```
-This gives an FPS of 206.53, and average throughput of 428.25 MPix/s. Now you can start measuring potential improvements from scheduling.
+This gives an FPS of 251.51, and an average throughput of 521.52 MPix/s. Now you can start measuring potential improvements from scheduling.
### Parallelization
Parallelization lets Halide run independent pieces of work at the same time on multiple CPU cores. In image pipelines, rows (or row tiles) are naturally parallel once producer data is available. By distributing work across cores, we reduce wall-clock time—crucial for real-time video.
-With the baseline measured, apply a minimal schedule that parallelizes the blur reduction across rows while keeping the final stage explicit at root. This avoids tricky interactions between a parallel consumer and an unscheduled reduction.
+With the baseline measured, apply a minimal schedule that parallelizes the loop iterations over the y axis.
-Add these lines after defining output(x, y) (and before any realize()):
+Add these lines after defining output(x, y) (and before any realize()), replacing the existing scheduling block in the sample code:
```cpp
-blur.compute_root().parallel(y); // parallelize reduction across scanlines
-output.compute_root(); // cheap pixel-wise stage at root
+// Scheduling
+{
+ // parallelize across scanlines
+ gray.compute_root().parallel(y);
+ output.compute_root().parallel(y);
+}
```
This does two important things:
-* compute_root() on blur moves the reduction to the top level, so it isn’t nested under a parallel loop that might complicate reduction ordering.
-* parallel(y) parallelizes over the pure loop variable y (rows), not the reduction domain r, which is the safe/idiomatic way to parallelize reductions in Halide.
+* compute_root() on gray splits the processing into two loop nests: one that computes the entire gray image, and one that computes the final output.
+* parallel(y) parallelizes over the pure loop variable y (rows), so different rows are computed on different CPU cores in parallel.
Now rebuild and run the application again. The results should look like:
```output
% ./camera-capture-perf-measurement
-realize: 3.80 ms | 263.07 FPS | 545.49 MPix/s
+realize: 1.16 ms | 864.15 FPS | 1791.90 MPix/s
```
-That’s ≈20% faster than baseline.
+The performance gain from parallelization depends on how many CPU cores are available to the application.
### Tiling
Tiling is a scheduling technique that divides computations into smaller, cache-friendly blocks or tiles. This approach significantly enhances data locality, reduces memory bandwidth usage, and leverages CPU caches more efficiently. While tiling can also use parallel execution, its primary advantage comes from optimizing intermediate data storage.
Tiling splits the image into cache-friendly blocks (tiles). Two wins:
* Partitioning: tiles are easy to parallelize across cores.
-* Locality: when you cache intermediates per tile, you avoid refetching/recomputing data and hit L1/L2 more often.
+* Locality: when you cache intermediates per tile, you avoid refetching/recomputing data and hit the CPU L1/L2 caches more often.
Now let's look at both flavors. 
### Tiling with explicit intermediate storage (best for cache efficiency)
Here you will cache gray once per tile so the 3×3 blur can reuse it instead of recomputing RGB -> gray up to 9× per output pixel.

-Before using this, remove any earlier compute_root().parallel(y) schedule for blur.
-
```cpp
-// After defining: input, gray, blur, thresholded
-Halide::Var xo("xo"), yo("yo"), xi("xi"), yi("yi");
-
-// Tile & parallelize the consumer; vectorize inner x on planar output.
-output
- .tile(x, y, xo, yo, xi, yi, 128, 64)
- .vectorize(xi, 16)
- .parallel(yo);
-
-// Compute blur inside each tile and vectorize its inner x.
-blur
- .compute_at(output, xo)
- .vectorize(x, 16);
-
-// Cache RGB→gray per tile (reads interleaved input → keep unvectorized).
-gray
- .compute_at(output, xo)
- .store_at(output, xo);
+// Scheduling
+{
+ Halide::Var xo("xo"), yo("yo"), xi("xi"), yi("yi");
+
+ // Tile & parallelize the consumer
+ output
+ .tile(x, y, xo, yo, xi, yi, 128, 64)
+ .parallel(yo);
+
+ // Cache RGB→gray per tile
+ gray
+ .compute_at(output, xo)
+ .store_at(output, xo);
+}
```

In this scheduling:
* tile(...) splits the image into cache-friendly blocks and makes it easy to parallelize across tiles.
-* blur.compute_at(thresholded, xo) localizes the blur computation to each tile (it doesn’t force storing blur; it just computes it where it’s needed, keeping the working set small).
+* parallel(yo) distributes tiles across CPU cores, with each core handling one row (yo) of tiles.
* gray.compute_at(...).store_at(...) materializes a tile-local planar buffer for the grayscale intermediate so blur can reuse it within the tile.
-* Vectorization is applied only to planar stages (blur, thresholded), gray stays unvectorized because it reads interleaved input (x-stride = channels).

Recompile your application as before, then run. What we observed on our machine:

```output
-realize: 2.36 ms | 423.10 FPS | 877.34 MPix/s
+realize: 0.98 ms | 1023.15 FPS | 2121.60 MPix/s
```

-This was the fastest variant here—caching a planar grayscale per tile enabled efficient reuse and vectorized blur reads.
+This was the fastest variant here—caching a planar grayscale per tile enabled efficient reuse.

-### Tiling for parallelization (without explicit intermediate storage)
-Tiling can also be used just to partition work across cores, without caching intermediates. This keeps the schedule simple: you split the output into tiles, parallelize across tiles, and vectorize along unit-stride x. Producers are computed inside each tile to keep the working set small, but don’t materialize extra tile-local buffers:
-```cpp
-// Tiling (partitioning only)
-Halide::Var xo("xo"), yo("yo"), xi("xi"), yi("yi");
+### How we schedule
+In general, there is no one-size-fits-all schedule that achieves the best performance: the right choice depends on your pipeline characteristics and the target device architecture. You are therefore encouraged to explore different scheduling options, and that is exactly what Halide's scheduling API is designed for.

-output
- .tile(x, y, xo, yo, xi, yi, 128, 64) // try 128x64; tune per CPU
- .vectorize(xi, 16) // safe: planar, unit-stride along x
- .parallel(yo); // run tiles across cores
-
-blur
- .compute_at(output, xo) // keep work tile-local
- .vectorize(x, 16); // vectorize planar blur
-```
-
-What this does
-* tile(...) splits the image into cache-friendly blocks and makes parallelization straightforward.
-* parallel(yo) distributes tiles across CPU cores. 
-* compute_at(thresholded, xo) evaluates blur per tile (better locality) without forcing extra storage. -* Vectorization is applied to planar stages (blur, thresholded). - -Recompile your application as before, then run. On our test machine, we got 5.56 ms (179.91 FPS, 373.07 MPix/s). This is slower than both the baseline and the parallelization-only schedule. The main reasons: -* Recomputation of gray: with a 3×3 blur, each output reuses up to 9 neighbors; leaving gray inlined means RGB→gray is recomputed for each tap. -* Interleaved input: gray reads BGR interleaved data (x-stride = channels), limiting unit-stride vectorization efficiency upstream. -* Overhead vs. work: a 3×3 blur has low arithmetic intensity; extra tile/task overhead isn’t amortized. - -Tiling without caching intermediates mainly helps partition work, but for tiny kernels on CPU (and interleaved sources) it often underperforms. The earlier “quick win” (blur.compute_root().parallel(y)) remains the better choice here. - -### Tiling vs. parallelization -* Parallelization spreads independent work across CPU cores. For this pipeline, the safest/most effective quick win was: -```cpp -blur.compute_root().parallel(y); -thresholded.compute_root(); -``` -* Tiling for cache efficiency helps when an expensive intermediate is reused many times per output (e.g., larger kernels, separable/multi-stage pipelines, multiple consumers) and when producers read planar data. Caching gray per tile with a tiny 3×3 kernel over an interleaved source added overhead and ran slower. -* Tiling for parallelization (partitioning only) simplifies work distribution and enables vectorization of planar stages, but with low arithmetic intensity (3×3) and an interleaved source it underperformed here. - -When to choose what: -* Start with parallelizing the main reduction at root. -* Add tiling + caching only if: kernel ≥ 5×5, separable/multi-pass blur, or the intermediate is reused by multiple consumers—and preferably after converting sources to planar (or precomputing a planar gray). -* Keep stages that read interleaved inputs unvectorized; vectorize only planar consumers. +For example of this application: +* Start with parallelizing the outer-most loop. +* Add tiling + caching only if: there is a spatial filter, or the intermediate is reused by multiple consumers—and preferably after converting sources to planar (or precomputing a planar gray). +* From there, tune tile sizes and thread count for your target. `HL_NUM_THREADS` is the environmental variable which allows you to limit the number of threads in-flight. ## Summary -In this section, you built a real-time Halide+OpenCV pipeline—grayscale, a 3×3 binomial blur, then thresholding—and instrumented it to measure throughput. The baseline landed at 4.84 ms (206.53 FPS, 428.25 MPix/s). A small, safe schedule tweak that parallelizes the blur reduction across rows improved performance to 3.80 ms (263.07 FPS, 545.49 MPix/s)—about +20%. A tiling schedule used only for partitioning was slower at 5.56 ms (179.91 FPS, 373.07 MPix/s). In contrast, tiling with a cached per-tile grayscale (so the blur reuses a planar intermediate) was the fastest at 2.36 ms (423.10 FPS, 877.34 MPix/s). - -The pattern is clear. On CPU, with a small kernel and an interleaved camera source, the most reliable first step is to parallelize the main reduction across rows. Tiling pays off when you also cache a reused intermediate (e.g., a planar grayscale) so downstream stages get unit-stride, vectorizable access and better locality. 
Keep stages that read interleaved inputs unvectorized; vectorize planar consumers. From there, tune tile sizes and thread count for your target. Boundary conditions are handled once with repeat_edge, keeping edge behavior consistent and scheduling clean. +In this section, you built a real-time Halide+OpenCV pipeline—grayscale, a 3×3 binomial blur, then thresholding—and instrumented it to measure throughput. And then, we observed that parallelization and tiling improved the performance. +* Parallelization spreads independent work across CPU cores. +* Tiling for cache efficiency helps when an expensive intermediate is reused many times per output (e.g., larger kernels, separable/multi-stage pipelines, multiple consumers) and when producers read planar data. diff --git a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/1-dev-env-setup.md b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/1-dev-env-setup.md index ff573c391e..454856d8be 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/1-dev-env-setup.md +++ b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/1-dev-env-setup.md @@ -13,61 +13,109 @@ In this Learning Path, you will learn how to build and deploy a simple LLM-based The first step is to prepare a development environment with the required software: - Android Studio (latest version recommended). -- Android NDK version 28.0.12433566. +- Android NDK version 28.0.12433566 or later. - Java 17 JDK. - Git. - Python 3.10 or later (these instructions have been tested with 3.10 and 3.12). The instructions assume macOS with Apple Silicon, an x86 Debian, or an Ubuntu Linux machine, with at least 16GB of RAM. -## Install Android Studio and Android NDK +## Install Java 17 JDK + +Open the [Java SE 17 Archive Downloads](https://www.oracle.com/java/technologies/javase/jdk17-archive-downloads.html) page in your browser. + +Select an appropriate download for your development machine operating system. -Follow these steps to install and configure Android Studio: +Downloads are available for macOS as well as Linux. -1. Download and install the latest version of [Android Studio](https://developer.android.com/studio/). +## Install and configure Android Studio -2. Start Android Studio and open the **Settings** dialog. +Start by downloading and installing the latest version of Android Studio by navigating to the Downloads page: -3. Navigate to **Languages & Frameworks**, then **Android SDK**. +``` +https://developer.android.com/studio/ +``` -4. In the **SDK Platforms** tab, check **Android 14.0 ("UpsideDownCake")**. +### For MacOS: Using UI -Next, install the specific version of the Android NDK that you require by first installing the Android command line tools: +Follow these steps to configure Android Studio: -Linux: +1. Start Android Studio and open the **Settings** dialog. +2. Navigate to **Languages & Frameworks**, then **Android SDK**. + +3. In the **SDK Platforms** tab, check **Android 14.0 ("UpsideDownCake")**. Click **Apply** to install. + +4. In the **SDK Tools** tab, check **NDK (Side by side)**. Click **Apply** to install. 
+ +Initiate the `ANDROID_HOME` environment variable: + +```bash +export ANDROID_HOME="$(realpath ~/Library/Android/sdk)" ``` -curl https://dl.google.com/android/repository/commandlinetools-linux-11076708_latest.zip -o commandlinetools.zip -``` -macOS: +### For Linux: Using the CLI + +Command-line tools allow you to manage Android SDK components without the GUI. Create SDK directory and download command-line tools: +```bash +mkdir -p ~/Android/cmdline-tools +cd ~/Android/cmdline-tools +wget https://dl.google.com/android/repository/commandlinetools-linux-10406996_latest.zip ``` -curl https://dl.google.com/android/repository/commandlinetools-mac-11076708_latest.zip -o commandlinetools.zip + +Unzip and move into the `cmdline-tools` directory. + +```bash +unzip commandlinetools-linux-*.zip +mv cmdline-tools latest ``` -Unzip the Android command line tools: +Initiate the `ANDROID_HOME` environment variable and add the `sdkmanager` to `PATH`: +```bash +export ANDROID_HOME="~/Android" +export PATH="$ANDROID_HOME/cmdline-tools/latest/bin:$ANDROID_HOME/platform-tools:$PATH" ``` -unzip commandlinetools.zip -d android-sdk + +The next step is to accept the license agreements. Press 'y', then 'Enter', as many times as prompted. + +```bash +sdkmanager --licenses ``` -Install the NDK in the same directory that Android Studio installed the SDK. This is generally `~/Library/Android/sdk` by default. Set the requirement environment variables: +Finally, you can install the required Android SDK components: +```bash +sdkmanager "platform-tools" \ + "platforms;android-34" \ + "build-tools;34.0.0" \ + "ndk;29.0.14206865" ``` -export ANDROID_HOME="$(realpath ~/Library/Android/sdk)" -export PATH=$ANDROID_HOME/cmdline-tools/bin/:$PATH -sdkmanager --sdk_root="${ANDROID_HOME}" --install "ndk;28.0.12433566" -export ANDROID_NDK=$ANDROID_HOME/ndk/28.0.12433566/ + +## Verify NDK installation + +Verify by checking that the NDK was installed in the same directory that Android Studio installed the SDK. + +{{% notice Default Path %}} +On macOS, this is generally `~/Library/Android/sdk`, and on Linux, it's `~/Android/Sdk`. You should also update the command to use the installed NDK version. +{{% /notice %}} + +```bash +ls $ANDROID_HOME ``` -## Install Java 17 JDK +It should print the installed version, for example: -Open the [Java SE 17 Archive Downloads](https://www.oracle.com/java/technologies/javase/jdk17-archive-downloads.html) page in your browser. +```output +29.0.14206865 +``` -Select an appropriate download for your development machine operating system. +Set the required environment variable: -Downloads are available for macOS as well as Linux. +``` +export ANDROID_NDK="$ANDROID_HOME/ndk/29.0.14206865/" +``` ## Install Git and cmake diff --git a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/2-executorch-setup.md b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/2-executorch-setup.md index 590d49ade7..e0080cc54c 100755 --- a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/2-executorch-setup.md +++ b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/2-executorch-setup.md @@ -12,7 +12,9 @@ ExecuTorch is an end-to-end solution for enabling on-device inference capabiliti The best practice is to generate an isolated Python environment in which to install your ExecuTorch dependencies. 
We provide instructions for both a Python virtual environment and a Conda virtual environment, but you only need one. -### Option 1: Create a Python virtual environment +### Create a Python virtual environment + +Use the `venv` module that is available through Python: ```bash python3.10 -m venv executorch-venv @@ -21,24 +23,14 @@ source executorch-venv/bin/activate The prompt of your terminal has `executorch` as a prefix to indicate the virtual environment is active. -### Option 2: Create a Conda virtual environment - -Install Miniconda on your development machine by following the [Installing conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) instructions. - -Once `conda` is installed, create the environment: - -```bash -conda create -yn executorch-venv python=3.10.0 -conda activate executorch-venv -``` - ### Clone ExecuTorch and install the required dependencies -From within the conda environment, run the commands below to download the ExecuTorch repository and install the required packages: +From within the virtual environment, run the commands below to download the ExecuTorch repository and install the required packages: ``` bash git clone https://github.com/pytorch/executorch.git cd executorch +git checkout release/1.0 git submodule sync git submodule update --init --recursive ./install_executorch.sh diff --git a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/5-run-benchmark-on-android.md b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/5-run-benchmark-on-android.md index a9422241fc..ee654a3521 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/5-run-benchmark-on-android.md +++ b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/5-run-benchmark-on-android.md @@ -32,6 +32,7 @@ Use `cmake` to cross-compile ExecuTorch: cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI=arm64-v8a \ -DANDROID_PLATFORM=android-23 \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DCMAKE_INSTALL_PREFIX=cmake-out-android \ -DEXECUTORCH_ENABLE_LOGGING=1 \ -DCMAKE_BUILD_TYPE=Release \ @@ -69,11 +70,7 @@ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out-android \ -DCMAKE_BUILD_TYPE=Release \ -DPYTHON_EXECUTABLE=python \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DSUPPORT_REGEX_LOOKAHEAD=ON \ -DBUILD_TESTING=OFF \ -Bcmake-out-android/examples/models/llama \ examples/models/llama diff --git a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/6-build-android-chat-app.md b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/6-build-android-chat-app.md index f62301100b..e229ba48d1 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/6-build-android-chat-app.md +++ b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/6-build-android-chat-app.md @@ -6,17 +6,16 @@ weight: 7 layout: learningpathall --- +In this section, you will use a Android demo application to demonstrate local 
inference with ExecuTorch. + ## Build the Android Archive (AAR) -{{% notice Note %}} -You can use the Android demo application included in ExecuTorch repository [LlamaDemo](https://github.com/pytorch/executorch/tree/main/examples/demo-apps/android/LlamaDemo) to demonstrate local inference with ExecuTorch. -{{% /notice %}} 1. Open a terminal window and navigate to the root directory of the `executorch` repository. -2. Set the following environment variables: +2. If you haven't already, set the following environment variables: ``` bash - export ANDROID_NDK=$ANDROID_HOME/ndk/28.0.12433566/ + export ANDROID_NDK=$ANDROID_HOME/ndk/29.0.14206865/ export ANDROID_ABI=arm64-v8a ``` @@ -25,21 +24,12 @@ You can use the Android demo application included in ExecuTorch repository [Llam Make sure you can confirm /build/cmake/android.toolchain.cmake is available for CMake to cross-compile. {{% /notice %}} -3. Run the following commands to set up the required JNI library: +3. Run the following command to set up the required JNI library: ``` bash - pushd extension/android - ./gradlew build - popd - pushd examples/demo-apps/android/LlamaDemo - ./gradlew :app:setup - popd + sh scripts/build_android_library.sh ``` -{{% notice Note %}} -This is running the shell script setup.sh which configures and builds the required core ExecuTorch, Llama, and Android libraries. -{{% /notice %}} - ## Getting models Make sure the exported model and tokenizer are copied to the Android phone: @@ -74,21 +64,26 @@ If the files are not on the device, use the device explorer to copy them. ## Build the Android Package Kit +Before starting, you need to obtain the example app by cloning `executorch-examples`: + +``` +git clone https://github.com/meta-pytorch/executorch-examples.git +``` + ### Option 1: Using Android Studio This is the recommended option. -1. Open Android Studio and select **Open an existing Android Studio project** and navigate to open `examples/demo-apps/android/LlamaDemo`. +1. Open Android Studio and select **Open an existing Android Studio project** and navigate to open `executorch-examples/llm/android/LlamaDemo`. 2. Run the app (^R). This builds and launches the app on the phone. ### Option 2: Command line -Without Android Studio UI, you can run gradle directly to build the app. You need to set up the Android SDK path and invoke gradle. +Without Android Studio UI, you can run gradle directly to build the app. You need to set up the Android SDK path and invoke gradle. Navigate to the newly cloned `executorch-examples` repository. 
``` bash -export ANDROID_HOME= -pushd examples/demo-apps/android/LlamaDemo +pushd llm/android/LlamaDemo ./gradlew :app:installDebug popd ``` diff --git a/content/learning-paths/servers-and-cloud-computing/_index.md b/content/learning-paths/servers-and-cloud-computing/_index.md index 0c9685d2c2..1a686b2b6d 100644 --- a/content/learning-paths/servers-and-cloud-computing/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/_index.md @@ -8,7 +8,7 @@ key_ip: maintopic: true operatingsystems_filter: - Android: 3 -- Linux: 185 +- Linux: 191 - macOS: 13 - Windows: 14 pinned_modules: @@ -19,13 +19,13 @@ pinned_modules: - migration subjects_filter: - CI-CD: 9 -- Containers and Virtualization: 32 -- Databases: 19 +- Containers and Virtualization: 34 +- Databases: 20 - Libraries: 9 - ML: 32 -- Performance and Architecture: 72 +- Performance and Architecture: 73 - Storage: 2 -- Web: 14 +- Web: 16 subtitle: Optimize cloud native apps on Arm for performance and cost title: Servers and Cloud Computing tools_software_languages_filter: @@ -45,7 +45,8 @@ tools_software_languages_filter: - Arm Development Studio: 3 - Arm ISA: 1 - Arm Performance Libraries: 1 -- Arm Streamline: 1 +- Arm Performance Studio: 1 +- Arm Streamline: 2 - armclang: 1 - armie: 1 - ArmRAL: 1 @@ -64,8 +65,8 @@ tools_software_languages_filter: - AWS Lambda: 1 - Azure CLI: 2 - Azure Portal: 1 -- Bash: 1 - bash: 2 +- Bash: 1 - Bastion: 3 - BOLT: 2 - bpftool: 1 @@ -81,6 +82,7 @@ tools_software_languages_filter: - Clang: 13 - ClickBench: 1 - ClickHouse: 1 +- Cloud Build: 1 - CMake: 1 - conda: 1 - cqlsh: 1 @@ -91,10 +93,9 @@ tools_software_languages_filter: - Docker Buildx: 1 - Envoy: 3 - ExecuTorch: 1 -- Express: 1 - FAISS: 1 - FlameGraph: 1 -- Flink: 1 +- Flink: 2 - Fortran: 1 - FunASR: 1 - FVP: 7 @@ -106,6 +107,7 @@ tools_software_languages_filter: - GitHub Actions: 1 - GitHub CLI: 1 - GitLab: 1 +- GKE: 1 - glibc: 1 - Go: 4 - Golang: 1 @@ -121,7 +123,7 @@ tools_software_languages_filter: - Intrinsics: 1 - iPerf3: 1 - ipmitool: 1 -- Java: 5 +- Java: 6 - JAX: 1 - JMH: 1 - Kafka: 2 @@ -130,17 +132,19 @@ tools_software_languages_filter: - KEDA: 1 - Kedify: 1 - Keras: 1 -- Kubernetes: 11 +- Kubernetes: 12 - Libamath: 1 - libbpf: 1 - Linaro Forge: 1 +- Linux kernel: 1 - Litmus7: 1 -- llama.cpp: 1 - Llama.cpp: 2 +- llama.cpp: 1 - LLM: 10 - llvm-mca: 1 - LSE: 1 - MariaDB: 1 +- Maven: 1 - Memcached: 2 - MLPerf: 1 - ModelScope: 1 @@ -153,8 +157,10 @@ tools_software_languages_filter: - Networking: 1 - Nexmark: 1 - NGINX: 4 +- nginx: 1 - Node.js: 5 -- npm: 2 +- node.js: 1 +- npm: 3 - Ollama: 1 - ONNX Runtime: 2 - OpenBLAS: 1 @@ -165,21 +171,25 @@ tools_software_languages_filter: - PAPI: 1 - perf: 6 - Perf: 1 +- Performance analysis: 1 - PHP: 1 - PHPBench: 1 -- PostgreSQL: 4 +- PostgreSQL: 5 - Profiling: 1 - Python: 32 - PyTorch: 9 - QEMU: 1 - RAG: 1 +- Rails: 1 - Redis: 3 - Remote.It: 2 - RME: 8 +- Ruby: 1 - Runbook: 71 - Rust: 2 - Service Mesh: 1 - Siege: 1 +- Skaffold: 1 - snappy: 1 - Snort3: 1 - SQL: 8 @@ -198,12 +208,13 @@ tools_software_languages_filter: - Trusted Firmware: 1 - Trustee: 1 - TSan: 1 -- TypeScript: 1 +- TypeScript: 2 - Vectorscan: 1 - Veraison: 2 - Visual Studio Code: 5 - vLLM: 2 - vvenc: 1 +- Web Server: 1 - Whisper: 1 - WindowsPerf: 1 - WordPress: 3 @@ -216,7 +227,7 @@ tools_software_languages_filter: weight: 1 cloud_service_providers_filter: - AWS: 17 -- Google Cloud: 23 -- Microsoft Azure: 18 +- Google Cloud: 26 +- Microsoft Azure: 19 - Oracle: 2 --- diff --git 
a/content/learning-paths/servers-and-cloud-computing/circleci-gcp/_index.md b/content/learning-paths/servers-and-cloud-computing/circleci-gcp/_index.md index 264a2b50d8..5ed1b2542a 100644 --- a/content/learning-paths/servers-and-cloud-computing/circleci-gcp/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/circleci-gcp/_index.md @@ -7,7 +7,7 @@ cascade: minutes_to_complete: 45 -who_is_this_for: This learning path is intended for software developers and DevOps engineers looking to set up and run CircleCI Arm native workflows on SUSE Linux Arm64 VMs, specifically on Google Cloud C4A with Axion processors, using self-hosted runners. +who_is_this_for: This is an introductory topic for software developers and DevOps engineers looking to set up and run CircleCI Arm native workflows on SUSE Linux Arm64 VMs, specifically on Google Cloud C4A with Axion processors, using self-hosted runners. learning_objectives: - Provision a SUSE Arm64 virtual machine on Google Cloud (C4A with Axion processors) @@ -40,7 +40,6 @@ tools_software_languages: - CircleCI - Node.js - npm - - Express - Docker operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-gcp/background.md b/content/learning-paths/servers-and-cloud-computing/circleci-gcp/background.md index fa99445bc3..70e628a4f6 100644 --- a/content/learning-paths/servers-and-cloud-computing/circleci-gcp/background.md +++ b/content/learning-paths/servers-and-cloud-computing/circleci-gcp/background.md @@ -16,10 +16,10 @@ To learn more about Google Axion, refer to the [Introducing Google Axion Process ## CircleCI -CircleCI is a cloud-based **Continuous Integration and Continuous Delivery (CI/CD)** platform that automates the process of **building, testing, and deploying software**. +CircleCI is a cloud-based Continuous Integration and Continuous Delivery (CI/CD) platform that automates the process of building, testing, and deploying software. -It integrates with popular version control systems like **GitHub**, **Bitbucket**, and **GitLab**, and allows developers to define custom workflows in a `.circleci/config.yml` file using **YAML syntax**. +It integrates with popular version control systems like GitHub, Bitbucket, and GitLab, and allows developers to define custom workflows in a `.circleci/config.yml` file using the YAML syntax. -CircleCI supports multiple environments, including **Docker**, **Linux**, **macOS**, and **Windows**, and offers advanced features like **parallelism**, **caching**, and **matrix builds** to speed up pipelines and improve efficiency. +CircleCI supports multiple platforms, including Linux, macOS, and Windows, and offers advanced features like parallelism, caching, and matrix builds to speed up pipelines and improve efficiency. -It is widely used for **automating tests, running builds, deploying applications, and ensuring code quality** in modern development workflows. Learn more from the [CircleCI official website](https://circleci.com/) and its [documentation](https://circleci.com/docs/). +It is widely used for automating tests, running builds, deploying applications, and ensuring code quality in modern development workflows. Learn more from the [CircleCI official website](https://circleci.com/) and its [documentation](https://circleci.com/docs/). 
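+
+As a brief illustration of that format, a minimal `.circleci/config.yml` might look like the sketch below. The job name, Docker image tag, and commands are placeholder values used only to show the structure of a pipeline definition; they are not part of the workflow built later in this Learning Path:
+
+```yaml
+version: 2.1
+jobs:
+  build:
+    docker:
+      - image: cimg/node:20.11    # a CircleCI convenience image; any suitable image works
+    steps:
+      - checkout                  # fetch the source code from your repository
+      - run: npm ci && npm test   # install dependencies and run the test suite
+workflows:
+  build-and-test:
+    jobs:
+      - build
+```
+
+Each job defines an execution environment and a list of steps, and the workflows section controls how jobs are ordered and triggered.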
diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-gcp/circleci-arm64-cloud-demo.md b/content/learning-paths/servers-and-cloud-computing/circleci-gcp/circleci-arm64-cloud-demo.md index 780b8d1e96..f581576f2d 100644 --- a/content/learning-paths/servers-and-cloud-computing/circleci-gcp/circleci-arm64-cloud-demo.md +++ b/content/learning-paths/servers-and-cloud-computing/circleci-gcp/circleci-arm64-cloud-demo.md @@ -6,69 +6,97 @@ weight: 8 layout: learningpathall --- -## Deploying a Cloud-Native Arm64 Node.js App using self-hosted CircleCI Runner on GCP +## Deploying a Cloud-Native Arm64 Node.js App Using a Self-Hosted CircleCI Runner on GCP -This guide walks through building and testing a simple **Node.js web app** using a **self-hosted CircleCI Arm64 runner** on a **GCP SUSE Arm64 VM**. +This section demonstrates how to build and test a simple Node.js web application using a self-hosted CircleCI runner running on a Google Cloud C4A (Axion Arm64) SUSE Linux virtual machine. +You’ll configure Docker on the VM so that CircleCI jobs can build, test, and run containerized applications directly in your Arm64 environment, ideal for cloud-native development and CI/CD workflows targeting Arm architecture. -### Install and Configure Docker -Ensure Docker is installed, started, and accessible by both your user and the CircleCI runner service. -- **Install Docker**: Refresh your package manager and install Docker on your system. -- **Enable Docker Service**: Ensure Docker starts on boot and is running. -- **Add User to Docker Group**: Add both your user and the CircleCI runner to the Docker group to grant access. +### Install and Configure Docker +Ensure Docker is installed, enabled, and accessible by both your local user and the CircleCI runner service. -```console +1. Install Docker +Refresh your package manager and install Docker on your system: +```bash sudo zypper refresh sudo zypper install docker +``` +2. Enable and start Docker service +Set Docker to start automatically at boot and verify it’s running: +```bash sudo systemctl enable docker sudo systemctl start docker sudo systemctl status docker +``` +3. Grant Docker access to users +Add both your current user and the circleci system user to the Docker group so they can run Docker commands without sudo: +```bash sudo usermod -aG docker $USER sudo usermod -aG docker circleci ``` ### Validate Docker access -This command switches to the CircleCI user and checks if Docker is working correctly. +After installing Docker and adding the circleci user to the Docker group, verify that the CircleCI runner user can access Docker without requiring elevated privileges. -```console +Run the following commands: +```bash sudo -u circleci -i docker ps exit ``` ### Verify Docker Permissions -Check Docker socket permissions and ensure that the CircleCI runner is active and running. +Now, confirm that Docker’s socket permissions and the CircleCI runner service are both configured correctly. -```console +```bash ls -l /var/run/docker.sock ps -aux | grep circleci-runner ``` -- **Check Docker Socket Permissions**: This command ensures the Docker socket is accessible. -- **Verify CircleCI Runner Process**: Confirm the CircleCI runner service is active and running. +These commands ensure that the Docker socket is accessible and the CircleCI runner service is active and running. -### **Install Node.js and npm** +Once both checks pass, your environment is ready to build and run container-based pipelines with CircleCI on SUSE Arm64. 
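+
+As an optional, deeper check, you can confirm that the `circleci` user is able to run containers (not just query the Docker daemon) by launching a throwaway test container. The `hello-world` image is used here purely for verification:
+
+```bash
+sudo -u circleci docker run --rm hello-world
+```
+
+If the group membership is configured correctly, Docker pulls the image and prints its "Hello from Docker!" message without any permission errors; the `--rm` flag removes the stopped container afterwards.
+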
-Before proceeding with the app setup, please make sure **Node.js** and **npm** (Node.js package manager) are installed on the VM, as they are required to run your Node.js app. +### Install Node.js and npm -- **Install Node.js**: Use the official Node.js package for Arm64 architecture. -- **Install npm**: npm is automatically installed when Node.js is installed. +Before setting up the sample application, ensure that `Node.js` and its package manager `npm` are installed on your SUSE Arm64 VM. Both are required to run, build, and test the `Node.js` web application within your CircleCI pipeline. + +- Install Node.js: Install the official Node.js package built for the Arm64 architecture. +- Install npm: npm (Node Package Manager) is bundled with Node.js but can also be explicitly installed or upgraded if needed. ```console sudo zypper install nodejs sudo zypper install npm ``` -### Clone Your App Repository -Clone your application repository (or create one locally): +Next, you’ll create the demo project and prepare its CircleCI configuration to run jobs using your self-hosted Arm64 runner. + +### Create a repository for your example code +To store and manage your Node.js demo application, you’ll create a new GitHub repository using the GitHub CLI. + +1. Install the GitHub CLI +The GitHub CLI (gh) lets you manage repositories, issues, and pull requests directly from your terminal. + +```bash +sudo zypper install -y gh +``` +2. Authenticate with GitHub +Run the following command to connect the CLI to your GitHub account: + +```bash +gh auth login +``` + +3. Create a New Repository +Create a new public repository for your demo project and clone it locally: ```console -git clone https://github.com//arm64-node-demo.git +gh repo create arm64-node-demo --public --clone cd arm64-node-demo ``` ### Create a Dockerfile -In the root of your project, create a `Dockerfile` that defines how to build and run your application container. +In the root of your project, create a file named `Dockerfile` to define how your `Node.js` application container will be built and executed. -```dockerfile +```console # Dockerfile FROM arm64v8/node:20-alpine WORKDIR /app @@ -78,13 +106,18 @@ COPY . . EXPOSE 3000 CMD ["npm", "start"] ``` -- **Use Arm64 Node.js Image**: The `arm64v8/node` image is specifically designed for Arm64 architecture. -- **Install Dependencies**: `RUN npm install` installs the project dependencies listed in `package.json`. -- **Expose Port**: The app will run on port 3000. -- **Start the App**: The container will execute `npm start` to launch the Node.js server. +Breakdown of the Dockerfile: + +- Uses Arm64 Node.js Image: The `arm64v8/node` image is specifically designed for Arm64 architecture. +- Install Dependencies: `RUN npm install` installs the project dependencies listed in `package.json`. +- Expose Port: The app will run on port 3000. +- Start the App: The container will execute `npm start` to launch the Node.js server. + +Next, you’ll add the application code and a `.circleci/config.yml` file to automate the build and test pipeline using your self-hosted Arm64 runner. ### Add a CircleCI Configuration -Create a `.circleci/config.yml` file to define the CircleCI pipeline for building and testing your Node.js app on Arm64 architecture. +Create a configuration file that defines your CircleCI pipeline for building, running, and testing your Node.js app on Arm64 architecture. 
+In the root of your project, create a folder named `.circleci` and inside it, add a file called `config.yml` with the contents below: ```yaml version: 2.1 @@ -124,14 +157,17 @@ workflows: jobs: - arm64-demo ``` -- **arm64-demo Job**: This job checks if the architecture is Arm64, builds the Docker image, runs it in a container, and tests the app endpoint. -- **resource_class**: Specify the resource class for the CircleCI runner (e.g., a custom Arm64 runner if using self-hosted). -- **Test Endpoint**: The job sends a request to the app to verify it’s working. + +Explanation of the yaml file: + +- arm64-demo Job: This job checks if the architecture is Arm64, builds the Docker image, runs it in a container, and tests the app endpoint. +- resource_class: Specify the resource class for the CircleCI runner (e.g., a custom Arm64 runner if using self-hosted). +- Test Endpoint: The job sends a request to the app to verify it’s working. ### Node.js Application -Here’s the basic code for the Node.js app. +Create the application files in your repository root directory for the Node.js app. -`index.js`: +Use a file editor of your choice and copy the contents shown below into a file named `index.js`: ```javascript const express = require('express'); @@ -146,7 +182,8 @@ app.listen(PORT, () => { console.log(`Server running on port ${PORT}`); }); ``` -package.json + +Now copy the content below into a file named `package.json`: ```json { @@ -162,53 +199,65 @@ package.json } } ``` -- **Express Server**: The application uses Express.js to handle HTTP requests and respond with a simple message. -- **Package Dependencies**: The app requires the `express` package for handling HTTP requests. +- Express Server: The application uses Express.js to handle HTTP requests and respond with a simple message. +- Package Dependencies: The app requires the `express` package for handling HTTP requests. ### Push Code to GitHub -Once all files (`Dockerfile`, `index.js`, `package.json`, `.circleci/config.yml`) are ready, push your project to GitHub so CircleCI can build it automatically. +Now that all project files (Dockerfile, index.js, package.json, and .circleci/config.yml) are ready, push the code to GitHub. +This allows CircleCI to automatically detect the repository and trigger your Arm64 build pipeline using the self-hosted runner. + +Configure Git username and add and commit project files: ```console +git config --global user.name "your-user-name" git add . git commit -m "Add ARM64 CircleCI Node.js demo project" git push -u origin main ``` -- **Add and Commit Changes**: Stage and commit your project files. -- **Push to GitHub**: Push your code to the GitHub repository so that CircleCI can trigger the build. +You have pushed your code to the GitHub repository so that CircleCI can trigger the build. ### Start CircleCI Runner and Execute Job -Ensure that your CircleCI runner is enabled and started. This will allow your self-hosted runner to pick up jobs from CircleCI. +Before triggering your first workflow, ensure that the CircleCI runner service is enabled and running on your SUSE Arm64 VM. This will allow your self-hosted runner to pick up jobs from CircleCI. -```console +```bash sudo systemctl enable circleci-runner sudo systemctl start circleci-runner sudo systemctl status circleci-runner ``` -- **Enable CircleCI Runner**: Ensure the CircleCI runner is set to start automatically on boot. -- **Start and Check Status**: Start the CircleCI runner and verify it is running. 
+- Enable CircleCI Runner: Ensures the runner service starts automatically on system boot. +- Start and Check Status: Starts the CircleCI runner and verifies it is running. + -After pushing your code to GitHub, open your **CircleCI Dashboard → Projects**, and confirm that your **ARM64 workflow** starts running using your **self-hosted runner**. +### Verify Job Execution in CircleCI + +After pushing your code to GitHub, open your CircleCI Dashboard → Projects, and confirm that your Arm64 workflow starts running using your self-hosted runner. If the setup is correct, you’ll see your job running under the resource class you created. ### Output -Once the job starts running, CircleCI will: +When the CircleCI workflow starts running on your self-hosted Arm64 runner, you’ll see the following stages executed in your CircleCI Dashboard: -- Detect the ARM64 architecture. +1. Detect the ARM64 Architecture +CircleCI confirms the job is executing on your Arm64 self-hosted runner. This validates that the pipeline is correctly targeting your Google Cloud C4A (Axion) VM. ![CircleCI Dashboard alt-text#center](images/output1.png "Figure 1: Show architecture") -- Build the Docker image. +2. Build the Docker image +The runner builds the `arm64-node-demo` Docker image using the Dockerfile you defined. ![CircleCI Dashboard alt-text#center](images/output2.png "Figure 2: Docker Image") -- Runs a container from that image. +3. Runs a container from that Image +Once the image is built, the job launches a container to host your Node.js web app. ![CircleCI Dashboard alt-text#center](images/output4.png "Figure 3: Container Run") -- Test the application by hitting the endpoint. - +4. Test the application by hitting the endpoint. +The workflow tests the running app by sending an HTTP request to http://localhost:3000. ![CircleCI Dashboard alt-text#center](images/output3.png "Figure 3: Verify App") +If the app responds successfully, the test confirms that the Node.js web server is running correctly inside the container. If successful, you will see your CircleCI job running and the app deployed in the CircleCI Dashboard. + +This demonstrates an end-to-end cloud-native CI/CD workflow running natively on SUSE Arm64 with Google Cloud C4A (Axion) as a self-hosted runner on CircleCI. diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-gcp/create_resource_class.md b/content/learning-paths/servers-and-cloud-computing/circleci-gcp/create_resource_class.md index 6959ce5144..e890b61e69 100644 --- a/content/learning-paths/servers-and-cloud-computing/circleci-gcp/create_resource_class.md +++ b/content/learning-paths/servers-and-cloud-computing/circleci-gcp/create_resource_class.md @@ -7,36 +7,38 @@ layout: learningpathall --- ## Create a Resource Class for Self-Hosted Runner in CircleCI -This guide explains how to create a **Resource Class** in the **CircleCI Web Dashboard** for a **self-hosted runner**. -A Resource Class defines a unique identifier for your runner and links it to your CircleCI namespace, allowing CircleCI jobs to target your custom machine environment. +This section explains how to create a Resource Class in the CircleCI Web Dashboard for a self-hosted runner. +A Resource Class is a unique identifier that links your self-hosted runner to your CircleCI organization (namespace). It defines the “machine type” that CircleCI jobs can target, ensuring that only authorized jobs run on your managed infrastructure, in this case, your SUSE Linux Arm64 VM on Google Cloud C4A (Axion). ### Steps -1. 
**Go to the CircleCI Web Dashboard** - - From the left sidebar, navigate to **Self-Hosted Runners**. - - You’ll see a screen asking you to accept the **terms of use**. - - **Check the box** that says **“Yes, I agree to the terms”** to enable runners. - - Then click **Self-Hosted Runners** to continue setup. +1. Open the CircleCI Web Dashboard + - Login or Create a new account at [CircleCI](https://app.circleci.com/home) + - In the left-hand navigation panel, click Self-Hosted Runners. + - If this is your first time setting up runners, you’ll be prompted to accept the Terms of Use. + Check “Yes, I agree to the terms” to enable runner functionality for your organization. + - After accepting, click Self-Hosted Runners again to continue the setup process. ![Self-Hosted Runners alt-text#center](images/shrunner0.png "Figure 1: Self-Hosted Runners ") -2. **Create a New Resource Class** +2. Create a New Resource Class -Click **Create Resource Class** on your CircleCI dashboard. +On your CircleCI Dashboard, click Create Resource Class. -**Fill in the following details:** +Fill in the following details: - - **Namespace:** Your CircleCI username or organization name (e.g., `circleci`) - - **Resource Class Name:** A clear, descriptive name for your runner (e.g., `arm64`) - - Click **Create Resource Class**. + * Namespace: Your CircleCI organization or username (e.g., circleci) + * Resource Class Name: A clear, descriptive identifier for your runner (e.g., arm64) + * Once complete, click Create Resource Class to generate it. ![Self-Hosted Runners alt-text#center](images/shrunner1.png "Figure 2: Create Resource Class ") ![Self-Hosted Runners alt-text#center](images/shrunner2.png "Figure 3: Details Resource Class & Namespace") -3. **Save and Copy the Token** - - Once created, CircleCI will generate a **Resource Class Token**. - - Copy this token and store it securely — you will need it to register your runner on the GCP VM. +3. Save and Copy the Token + + After creating the resource class, CircleCI automatically generates a Resource Class Token, a secure authentication key used to register your runner. Copy this token immediately and store it in a secure location. +You’ll need this token in the next step to connect your SUSE Arm64 runner on the Google Cloud C4A (Axion) VM to CircleCI. ![Self-Hosted Runners alt-text#center](images/shrunner3.png "Figure 4: Resource Class Token") diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-gcp/install-circleci-cli.md b/content/learning-paths/servers-and-cloud-computing/circleci-gcp/install-circleci-cli.md index dbf2afdfa5..a456d42a4b 100644 --- a/content/learning-paths/servers-and-cloud-computing/circleci-gcp/install-circleci-cli.md +++ b/content/learning-paths/servers-and-cloud-computing/circleci-gcp/install-circleci-cli.md @@ -7,35 +7,52 @@ layout: learningpathall --- ## Install CircleCI CLI on GCP VM -This guide explains how to install the **CircleCI Command Line Interface (CLI)** on a **GCP SUSE Arm64 virtual machine**. -The CLI allows you to interact with CircleCI directly from your terminal, such as to validate configuration files, run jobs locally, or manage runners. +This section explains how to install the CircleCI Command Line Interface (CLI) on a SUSE Linux (Arm64) virtual machine running on Google Cloud C4A (Axion). The CLI allows you to interact with CircleCI directly from your terminal, to validate configuration files, run jobs locally, or manage runners. 
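For example, once the CLI is installed you can validate a pipeline configuration locally before pushing it to GitHub. The command below is a minimal illustration; it assumes you run it from a repository that already contains a `.circleci/config.yml` file:

```console
circleci config validate .circleci/config.yml
```

If the file is well formed, the CLI reports that the configuration is valid; otherwise it prints the schema errors to fix before committing.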
### Install Required Packages + +Before installing the CLI, make sure your SUSE environment has the necessary repositories and development tools. Add the openSUSE Leap repository: + +```bash +sudo zypper addrepo https://download.opensuse.org/distribution/leap/15.5/repo/oss/ openSUSE-Leap-15.5-OSS +``` + +Refresh package repositories: +```bash +sudo zypper refresh +``` +This updates the local metadata so that zypper recognizes the latest available packages and dependencies. + +# Install git Before installing the CircleCI CLI, make sure your system has the basic tools required for downloading and extracting files. ```console -sudo zypper install curl tar gzip coreutils gpg git +sudo zypper install -y curl tar gzip coreutils gpg git-core ``` +Once Git and the required tools are installed, you’re ready to download and configure the CircleCI CLI binary for Arm64. ## Download and Extract the CircleCI CLI -Now download the CircleCI CLI binary for Linux Arm64 and extract it. +Download the CircleCI CLI binary for Linux Arm64 and extract it. ```console curl -fLSs https://github.com/CircleCI-Public/circleci-cli/releases/download/v0.1.33494/circleci-cli_0.1.33494_linux_arm64.tar.gz | tar xz sudo mv circleci-cli_0.1.33494_linux_arm64/circleci /usr/local/bin/ ``` +Explanation of the commands: + - The `curl` command downloads the `.tar.gz` archive from the official CircleCI GitHub release page. - The `| tar xz` part extracts the downloaded file directly without saving it separately. -- After extraction, you’ll see a new folder named `circleci-cli_0.1.33494_linux_arm64` in your current directory. + +After extraction, you’ll see a new folder named `circleci-cli_0.1.33494_linux_arm64` in your current directory. ### Verify the Installation -Finally, verify that the CLI is installed correctly by checking its version. +Check that the CircleCI CLI is installed and executable: ```console circleci version ``` -You should see an output similar to: +You should see output similar to: ```output 0.1.33494+7cc6570 (release) ``` -If you see similar version output, the installation was successful! +The CircleCI CLI is now installed and running natively on your SUSE Arm64 VM (Google Cloud C4A). diff --git a/content/learning-paths/servers-and-cloud-computing/circleci-gcp/install_circleci_runner.md b/content/learning-paths/servers-and-cloud-computing/circleci-gcp/install_circleci_runner.md index dbacd91960..17d6966387 100644 --- a/content/learning-paths/servers-and-cloud-computing/circleci-gcp/install_circleci_runner.md +++ b/content/learning-paths/servers-and-cloud-computing/circleci-gcp/install_circleci_runner.md @@ -8,59 +8,66 @@ layout: learningpathall ## Install CircleCI Machine Runner on SUSE Arm64 -This guide explains how to install and configure the **CircleCI Machine Runner** on a **GCP SUSE Arm64 virtual machine**. -This setup allows your self-hosted environment to execute CircleCI jobs targeting Arm64 architecture. +This section explains how to install and configure the CircleCI Machine Runner on a SUSE Linux Arm64 virtual machine running on Google Cloud C4A (Axion). By installing this runner, you enable your own VM to execute CircleCI Arm-native jobs. 
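Before you begin, you can confirm that the VM reports the expected Arm64 architecture. This is a quick sanity check and does not change anything on the system:

```console
uname -m
```

The output should be `aarch64`, confirming that the runner you register will execute jobs natively on Arm.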
### Add CircleCI Package Repository -SUSE is an RPM-based distribution, so first add the official CircleCI repository: +Because SUSE is an RPM-based Linux distribution, you first need to add the official CircleCI package repository from PackageCloud: ```console curl -s https://packagecloud.io/install/repositories/circleci/runner/script.rpm.sh?any=true | sudo bash ``` +This command automatically detects your distribution and adds the appropriate repository configuration for SUSE-based systems. ### Install the CircleCI Runner -Install the pre-built CircleCI runner package: +Before installation, create a symbolic link for `adduser`. The CircleCI runner installation script is primarily built for Debian/Ubuntu systems, which use the `adduser` command. SUSE uses `useradd` instead. + +```bash +sudo ln -s /usr/sbin/useradd /usr/sbin/adduser +``` + +Now install the CircleCI runner package: ```console sudo zypper install -y circleci-runner ``` ### Prepare User and Permissions -Before starting the runner, ensure the required user, group, and directory permissions are properly set up: - -```console -# Create a symlink for adduser (required on SUSE) -sudo ln -s /usr/sbin/useradd /usr/sbin/adduser +Before starting the CircleCI runner, ensure the correct user, group, and directory permissions are in place. These steps ensure the runner operates securely and has proper access to its configuration and work directories. -# Create CircleCI system user and group +Create CircleCI system user and group: +```bash sudo useradd -m -r circleci sudo groupadd --system circleci - -# Set up CircleCI directories and permissions +``` +Set up CircleCI directories and permissions: +```bash sudo mkdir -p /var/lib/circleci sudo chown -R circleci:circleci /var/lib/circleci sudo chown -R circleci:circleci /etc/circleci-runner - -# Reload systemd and restart the runner service +``` +Reload systemd and restart the runner service: +```bash sudo systemctl daemon-reload sudo systemctl restart circleci-runner - -# Verify service status +``` +Verify service status: +```bash sudo systemctl status circleci-runner ``` ### Configure the Runner Token -Replace the authentication token in the runner configuration file. -Use the token obtained from your Resource Class in the CircleCI Dashboard. +Now, configure the authentication token that connects your runner to CircleCI. +Use the token generated earlier from your Resource Class in the CircleCI dashboard. ```console export RUNNER_AUTH_TOKEN="AUTH_TOKEN " sudo sed -i "s/<< AUTH_TOKEN >>/$RUNNER_AUTH_TOKEN/g" /etc/circleci-runner/circleci-runner-config.yaml ``` +Replace AUTH_TOKEN with the actual token copied from the CircleCI dashboard. ### Enable and Start the Runner -Enable the CircleCI runner service to start automatically and verify it’s running: +Enable the CircleCI service to start automatically at boot, then start and verify the runner: ```console sudo systemctl enable circleci-runner @@ -85,6 +92,9 @@ Oct 09 11:15:03 lpprojectsusearm64 circleci-runner[10150]: 11:15:03 6f109 46.059 Oct 09 11:15:03 lpprojectsusearm64 circleci-runner[10150]: 11:15:03 6f109 46.119ms claim app.loop_name=claim: mode=agent res> Oct 09 11:15:03 lpprojectsusearm64 circleci-runner[10150]: 11:15:03 6f109 46.144ms worker loop: claim: app.backoff_ms=5000 a> ``` -Also, you can verify it from the dashboard: +You can also confirm that your runner is connected and active by visiting the Self-Hosted Runners page in the CircleCI web dashboard. 
![Self-Hosted Runners alt-text#center](images/dashboard.png "Figure 1: Self-Hosted Runners ") + +Your CircleCI Machine Runner is now installed, configured, and registered on your SUSE Arm64 VM (Google Cloud C4A). +You can now define jobs in your `.circleci/config.yml` that target your Arm-native Resource Class and begin running builds directly on this runner. diff --git a/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/_index.md b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/_index.md new file mode 100644 index 0000000000..34091b4fec --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/_index.md @@ -0,0 +1,62 @@ +--- +title: Deploy Apache Flink on Google Cloud C4A (Arm-based Axion VMs) + +draft: true +cascade: + draft: true + +minutes_to_complete: 30 + +who_is_this_for: This learning path is intended for software developers deploying and optimizing Apache Flink workloads on Linux/Arm64 environments, specifically using Google Cloud C4A virtual machines powered by Axion processors. + +learning_objectives: + - Provision an Arm-based SUSE SLES virtual machine on Google Cloud (C4A with Axion processors) + - Install Apache Flink on a SUSE Arm64 (C4A) instance + - Validate Flink functionality by starting the Flink cluster and running a simple baseline job (e.g., WordCount) on the Arm64 VM + - Benchmark Flink performance using internal JMH-based micro-benchmarks on Arm64 (Aarch64) architecture + +prerequisites: + - A [Google Cloud Platform (GCP)](https://cloud.google.com/free) account with billing enabled + - Basic familiarity with [Apache Flink](https://flink.apache.org/) and its runtime environment + +author: Pareena Verma + +##### Tags +skilllevels: Introductory +subjects: Databases +cloud_service_providers: Google Cloud + +armips: + - Neoverse + +tools_software_languages: + - Flink + - Java + - Maven + +operatingsystems: + - Linux + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +further_reading: + - resource: + title: Google Cloud documentation + link: https://cloud.google.com/docs + type: documentation + + - resource: + title: Flink documentation + link: https://nightlies.apache.org/flink/flink-docs-lts/ + type: documentation + + - resource: + title: Flink Performance Tool + link: https://github.com/apache/flink-benchmarks/tree/master?tab=readme-ov-file#flink-benchmarks + type: documentation + +weight: 1 +layout: "learningpathall" +learning_path_main_page: "yes" +--- diff --git a/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/background.md b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/background.md new file mode 100644 index 0000000000..35b8cd2c17 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/background.md @@ -0,0 +1,25 @@ +--- +title: Getting started with Apache Flink on Google Axion C4A (Arm Neoverse-V2) + +weight: 2 + +layout: "learningpathall" +--- + +## Google Axion C4A Arm instances in Google Cloud + +Google Axion C4A is a family of Arm-based virtual machines built on Google’s custom Axion CPU, which is based on Arm Neoverse-V2 cores. Designed for high-performance and energy-efficient computing, these virtual machines offer strong performance for modern cloud workloads such as CI/CD pipelines, microservices, media processing, and general-purpose applications. + +The C4A series provides a cost-effective alternative to x86 virtual machines while leveraging the scalability and performance benefits of the Arm architecture in Google Cloud. + +To learn more about Google Axion, refer to the [Introducing Google Axion Processors, our new Arm-based CPUs](https://cloud.google.com/blog/products/compute/introducing-googles-new-arm-based-cpu) blog. + +## Apache Flink + +[Apache Flink](https://flink.apache.org/) is an open-source, distributed **stream and batch data processing framework** developed under the [Apache Software Foundation](https://www.apache.org/). + +Flink is designed for **high-performance, low-latency, and stateful computations** on both unbounded (streaming) and bounded (batch) data. It provides a robust runtime and APIs in **Java**, **Scala**, and **Python** for building scalable, fault-tolerant data processing pipelines. + +Flink is widely used for **real-time analytics**, **event-driven applications**, **data pipelines**, and **machine learning workloads**. It integrates seamlessly with popular systems such as **Apache Kafka**, **Hadoop**, and various **cloud storage services**. + +To learn more, visit the [Apache Flink official website](https://flink.apache.org/) and explore the [documentation](https://nightlies.apache.org/flink/flink-docs-release-2.1/). diff --git a/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/baseline.md b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/baseline.md new file mode 100644 index 0000000000..1555729c3f --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/baseline.md @@ -0,0 +1,95 @@ +--- +title: Apache Flink Baseline Testing on Google Axion C4A Arm Virtual Machine +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Apache Flink Baseline Testing on GCP SUSE VM +This guide explains how to perform **baseline testing** for Apache Flink after installation on a **GCP SUSE VM**. Baseline testing ensures that the Flink cluster is operational, the environment is correctly configured, and basic jobs run successfully. + +### Download and Extract Maven +Before running Flink jobs, ensure that **Java** and **Maven** are installed on your VM. 
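Java was installed as part of the Flink installation in the previous section. You can confirm it is available before continuing; the exact patch version in the output will vary:

```console
java -version
```

The output should report an OpenJDK 17 runtime. With Java confirmed, continue with the Maven installation below.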
+Download Maven and extract it: + +```console +cd /opt +sudo wget https://archive.apache.org/dist/maven/maven-3/3.8.6/binaries/apache-maven-3.8.6-bin.tar.gz +sudo tar -xvzf apache-maven-3.8.6-bin.tar.gz +sudo mv apache-maven-3.8.6 /opt/maven +``` + +### Set Environment Variables +Configure the environment so Maven commands are recognized system-wide: + +```console +echo "export M2_HOME=/opt/maven" >> ~/.bashrc +echo "export PATH=\$M2_HOME/bin:\$PATH" >> ~/.bashrc +source ~/.bashrc +``` +Verify the Maven installation: + +```console +mvn -version +``` +At this point, both Java and Maven are installed and ready to use. + +### Start the Flink Cluster +Before proceeding to start the Flink cluster, you need to allow port 8081 from your GCP console. + +Start the Flink cluster using the provided startup script: + +```console +cd $FLINK_HOME +./bin/start-cluster.sh +``` + +You should see output similar to: +```output +Starting cluster. +[INFO] 1 instance(s) of standalonesession are already running on lpprojectsusearm64. +Starting standalonesession daemon on host lpprojectsusearm64. +Starting taskexecutor daemon on host lpprojectsusearm64. +``` + +Verify that the JobManager and TaskManager processes are running: + +```console +jps +``` + +You should see output similar to: +```output +21723 StandaloneSessionClusterEntrypoint +2621 Jps +2559 TaskManagerRunner +``` + +### Access the Flink Web UI + +Open the Flink Web UI in a browser: + +```console +http://:8081 +``` + +- A successfully loaded dashboard confirms the cluster network and UI functionality. +-This serves as the baseline for network and UI validation. + +![Flink Dashboard alt-text#center](images/flink-dashboard.png "Figure 1: Flink Dashboard") + +### Run a Simple Example Job +Execute a sample streaming job to verify that Flink can run tasks correctly: + +```console +cd $FLINK_HOME +./bin/flink run examples/streaming/WordCount.jar +``` + +- Monitor the job in the Web UI or check console logs. +- Confirm that the job completes successfully. + +![Flink Dashboard alt-text#center](images/wordcount.png "Figure 2: Word Count Job") + +Flink baseline testing has been completed. You can now proceed to Flink benchmarking. diff --git a/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/benchmarking.md b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/benchmarking.md new file mode 100644 index 0000000000..30b950740d --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/benchmarking.md @@ -0,0 +1,107 @@ +--- +title: Apache Flink Benchmarking +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + + +## Apache Flink Benchmarking +This guide provides step-by-step instructions to set up and run **Apache Flink Benchmarks** on a **GCP SUSE VMs**. It covers cloning the repository, building the benchmarks, exploring the JAR, and listing available benchmarks. + +### Clone the Repository +Start by cloning the official Flink benchmarks repository. This repository contains all the benchmark definitions and example jobs. + +```console +cd ~ +git clone https://github.com/apache/flink-benchmarks.git +cd flink-benchmarks +``` + +### Build the Benchmarks with Maven +Use Maven to compile the benchmarks and generate the benchmark JAR. Skip tests to save time. + +```console +mvn clean package -DskipTests +``` +- **mvn clean package** → Cleans previous builds and packages the project. + +After this step, the target directory will contain the compiled **benchmarks.jar**. 
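The full build can take several minutes on a 4 vCPU instance. If you want to speed it up, you can let Maven build modules in parallel; `-T 1C` (one thread per core) is a standard Maven option and is not specific to flink-benchmarks:

```console
mvn clean package -DskipTests -T 1C
```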
+ +### Explore the JAR Contents +Verify the generated files inside the `target` directory: + +```console +cd target +ls +``` +You should see an output similar to: + +```output +benchmark-0.1.jar classes generated-test-sources maven-status protoc-plugins +benchmarks.jar generated-sources maven-archiver protoc-dependencies test-classes +``` +- **benchmarks.jar**→ The main benchmark JAR file used to run Flink benchmarks. + +### List Available Benchmarks +To view all the benchmarks included in the JAR: + +```console +java -jar benchmarks.jar -l +``` +- `-l` → Lists all benchmarks packaged in the JAR. +- This helps you identify which benchmarks you want to execute on your VM. + +### Run Selected Benchmarks +While the Flink benchmarking project includes multiple suites for state backends, windowing, checkpointing, and scheduler performance, this Learning path focuses on the Remote Channel Throughput benchmark to evaluate network and I/O performance. + +**Remote Channel Throughput**: Measures the data transfer rate between remote channels in Flink, helping to evaluate network and I/O performance. +```console +java -jar benchmarks.jar org.apache.flink.benchmark.RemoteChannelThroughputBenchmark.remoteRebalance +``` +You should see an output similar to: +```output + +Result "org.apache.flink.benchmark.RemoteChannelThroughputBenchmark.remoteRebalance": + 10536.511 ±(99.9%) 60.121 ops/ms [Average] + (min, avg, max) = (10289.593, 10536.511, 10687.736), stdev = 89.987 + CI (99.9%): [10476.390, 10596.633] (assumes normal distribution) + +# Run complete. Total time: 00:25:14 +Benchmark (mode) Mode Cnt Score Error Units +RemoteChannelThroughputBenchmark.remoteRebalance ALIGNED thrpt 30 17445.341 ± 153.256 ops/ms +RemoteChannelThroughputBenchmark.remoteRebalance DEBLOAT thrpt 30 10536.511 ± 60.121 ops/ms +``` + +### Flink Benchmark Metrics Explained + +- **Run Count**: Total benchmark iterations executed, higher count improves reliability. +- **Average Throughput**: Mean operations per second across all iterations. +- **Standard Deviation**: Variation from average throughput, smaller means more consistent. +- **Confidence Interval (99.9%)**: Range where the true average throughput lies with 99.9% certainty. +- **Min Throughput**: The lowest throughput was observed, and it shows worst-case performance. +- **Max Throughput**: Highest throughput observed, shows best-case performance. 
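These statistics are produced by JMH, the Java Microbenchmark Harness used by the flink-benchmarks project. If you want to change how many samples are collected, or save the results to a file for later comparison, you can pass standard JMH options on the command line. The values below are only an example; the project defaults are usually sufficient:

```console
java -jar benchmarks.jar org.apache.flink.benchmark.RemoteChannelThroughputBenchmark.remoteRebalance -f 1 -wi 5 -i 10 -rf json -rff results.json
```

Here `-f` sets the number of forks, `-wi` and `-i` set the warmup and measurement iterations, and `-rf`/`-rff` write the results as JSON.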
+ +### Benchmark summary on x86_64 +To compare the benchmark results, the following results were collected by running the same benchmark on a `x86 - c4-standard-4` (4 vCPUs, 15 GB Memory) x86_64 VM in GCP, running SUSE: + +| Benchmark | Mode | Count | Score (ops/ms) | Error (±) | Min | Max | Stdev | CI (99.9%) | Units | +|---------------------------------------------------|---------|-------|----------------|-----------|------------|------------|---------|------------------------|--------| +| RemoteChannelThroughputBenchmark.remoteRebalance | ALIGNED | 30 | 24873.046 | 892.673 | 11195.028 | 12425.761 | 421.057 | [11448.649, 12011.275] | ops/ms | +| RemoteChannelThroughputBenchmark.remoteRebalance | DEBLOAT | 30 | 11729.962 | 281.313 | 11195.028 | 12425.761 | 421.057 | [11448.649, 12011.275] | ops/ms | + +### Benchmark summary on Arm64 +Results from the earlier run on the `c4a-standard-4` (4 vCPU, 16 GB memory) Arm64 VM in GCP (SUSE): + +| Benchmark | Mode | Count | Score (ops/ms) | Error (±) | Min | Max | Stdev | CI (99.9%) | Units | +|---------------------------------------------------|---------|-------|----------------|-----------|-----------|-----------|---------|------------------------|--------| +| RemoteChannelThroughputBenchmark.remoteRebalance | ALIGNED | 30 | 17445.341 | 153.256 | 10289.593 | 10687.736 | 89.987 | [10476.390, 10596.633] | ops/ms | +| RemoteChannelThroughputBenchmark.remoteRebalance | DEBLOAT | 30 | 10536.511 | 60.121 | 10289.593 | 10687.736 | 89.987 | [10476.390, 10596.633] | ops/ms | + +### Apache Flink performance benchmarking comparison on Arm64 and x86_64 + +- The **ALIGNED mode** achieved an average throughput of **17,445 ops/ms**, demonstrating higher performance on the Arm64 VM. +- The **DEBLOAT mode** achieved an average throughput of **10,537 ops/ms**, slightly lower due to optimization differences. +- The benchmark confirms that the **Arm64 architecture** efficiently handles Flink's remote channel throughput workloads. +- Overall, the average throughput across both modes is approximately **14,854 ops/ms**, indicating strong baseline performance for Arm64 deployments. 
diff --git a/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/images/flink-dashboard.png b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/images/flink-dashboard.png new file mode 100644 index 0000000000..390988ae60 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/images/flink-dashboard.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/images/gcp-vm.png b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/images/gcp-vm.png new file mode 100644 index 0000000000..0d1072e20d Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/images/gcp-vm.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/images/wordcount.png b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/images/wordcount.png new file mode 100644 index 0000000000..1f71fb14a6 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/images/wordcount.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/installation.md b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/installation.md new file mode 100644 index 0000000000..d61f86be03 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/installation.md @@ -0,0 +1,87 @@ +--- +title: Install Apache Flink +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Install Apache Flink on GCP VM +This guide walks you through installing **Apache Flink** and its required dependencies on a **Google Cloud Platform (GCP) SUSE Arm64 Virtual Machine (VM)**. By the end of this section, you will have a fully configured Flink environment ready for job execution and benchmarking. + +### Update the System and Install Java +Before installing Flink, ensure your system packages are up to date and Java is installed. + +```console +sudo zypper refresh +sudo zypper update -y +sudo zypper install -y java-17-openjdk java-17-openjdk-devel +``` +This step ensures you have the latest system updates and the Java runtime needed to execute Flink applications. + +### Download Apache Flink Binary +Next, download the pre-built binary package for **Apache Flink** from the official Apache mirror. + +```console +cd /opt +sudo wget https://dlcdn.apache.org/flink/flink-2.1.0/flink-2.1.0-bin-scala_2.12.tgz +``` +This command retrieves the official Flink binary distribution for installation on your VM. + +{{% notice Note %}} +Flink 2.0.0 introduced Disaggregated State Management architecture, which enables more efficient resource utilization in cloud-native environments, ensuring high-performance real-time processing while minimizing resource overhead. +You can view [this release note](https://flink.apache.org/2025/03/24/apache-flink-2.0.0-a-new-era-of-real-time-data-processing/) + +The [Arm Ecosystem Dashboard](https://developer.arm.com/ecosystem-dashboard/) recommends Flink version 2.0.0, the minimum recommended on the Arm platforms. +{{% /notice %}} + +### Extract the Downloaded Archive +Extract the downloaded `.tgz` archive to make the Flink files accessible for configuration. + +```console +sudo tar -xvzf flink-2.1.0-bin-scala_2.12.tgz +``` +After extraction, you will have a directory named `flink-2.1.0` under `/opt`. + +**Rename the extracted directory for convenience:** +For easier access and management, rename the extracted Flink directory to a simple name like `/opt/flink`. 
+ +```console +sudo mv flink-2.1.0 /opt/flink +``` +This makes future references to your Flink installation path simpler and more consistent. + +### Configure Environment Variables +Set the environment variables so the Flink commands are recognized system-wide. This ensures you can run `flink` from any terminal session. + +```console +echo "export FLINK_HOME=/opt/flink" >> ~/.bashrc +echo "export PATH=\$FLINK_HOME/bin:\$PATH" >> ~/.bashrc +``` + +Additionally, create a dedicated log directory for Flink and assign proper permissions: +```console +sudo mkdir -p /opt/flink/log +sudo chown -R $(whoami):$(id -gn) /opt/flink/log +sudo chmod -R 755 /opt/flink/log +``` + +**Apply the changes:** + +```console +source ~/.bashrc +``` + +### Verify the Installation +To confirm that Flink has been installed correctly, check its version: + +```console +flink -v +``` + +You should see an output similar to: + +```output +Version: 2.1.0, Commit ID: 4cb6bd3 +``` +This confirms that Apache Flink has been installed and is ready for use. diff --git a/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/instance.md b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/instance.md new file mode 100644 index 0000000000..2b93bc950d --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/flink-on-gcp/instance.md @@ -0,0 +1,31 @@ +--- +title: Create a Google Axion C4A Arm virtual machine on GCP +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this section, you will learn how to provision a Google Axion C4A Arm virtual machine on Google Cloud Platform (GCP) using the `c4a-standard-4` (4 vCPUs, 16 GB memory) machine type in the Google Cloud Console. + +{{% notice Note %}} +For support on GCP setup, see the Learning Path [Getting started with Google Cloud Platform](https://learn.arm.com/learning-paths/servers-and-cloud-computing/csp/google/). +{{% /notice %}} + +## Provision a Google Axion C4A Arm VM in Google Cloud Console + +To create a virtual machine based on the C4A instance type: +- Navigate to the [Google Cloud Console](https://console.cloud.google.com/). +- Go to **Compute Engine > VM Instances** and select **Create Instance**. +- Under **Machine configuration**: + - Populate fields such as **Instance name**, **Region**, and **Zone**. + - Set **Series** to `C4A`. + - Select `c4a-standard-4` for machine type. + + ![Create a Google Axion C4A Arm virtual machine in the Google Cloud Console with c4a-standard-4 selected alt-text#center](images/gcp-vm.png "Creating a Google Axion C4A Arm virtual machine in Google Cloud Console") + +- Under **OS and Storage**, select **Change**, then choose an Arm64-based OS image. For this Learning Path, use **SUSE Linux Enterprise Server**. Pick the preferred version for your Operating System. Ensure you select the **Arm image** variant. Click **Select**. +- Under **Networking**, enable **Allow HTTP traffic**. +- Click **Create** to launch the instance. 
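If you prefer the command line, you can create an equivalent instance with `gcloud`. The instance name, zone, image family, and disk type below are placeholders, so verify the current SUSE Arm64 image family and adjust the flags for your project before running the commands:

```console
# List SUSE images and pick an Arm64 family (the family name below is an assumption)
gcloud compute images list --project=suse-cloud --filter="architecture=ARM64"

# Create the C4A VM; adjust zone, image family, and boot disk type as needed
gcloud compute instances create flink-c4a-vm \
  --zone=us-central1-a \
  --machine-type=c4a-standard-4 \
  --image-project=suse-cloud \
  --image-family=sles-15-arm64 \
  --boot-disk-type=hyperdisk-balanced \
  --tags=http-server
```

The `--tags=http-server` flag corresponds to the **Allow HTTP traffic** option in the console when the default network firewall rules are in place.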
diff --git a/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/_index.md b/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/_index.md new file mode 100644 index 0000000000..1b1e96f4bb --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/_index.md @@ -0,0 +1,60 @@ +--- +title: From x86 to Arm on GKE - Build, Deploy, and Migrate with Google Axion + +draft: true +cascade: + draft: true + +minutes_to_complete: 90 + +who_is_this_for: This is an advanced topic for cloud, platform, and site reliability engineers operating Kubernetes on Google Cloud who need a prescriptive path to build multi-architecture images and migrate services from x86 to Arm using Google Axion processors. + +learning_objectives: + - Prepare Dockerfiles for multi-architecture builds by adding arm64 support + - Create a dual-architecture GKE standard cluster with two node pools, amd64 and arm64 + - Build and publish multi-architecture images to Artifact Registry using Docker Buildx without using QEMU to emulate Arm instructions + - Deploy a Kubernetes application amd64 first, then migrate to arm64 using Kustomize overlays and progressive rollout + - Optionally automate builds and rollouts with Cloud Build and Skaffold + +prerequisites: + - A [Google Cloud account](https://console.cloud.google.com/) with billing enabled + - A local Linux or macOS computer or Cloud Shell access with Docker, Kubernetes CLI (kubectl), Google Cloud CLI (gcloud), and Git installed + - Basic familiarity with Docker, Kubernetes, and gcloud + +author: + - Rani Chowdary Mandepudi + +### Tags +skilllevels: Advanced +subjects: Containers and Virtualization +armips: + - Neoverse +operatingsystems: + - Linux +tools_software_languages: + - Kubernetes + - GKE + - Skaffold + - Cloud Build + + +further_reading: + - resource: + title: GKE documentation + link: https://cloud.google.com/kubernetes-engine/docs + type: documentation + - resource: + title: Create Arm-based clusters and node pools + link: https://cloud.google.com/kubernetes-engine/docs/how-to/create-arm-clusters-nodes + type: documentation + + + + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/cloud-build.md b/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/cloud-build.md new file mode 100644 index 0000000000..711646d610 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/cloud-build.md @@ -0,0 +1,265 @@ +--- +title: Automate builds and rollout with Cloud Build and Skaffold +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +Google [**Cloud Build**](https://cloud.google.com/build/docs/set-up) is a managed CI/CD service that runs your containerized build and deploy steps in isolated runners. + +In this section, you'll automate the flow you performed manually: build multi-arch images, deploy to GKE on amd64, then migrate to arm64, and print the app's external IP. + +## What does this pipeline do? + +The pipeline performs the following steps: + +- Authenticates Docker to your Artifact Registry +- Builds and pushes amd64 and arm64 images with Docker Buildx, with QEMU enabled in the runner +- Connects to your GKE cluster +- Applies the amd64 Kustomize overlay, verifies pods, then applies the arm64 overlay and verifies pods again +- Prints the frontend-external LoadBalancer IP at the end + +{{% notice Tip %}} +Run this from the microservices-demo repo root in Cloud Shell. Ensure you completed the previous steps. +{{% /notice %}} + +## Grant IAM permission to the Cloud Build service account + +Cloud Build runs as a per-project service account: `@cloudbuild.gserviceaccount.com`. Grant it the minimal roles needed to build, push, log, and interact with GKE. + +Grant the required roles: + +```bash +# Uses env vars set earlier: PROJECT_ID, REGION, CLUSTER_NAME, GAR +PROJECT_NUMBER="$(gcloud projects describe "${PROJECT_ID}" --format='value(projectNumber)')" +CLOUD_BUILD_SA="${PROJECT_NUMBER}@cloudbuild.gserviceaccount.com" + +gcloud projects add-iam-policy-binding "${PROJECT_ID}" --member="serviceAccount:${CLOUD_BUILD_SA}" --role="roles/cloudbuild.builds.builder" --condition=None --quiet + +gcloud projects add-iam-policy-binding "${PROJECT_ID}" --member="serviceAccount:${CLOUD_BUILD_SA}" --role="roles/container.developer" --condition=None --quiet + +gcloud projects add-iam-policy-binding "${PROJECT_ID}" --member="serviceAccount:${CLOUD_BUILD_SA}" --role="roles/artifactregistry.writer" --condition=None --quiet + +gcloud projects add-iam-policy-binding "${PROJECT_ID}" --member="serviceAccount:${CLOUD_BUILD_SA}" --role="roles/logging.logWriter" --condition=None --quiet +``` + +## Update the Skaffold configuration + +Create a `skaffold.yaml` file for Cloud Build. This lets Cloud Build handle image builds and uses Skaffold only to apply the Kustomize overlays. + +Create the configuration: + +```yaml +# From the repo root (microservices-demo) +[ -f skaffold.yaml ] && cp skaffold.yaml "skaffold.yaml.bak.$(date +%s)" +cat > skaffold.yaml <<'YAML' + +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
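# The "app" config below renders kustomize/base by default. The deploy-amd and
# migrate-arm profiles patch the Kustomize path to the amd64 or arm64 overlay,
# and the "loadgenerator" config applies its raw manifest after "app" deploys.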
+ +apiVersion: skaffold/v3 +kind: Config +metadata: + name: app +manifests: + kustomize: + paths: + - kustomize/base +deploy: + kubectl: {} +profiles: +- name: deploy-amd + patches: + - op: replace + path: /manifests/kustomize/paths/0 + value: kustomize/overlays/amd64 +- name: migrate-arm + patches: + - op: replace + path: /manifests/kustomize/paths/0 + value: kustomize/overlays/arm64 +--- +apiVersion: skaffold/v3 +kind: Config +metadata: + name: loadgenerator +requires: +- configs: [app] +manifests: + rawYaml: + - ./kubernetes-manifests/loadgenerator.yaml +deploy: + kubectl: {} +YAML + +``` + +## Create a YAML file for Cloud Build + +This pipeline installs Docker with Buildx in the runner, enables QEMU, builds two services as examples (extend as desired), connects to your cluster, deploys to amd64, verifies, migrates to arm64, verifies, and prints the external IP.  + +Run the commands to create the `cloudbuild.yaml` file. + +```yaml +cat > cloudbuild.yaml <<'YAML' + +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# [START cloudbuild_microservice_demo_cloudbuild] + +# This configuration file is used to build and deploy the app into a +# GKE cluster using Google Cloud Build. +# +# PREREQUISITES: +# - Cloud Build service account must have role: "Kubernetes Engine Developer" + +# USAGE: +# GCP zone and GKE target cluster must be specified as substitutions +# Example invocation: +# `gcloud builds submit --config=cloudbuild.yaml --substitutions=_ZONE=us-central1-b,_CLUSTER=demo-app-staging .` + +substitutions: + _REGION: ${REGION} + _CLUSTER: ${CLUSTER_NAME} + _REPO: ${GAR} + +options: + machineType: "N1_HIGHCPU_8" + logging: CLOUD_LOGGING_ONLY +timeout: "7200s" + +steps: + # 1) Authenticate Docker to Artifact Registry + - name: gcr.io/google.com/cloudsdktool/cloud-sdk + entrypoint: bash + args: + - -ceu + - | + echo "Auth to GAR..." 
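        # _REPO has the form REGION-docker.pkg.dev/PROJECT/REPO; the awk call
        # below keeps only the registry host so Docker authenticates against it.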
+ gcloud auth configure-docker "$(echo "${_REPO}" | awk -F/ '{print $1}')" --quiet + + # 2) Build and push multi-arch images (examples: adservice, cartservice) + - name: gcr.io/google.com/cloudsdktool/google-cloud-cli:stable + entrypoint: bash + env: + - DOCKER_BUILDKIT=1 + - CLOUDSDK_CORE_DISABLE_PROMPTS=1 + args: + - -ceu + - | + apt-get update && apt-get install -y docker.io curl + mkdir -p ~/.docker/cli-plugins/ + curl -sSL https://github.com/docker/buildx/releases/download/v0.14.0/buildx-v0.14.0.linux-amd64 \ + -o ~/.docker/cli-plugins/docker-buildx + chmod +x ~/.docker/cli-plugins/docker-buildx + + # Start Docker daemon in the runner + dockerd > /var/log/dockerd.log 2>&1 & + timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done' + + # Enable QEMU for cross-arch builds and create builder + docker run --privileged --rm tonistiigi/binfmt --install all + docker buildx create --name multi --use || true + docker buildx inspect --bootstrap + + # Build and push multi-arch images + docker buildx build --platform linux/amd64,linux/arm64 \ + -t "${_REPO}/adservice:v1" \ + src/adservice --push + + docker buildx build --platform linux/amd64,linux/arm64 \ + -t "${_REPO}/cartservice:v1" \ + src/cartservice/src --push + + # 3) Connect kubectl to the target cluster + - name: gcr.io/google.com/cloudsdktool/cloud-sdk:slim + entrypoint: bash + args: + - -ceu + - | + gcloud container clusters get-credentials "${_CLUSTER}" --region "${_REGION}" + + # 4) Deploy to amd64 node pool + - name: gcr.io/k8s-skaffold/skaffold:v2.16.1 + id: deploy-amd + entrypoint: bash + args: + - -ceu + - | + skaffold deploy --filename=skaffold.yaml --config loadgenerator -p deploy-amd + + # 5) Verify pods on amd64 + - name: gcr.io/google.com/cloudsdktool/cloud-sdk:latest + entrypoint: bash + args: + - -ceu + - | + echo "Pods on amd64:" + kubectl get pods -o wide + + # 6) Migrate to arm64 node pool + - name: gcr.io/k8s-skaffold/skaffold:v2.16.1 + id: migrate-arm + entrypoint: bash + args: + - -ceu + - | + skaffold deploy --filename=skaffold.yaml --config loadgenerator -p migrate-arm + + # 7) Verify pods on arm64 and print the external IP + - name: gcr.io/google.com/cloudsdktool/cloud-sdk:latest + entrypoint: bash + args: + - -ceu + - | + echo "Pods on arm64:" + kubectl get pods -o wide + echo "Fetching external IP for the frontend service..." + IP=$(kubectl get svc frontend-external -o=jsonpath='{.status.loadBalancer.ingress[0].ip}') + echo "Open http://$${IP} in your browser." +YAML +``` + +{{% notice Note %}} +In production, add one build step per microservice (or a loop) and enable caching. The example above builds two images for brevity, mirroring the manual steps you completed earlier.  +{{% /notice %}} + +## Run the pipeline + +Submit the build from the root of the repository: + +```bash +gcloud builds submit --config=cloudbuild.yaml --substitutions=_CLUSTER="${CLUSTER_NAME}",_REGION="${REGION}",_REPO="${GAR}" +``` + +The final step prints in the build description: + +``` +Open http:// in your browser. +``` + +Open the URL to load the storefront and confirm the full build, deploy, and migrate flow is automated. 
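If you want to follow the build from the terminal instead of the web console, you can list recent builds and fetch the logs for a specific one. The build ID below is a placeholder; copy the real ID from the list output:

```bash
gcloud builds list --limit=3
gcloud builds log <BUILD_ID>
```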
diff --git a/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/gke-build-push.md b/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/gke-build-push.md new file mode 100644 index 0000000000..f9ba2e79f6 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/gke-build-push.md @@ -0,0 +1,166 @@ +--- +title: Provision a dual-architecture GKE cluster and publish images +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +You are ready to create a GKE cluster with two node pools (amd64 and arm64), then build and push multi-arch images natively on those node pools. + +Each architecture uses its own BuildKit pod, and no QEMU emulation is required. + +## Networking configuration + +GKE uses VPC-native (IP aliasing) and requires two secondary ranges on the chosen subnet: one for Pods and one for Services. + +For the default VPC, GKE creates the secondary ranges automatically. + +Run the commands below in your terminal, adjusting the environment variables as needed for your account: + +```bash +# Set/confirm network variables (adjust to your environment) +export REGION="${REGION:-us-central1}" +export NETWORK="dev-eco-nw-pb" # your VPC +export SUBNET="dev-eco-nw-subnet" # your subnet +export POD_RANGE_NAME="gke-boutique-pods" +export SVC_RANGE_NAME="gke-boutique-svcs" + +# Inspect the subnet and existing ranges +gcloud compute networks subnets list --network "${NETWORK}" --regions "${REGION}" \ + --format="table(name,region,ipCidrRange,secondaryIpRanges.list())" + +# If missing, add two secondary ranges (example CIDRs; ensure no overlap) +gcloud compute networks subnets update "${SUBNET}" --region "${REGION}" --add-secondary-ranges ${POD_RANGE_NAME}=10.8.0.0/14,${SVC_RANGE_NAME}=10.4.0.0/20 +``` + +This approach prevents users on the default VPC from accidentally setting NETWORK/SUBNET variables and passing incorrect flags later. + +## Create the GKE cluster + +Create a GKE Standard cluster with VPC-native (IP aliasing) enabled and no default node pool. You'll add amd64 and arm64 pools in the next step. + +The command below works for both default and custom VPCs. If the NETWORK, SUBNET, and secondary range variables are unset, GKE uses the default VPC and manages ranges automatically. + +```bash +# Cluster vars (reuses earlier PROJECT_ID/REGION/ZONE) +export CLUSTER_NAME="${CLUSTER_NAME:-gke-multi-arch-cluster}" + +# If using the default VPC, you can omit --network/--subnetwork. +# If using a custom VPC, include them and pass the secondary range names you set above. +gcloud container clusters create "${CLUSTER_NAME}" --region "${REGION}" --enable-ip-alias --num-nodes "1" --machine-type "e2-standard-2" ${NETWORK:+--network "${NETWORK}"} ${SUBNET:+--subnetwork "${SUBNET}"} ${POD_RANGE_NAME:+--cluster-secondary-range-name "${POD_RANGE_NAME}"} ${SVC_RANGE_NAME:+--services-secondary-range-name "${SVC_RANGE_NAME}"} +``` + +Now create an x86 (amd64) pool and an Arm (arm64) pool. Use machine types available in your region. 
The commands below use `c4-standard-*` for x86 and `c4a-standard-*` for Axion: + +```bash +# amd64 pool (x86) +gcloud container node-pools create amd64-pool --cluster="${CLUSTER_NAME}" --region="${REGION}" --machine-type="c4-standard-16" --num-nodes="1" --image-type="COS_CONTAINERD" --quiet + +# arm64 pool (Axion) +gcloud container node-pools create arm64-pool --cluster="${CLUSTER_NAME}" --region="${REGION}" --machine-type="c4a-standard-16" --num-nodes="1" --image-type="COS_CONTAINERD" --quiet + +# delete the tiny default pool +gcloud container node-pools delete default-pool --cluster="${CLUSTER_NAME}" --region="${REGION}" --quiet +``` + +Connect kubectl and confirm node architectures: + +```bash +gcloud container clusters get-credentials "${CLUSTER_NAME}" --region "${REGION}" +kubectl config current-context +kubectl get nodes -o wide +kubectl get nodes -L kubernetes.io/arch +``` + +You should see nodes for both architectures. In zonal clusters (or when a pool has `--num-nodes=1` in a single zone), expect one amd64 and one arm64 node. In regional clusters, `--num-nodes` is per zone, so with three zones you'll see three amd64 and three arm64 nodes. + +## Create the Buildx builder on GKE + +Now run a BuildKit pod on an amd64 node and another on an arm64 node. Buildx routes each platform's build to the matching pod. These are native builds with no QEMU emulation. + +```bash +# Namespace for BuildKit pods +kubectl create ns buildkit --dry-run=client -o yaml | kubectl apply -f - + +# Create the builder (amd64 node) +docker buildx create --driver kubernetes --name gke-native --use --driver-opt namespace=buildkit,replicas=1,loadbalance=sticky,nodeselector=kubernetes.io/arch=amd64 --platform linux/amd64 + +# Append an arm64 node to the same builder +docker buildx create --driver kubernetes --append --name gke-native --driver-opt namespace=buildkit,replicas=1,loadbalance=sticky,nodeselector=kubernetes.io/arch=arm64 --platform linux/arm64 + +# Bootstrap and verify pods +docker buildx inspect gke-native --bootstrap +kubectl -n buildkit get pods -o wide +``` + +You now have a multi-node Buildx builder named `gke-native`. Each BuildKit pod is pinned to a specific CPU architecture using node selectors. + +## Build and push all services + +You can now build all services for `linux/amd64` and `linux/arm64` using the GKE-backed builder. + +Run the commands: + +```bash +cat << 'EOF' > build-all-multiarch.sh +#!/usr/bin/env bash +set -euo pipefail + +: "${GAR:?Set GAR like REGION-docker.pkg.dev/PROJECT/REPO first}" + +services=( + adservice + cartservice # special context below + checkoutservice + currencyservice + emailservice + frontend + paymentservice + productcatalogservice + recommendationservice + shippingservice + loadgenerator +) + +for svc in "${services[@]}"; do + # cartservice Dockerfile path differs + if [ "$svc" = "cartservice" ] && [ -d "src/cartservice/src" ]; then + ctx="src/cartservice/src" + else + ctx="src/${svc}" + fi + + echo ">>> Building ${svc} for amd64+arm64..." + docker buildx build --builder gke-native --platform linux/amd64,linux/arm64 --provenance=false -t "${GAR}/${svc}:v1" "${ctx}" --push +done +EOF + +chmod +x build-all-multiarch.sh +./build-all-multiarch.sh +``` + +Each tag you push is a manifest list that points to two images, one per architecture. 
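As a quick local check, you can also pull one architecture of a tag explicitly and confirm what Docker resolved. This assumes Docker is authenticated to Artifact Registry; if not, run `gcloud auth configure-docker` for your registry region first:

```bash
docker pull --platform=linux/arm64 "${GAR}/adservice:v1"
docker image inspect --format '{{.Os}}/{{.Architecture}}' "${GAR}/adservice:v1"
```

The inspect command should print `linux/arm64`, showing that the manifest list resolved to the Arm image for that platform.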
+ +## Verify manifest lists and per-arch pulls + +List pushed images: + +```bash +gcloud artifacts docker images list "${GAR}" +``` + +Inspect one tag to confirm it shows both platforms: + +```bash +docker buildx imagetools inspect "${GAR}/adservice:v1" +``` + +The output is: + +```output +Platform: linux/amd64 +Platform: linux/arm64 +``` + +You are now ready to prepare the application manifests and deploy the application. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/gke-deploy.md b/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/gke-deploy.md new file mode 100644 index 0000000000..6dc0e4b539 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/gke-deploy.md @@ -0,0 +1,142 @@ +--- +title: Prepare manifests and deploy on GKE +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +You'll now configure the application manifests to use your Artifact Registry images and create Kustomize overlays for different CPU architectures. This allows you to deploy the same application to both x86 and Arm node pools. + +## Prepare deployment manifests + +Replace sample image references with your Artifact Registry path and tag, then create Kustomize overlays to select nodes by architecture. + +### Point base manifests at your images + +Replace the image references with your references: + +```bash +# Replace the sample repo path with your GAR (from earlier: ${GAR}) +find kustomize/base -name "*.yaml" -type f -exec \ + sed -i "s|us-central1-docker.pkg.dev/google-samples/microservices-demo|${GAR}|g" {} + + +# Replace the sample tag with your tag +find kustomize/base -name "*.yaml" -type f -exec \ + sed -i "s|:v0\.10\.3|:v1|g" {} + + +# Verify changes +grep -r "${GAR}" kustomize/base/ || true +``` + +### Create node-selector overlays + +Create node-selector overlays for targeting specific architectures. + +First, create the directories: + +```bash +mkdir -p kustomize/overlays/amd64 kustomize/overlays/arm64 +``` + +Create the amd64 overlay: + +```bash +cat << 'EOF' > kustomize/overlays/amd64/kustomization.yaml +resources: +- ../../base +patches: +- path: node-selector.yaml + target: + kind: Deployment +EOF + +cat << 'EOF' > kustomize/overlays/amd64/node-selector.yaml +- op: add + path: /spec/template/spec/nodeSelector + value: + kubernetes.io/arch: amd64 +EOF +``` + +Create the arm64 overlay: + +```bash +cat << 'EOF' > kustomize/overlays/arm64/kustomization.yaml +resources: +- ../../base +patches: +- path: node-selector.yaml + target: + kind: Deployment +EOF + +cat << 'EOF' > kustomize/overlays/arm64/node-selector.yaml +- op: add + path: /spec/template/spec/nodeSelector + value: + kubernetes.io/arch: arm64 +EOF +``` + +You now have updated manifests that reference your container images and Kustomize overlays that target specific CPU architectures. + +## Deploy to the x86 (amd64) pool + +Render the amd64 Kustomize overlay (adds `nodeSelector: kubernetes.io/arch=amd64`) and apply it to the cluster. 
+ +Run from the repository root after updating base manifests and setting your kube-context to this cluster: + +```bash +kubectl kustomize kustomize/overlays/amd64 | kubectl apply -f - +``` + +Check pod placement and status: + +```bash +kubectl get pods -o wide +# or include the architecture label on the nodes +kubectl get pods -o=custom-columns=NAME:.metadata.name,NODE:.spec.nodeName,STATUS:.status.phase --no-headers +``` + +Pods should be scheduled on nodes labeled `kubernetes.io/arch=amd64`. + +## Migrate to the Arm (arm64) pool + +Apply the arm64 overlay to move workloads: + +```bash +kubectl kustomize kustomize/overlays/arm64 | kubectl apply -f - +``` + +Verify pods have moved to arm64 nodes: + +```bash +kubectl get pods -o wide +``` + +You should see pods now running on nodes where `kubernetes.io/arch=arm64`. + +## Verify external access + +Get the LoadBalancer IP and open the storefront: + +```bash +kubectl get svc frontend-external +``` + +The output is similar to: + +```output +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +frontend-external LoadBalancer 10.12.3.45 34.123.45.67 80:31380/TCP 3m +``` + +Copy the EXTERNAL-IP value and open it in a new browser tab: + +```console +http:// +``` + +The microservices storefront loads, confirming that your application is accessible and functional on the arm64 node pool. + diff --git a/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/multi-arch-images.md b/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/multi-arch-images.md new file mode 100644 index 0000000000..c1e6a3dd0b --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/multi-arch-images.md @@ -0,0 +1,177 @@ +--- +title: Create build-ready Dockerfiles for both architectures +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +With your environment set up, you're ready to modify the Online Boutique services to support multi-architecture builds. + +You will patch some Dockerfiles so they build cleanly for both architectures. In the next section, you will build and push images using a GKE-native Buildx builder. + +## Services to edit + +Most services already build for both architectures. + +The four listed below need small changes: + +- emailservice +- recommendationservice +- loadgenerator +- cartservice + +These edits don't change application behavior, they only ensure the right compiler headers and runtime libraries are present per architecture. This includes Python native wheels for email/recommendation/loadgen, and system `protoc` for the .NET cartservice. + +{{% notice Note %}} +Production migrations begin with assessing cross-architecture compatibility for each service (base images, native extensions such as CGO/JNI, platform-specific packages, and CI build targets). This section demonstrates minor Dockerfile edits for four representative services. In the referenced Online Boutique release, the remaining services generally build for both **amd64** and **arm64** without modification. +{{% /notice %}} + +### Update the emailservice Dockerfile + +You can review the [emailservice Dockerfile](https://raw.githubusercontent.com/GoogleCloudPlatform/microservices-demo/refs/heads/main/src/emailservice/Dockerfile) before replacing it. 
+ +Run the following command to replace the entire contents of the file with the multi-architecture-compatible version: + +```bash +cat << 'EOF' > src/emailservice/Dockerfile +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# syntax=docker/dockerfile:1.6 +ARG TARGETPLATFORM +FROM --platform=$TARGETPLATFORM python:3.12.8-alpine@sha256:54bec49592c8455de8d5983d984efff76b6417a6af9b5dcc8d0237bf6ad3bd20 AS base +FROM --platform=$TARGETPLATFORM base AS builder +RUN apk add --no-cache g++ gcc linux-headers musl-dev libffi-dev openssl-dev +COPY requirements.txt . +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install --prefer-binary -r requirements.txt +FROM --platform=$TARGETPLATFORM base +ENV PYTHONUNBUFFERED=1 +ENV ENABLE_PROFILER=1 +RUN apk add --no-cache libstdc++ libgcc libffi openssl +WORKDIR /email_server +COPY --from=builder /usr/local/lib/python3.12/ /usr/local/lib/python3.12/ +COPY . . +EXPOSE 8080 +ENTRYPOINT ["python","email_server.py"] + + +EOF +``` + +Here is a summary of the changes: + +- **BuildKit syntax** unlocks `--mount=type=cache` to speed rebuilds. +- **TARGETPLATFORM** lets Buildx set linux/amd64 vs linux/arm64 explicitly. +- **Dev vs runtime packages:** build stage compiles native wheels; final stage keeps only needed shared libs. +- **`--prefer-binary`** avoids source builds when wheels exist (more reliable across arches). +- **Removed extra `apk update`** since `apk add --no-cache` already avoids stale indexes & caches. + +## Apply updates to the other three services + +Run the following sed commands to automatically patch the remaining Dockerfiles. + +### Update the recommendationservice Dockerfile + +You can review the [recommendationservice Dockerfile](https://raw.githubusercontent.com/GoogleCloudPlatform/microservices-demo/refs/heads/main/src/recommendationservice/Dockerfile) before modifying it. + +Paste the command below to your terminal to update the file with the required multi-architecture changes. 
+ +```bash +sed -i \ + -e '/^# limitations under the License./a ARG TARGETPLATFORM' \ + -e 's|^FROM[[:space:]]\+python:3\.12\.8-alpine@sha256:54bec49592c8455de8d5983d984efff76b6417a6af9b5dcc8d0237bf6ad3bd20 AS base|FROM --platform=\$TARGETPLATFORM python:3.12.8-alpine@sha256:54bec49592c8455de8d5983d984efff76b6417a6af9b5dcc8d0237bf6ad3bd20 AS base|' \ + -e 's|^FROM[[:space:]]\+base[[:space:]]\+AS[[:space:]]\+builder|FROM --platform=\$TARGETPLATFORM base AS builder|' \ + -e 's|^FROM[[:space:]]\+base$|FROM --platform=\$TARGETPLATFORM base|' \ + -e '/apk add/ s/linux-headers/& musl-dev/' \ + -e '/pip install/ s|-r requirements.txt|--prefix=/install -r requirements.txt|' \ + -e 's|COPY --from=builder /usr/local/lib/python3\.12/ /usr/local/lib/python3\.12/|COPY --from=builder /install/lib/python3.12 /usr/local/lib/python3.12/|' \ + src/recommendationservice/Dockerfile +``` + +Here is a summary of the changes: + +- Make the base image architecture-aware +- Let native wheels build cleanly +- Keep the runtime slim & predictable + +### Update loadgenerator Dockerfile + +You can review the [loadgenerator Dockerfile](https://raw.githubusercontent.com/GoogleCloudPlatform/microservices-demo/refs/heads/main/src/loadgenerator/Dockerfile) before modifying it. + +Paste the command below to your terminal to run `sed` to update the file with the required multi-architecture changes. + +```bash +FILE=src/loadgenerator/Dockerfile + +# Platform plumbing (TARGETPLATFORM) + fix FROM lines +sed -i \ + -e '/^FROM --platform=\$BUILDPLATFORM python:3\.12\.8-alpine@sha256:54bec49592c8455de8d5983d984efff76b6417a6af9b5dcc8d0237bf6ad3bd20 AS base/i ARG TARGETPLATFORM' \ + -e 's/^FROM --platform=\$BUILDPLATFORM python:3\.12\.8-alpine@sha256:54bec49592c8455de8d5983d984efff76b6417a6af9b5dcc8d0237bf6ad3bd20 AS base/FROM --platform=$TARGETPLATFORM python:3.12.8-alpine@sha256:54bec49592c8455de8d5983d984efff76b6417a6af9b5dcc8d0237bf6ad3bd20 AS base/' \ + -e 's/^FROM base AS builder$/FROM --platform=$TARGETPLATFORM base AS builder/' \ + -e 's/^FROM base$/FROM --platform=$TARGETPLATFORM base/' \ + "$FILE" + +# Ensure libgcc is present on runtime apk line that installs libstdc++ +sed -i -E \ + '/^[[:space:]]*&&[[:space:]]*apk add --no-cache[[:space:]]+libstdc\+\+/{/libgcc/! s/libstdc\+\+/libstdc++ libgcc/}' \ + "$FILE" + +# Add musl-dev to the builder deps line +sed -i -E \ + '/^[[:space:]]*&&[[:space:]]*apk add --no-cache[[:space:]]+wget[[:space:]]+g\+\+[[:space:]]+linux-headers/ s/linux-headers/linux-headers musl-dev/' \ + "$FILE" +``` + +Here is a summary of the changes: + +- Make the base image architecture-aware +- Fix native build/run deps +- Keep runtime lean and no flags/app code changed + +### Update cartservice Dockerfile + +You can review the [carkservice Dockerfile](https://raw.githubusercontent.com/GoogleCloudPlatform/microservices-demo/refs/heads/main/src/cartservice/src/Dockerfile) before replacing it. + +Paste the command below to your terminal to update the file with the required multi-architecture changes. 
+ +```bash +FILE=src/cartservice/src/Dockerfile + +# 1) After the ARG line, install protoc in the builder image +sed -i \ + '/^ARG TARGETARCH$/a RUN apt-get update \&\& apt-get install -y --no-install-recommends protobuf-compiler \&\& rm -rf /var/lib/apt/lists/*' \ + "$FILE" + +# 2) In the publish step, inject Protobuf_Protoc=/usr/bin/protoc right after the first line +sed -i \ + '/^RUN[[:space:]]\+dotnet publish cartservice\.csproj[[:space:]]*\\$/a \ -p:Protobuf_Protoc=/usr/bin/protoc \\' \ + "$FILE" + +``` + +Here is a summary of the changes: + +- Install the system `protoc` command +- Force MSBuild to use the supplied `protoc` command +- No behavioral changes + +{{% notice Note %}} +`ARG TARGETPLATFORM` + `FROM --platform=$TARGETPLATFORM` is not strictly required if you always build with --platform and your base image is multi-arch. Keeping it is good practice and makes intent explicit and does not change runtime behavior. + +{{% /notice %}} + +After making the Dockerfile modification, all services now support multi-architecture builds. + diff --git a/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/project-setup.md b/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/project-setup.md new file mode 100644 index 0000000000..635930445e --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/project-setup.md @@ -0,0 +1,116 @@ +--- +# User change +title: "Overview and Environment Setup" + +weight: 2 # 1 is first, 2 is second, etc. + +# Do not modify these elements +layout: "learningpathall" +--- + +This Learning Path demonstrates how to migrate a real microservices application from x86 to Arm (amd64 to arm64) on GKE using multi-architecture container images. The sample application is Google's Online Boutique, a polyglot microservices system that mirrors production architectures and ships with Dockerfiles. It's a realistic, real-world scenario, and the migration can be done with no major code changes. + +## Why Google Axion processors for GKE? + +Google Axion processors bring modern Arm-based compute to GKE, delivering strong price-performance and energy efficiency for cloud-native, scale-out services. With multi-architecture images and mixed node pools, services can migrate from x86 to Arm gradually, with no major code changes. + +### What is Google Axion? + +[Google Axion](https://cloud.google.com/blog/products/compute/introducing-googles-new-arm-based-cpu) is Google Cloud's Arm-based CPU family built on Arm Neoverse, designed for general-purpose, cloud-native services and CPU-based AI. Typical workloads include web apps and web servers, containerized microservices, open-source databases, in-memory caches, data analytics, media processing, and CPU-based AI inference and data processing. On GKE, Axion powers the C4A and N4A VM families and is paired with Google's Titanium offloads to free CPU cycles for application work. + +### Why migrate to Arm on GKE? 
+ +There are three factors motivating the move to Google Axion processors: + +- **Price-performance:** run more workload per dollar for scale-out services +- **Energy efficiency:** reduce power usage for always-on microservices +- **Compatibility:** containerized apps typically migrate with build/deploy changes, and don't require code rewrites + +### About the Online Boutique sample application + +[Online Boutique](https://github.com/GoogleCloudPlatform/microservices-demo) is a polyglot microservices storefront, complete with shopping cart, checkout, catalog, ads, and recommendations. It's implemented in Go, Java, Python, .NET, and Node.js, with ready-to-use Dockerfiles and Kubernetes manifests. It's a realistic example for demonstrating an x86 to Arm migration with minimal code changes. + +### Multi-architecture on GKE (pragmatic path) + +This Learning Path presents a pragmatic migration approach that builds both amd64 and arm64 images using Docker Buildx with a Kubernetes driver, where builds run natively inside BuildKit pods on your GKE node pools without requiring QEMU emulation. You'll add an Arm node pool alongside existing x86 nodes, then use node selectors and affinity rules to control placement and migrate safely, service by service. + +### How this Learning Path demonstrates the migration + +You'll migrate the Online Boutique application from x86 to Arm using a practical, low-risk approach that leverages multi-architecture container images and mixed node pools. This allows you to validate each service on Arm before fully committing to the migration, ensuring compatibility and performance meet your requirements. + +The steps below outline the migration process: + +1. Open Google Cloud Shell and set the environment variables. +2. Enable required APIs, create an Artifact Registry repository, and authenticate Docker. +3. Create a GKE Standard cluster with an amd64 node pool and add an arm64 (Axion-based C4A) node pool. +4. Create a Buildx (Kubernetes driver) builder that targets both pools, then build and push multi-architecture images (amd64 and arm64) natively via BuildKit pods. +5. Deploy to amd64 first (Kustomize overlay), validate, then migrate to arm64 (overlay) and verify. +6. Automate builds and rollouts with Cloud Build and Skaffold. + +## Get started in Cloud Shell + +Use [Cloud Shell](https://cloud.google.com/shell/docs/using-cloud-shell) to set variables, enable APIs, create Artifact Registry, authenticate Docker, and clone the sample microservices demo. + +Make sure `kubectl`, `gcloud`, `docker`, and `git` commands are installed. + +{{% notice Note %}} +You can use your local macOS or Linux computer instead of Cloud Shell. Make sure the required software is installed. 
+{{% /notice %}} + +### Set environment variables + +Run the following commands in your terminal to set the project, region/zone, cluster, and Artifact Registry variables: + +```bash +export PROJECT_ID="$(gcloud config get-value project)" +export REGION="us-central1" +export ZONE="us-central1-a" +export CLUSTER_NAME="gke-multi-arch-cluster" + +# Artifact Registry settings +export REPO="multi-arch-services" +# GAR is the Artifact Registry host/repo prefix used in image tags (e.g., ${GAR}/service:tag) +export GAR="${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPO}" + +# Ensure gcloud uses these defaults +gcloud config set project "${PROJECT_ID}" +gcloud config set compute/region "${REGION}" +gcloud config set compute/zone "${ZONE}" +``` + +You'll need the environment variables in any shell you use to work on the project. + +### Enable required Google Cloud APIs + +Enable the required APIs so the project can create GKE clusters, push and pull container images in Artifact Registry, and use Cloud Build for CI/CD: + +```bash +gcloud services enable container.googleapis.com artifactregistry.googleapis.com cloudbuild.googleapis.com +``` + +### Create an Artifact Registry (Docker) repository + +Create a Docker repository in Artifact Registry in this region for pushing and pulling your multi-architecture images: + +```bash +gcloud artifacts repositories create "${REPO}" --repository-format=docker --location="${REGION}" --description="Multi-arch images for microservices demo" +``` + +### Authenticate Docker to Artifact Registry + +Authenticate Docker to Artifact Registry so you can push and pull images: + +```bash +gcloud auth configure-docker "${REGION}-docker.pkg.dev" +``` + +### Clone the Online Boutique sample microservices application + +Clone the sample application repository: + +```bash +git clone https://github.com/GoogleCloudPlatform/microservices-demo.git +cd microservices-demo +``` + +You're now ready to start making modifications for arm64 support. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/_index.md b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/_index.md new file mode 100644 index 0000000000..41cd6a2399 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/_index.md @@ -0,0 +1,66 @@ +--- +title: Build hybrid AKS clusters with Arm nodes and nginx + +draft: true +cascade: + draft: true + +minutes_to_complete: 60 + +who_is_this_for: This Learning Path is for developers who want to understand nginx performance on x64 and arm64 deployments by running a hybrid Azure Kubernetes Service (AKS) cluster. + +learning_objectives: + - Create a hybrid AKS cluster with x64 and arm64 nodes + - Deploy nginx's multi-architecture container image, pods, and services to the AKS cluster + - Smoke test nginx from each architecture in the cluster to verify proper installation + - Test the performance of each architecture in the cluster + - Apply the same process to other kubernetes workloads + + +prerequisites: + - An [Azure account](https://azure.microsoft.com/en-us/free/). + - A local machine with [jq](https://jqlang.org/download/), [curl](https://curl.se/download.html), [wrk](https://github.com/wg/wrk), [Azure CLI](/install-guides/azure-cli/) and [kubectl](/install-guides/kubectl/) installed. 
+ +author: + - Geremy Cohen + +### Tags +skilllevels: Introductory + +subjects: Containers and Virtualization +cloud_service_providers: Microsoft Azure + +armips: + - Neoverse + +operatingsystems: + - Linux + +tools_software_languages: + - nginx + - Web Server + +further_reading: + - resource: + title: nginx - High Performance Load Balancer, Web Server, & Reverse Proxy + link: https://nginx.org/ + type: documentation + - resource: + title: nginx Docker Hub + link: https://hub.docker.com/_/nginx + type: documentation + - resource: + title: Azure Kubernetes Service (AKS) documentation + link: https://docs.microsoft.com/en-us/azure/aks/ + type: documentation + - resource: + title: Learn how to tune Nginx + link: https://learn.arm.com/learning-paths/servers-and-cloud-computing/nginx_tune/ + type: documentation + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/apply-configuration.md b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/apply-configuration.md new file mode 100644 index 0000000000..5561d9d82c --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/apply-configuration.md @@ -0,0 +1,183 @@ +--- +title: Monitor performance with wrk and btop +weight: 70 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Install btop monitoring tool on nginx pods + +Now that you have all your nginx deployments running across Intel and Arm architectures, you can monitor performance across each architecture using wrk to generate load and btop to monitor system performance. + +{{% notice Note %}} +This tutorial uses [wrk](https://github.com/wg/wrk) to generate load, which is readily available on apt and brew package managers. [wrk2](https://github.com/giltene/wrk2) is a modern fork of wrk with additional features. wrk was chosen for this tutorial due to its ease of installation, but if you prefer to install and use wrk2 (or other http load generators) for your testing, feel free to do so. +{{% /notice %}} + +### Install btop and apply optimized configuration + +The `nginx_util.sh` script includes a `put config` command that will: + +- Apply a performance-optimized nginx configuration to all pods +- Install btop monitoring tool on all pods for system monitoring +- Restart pods with the new configuration + +Run the following command to apply the configuration updates: + +```bash +./nginx_util.sh put btop +``` + +You will see output similar to the following: + +```output +Installing btop on all nginx pods... +Installing btop on nginx-amd-deployment-56b547bb47-vgbjj... +✓ btop installed on nginx-amd-deployment-56b547bb47-vgbjj +Installing btop on nginx-arm-deployment-66cb47ddc9-fgmsd... +✓ btop installed on nginx-arm-deployment-66cb547ddc9-fgmsd +Installing btop on nginx-intel-deployment-6f5bff9667-zdrqc... +✓ btop installed on nginx-intel-deployment-6f5bff9667-zdrqc +✅ btop installed on all pods! +``` + +### Check pod restart status + +Check that all pods have restarted with the new configuration: + +```bash +kubectl get pods -n nginx +``` + +You should see all pods with recent restart times. 
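+
+If you want to follow the restarts as they happen, one option is to watch the pod list until every pod is back in the `Running` state (press Ctrl+C to stop watching). This is a general-purpose kubectl check, not part of the `nginx_util.sh` script:
+
+```bash
+# Watch pod status in the nginx namespace; updates print as pods restart
+kubectl get pods -n nginx --watch
+```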
+ +{{% notice Note %}} +Because pods are ephemeral, btop will need to be reinstalled if the pods are deleted or restarted. If you get an error saying btop is not found, rerun the `./nginx_util.sh put btop` command to reinstall it. +{{% /notice %}} + + +### Set up real-time performance monitoring + +You can now log in to any pod and use btop to monitor system performance. There are many variables that can affect an individual workload's performance, and btop (like top) is a great first step in understanding those variables. + +{{% notice Note %}} +When performing load generation tests from your laptop, local system and network settings may interfere with proper load generation between your machine and the remote cluster services. To mitigate these issues, it's suggested to install the `nginx_util.sh` script on a [remote Azure instance](https://learn.arm.com/learning-paths/servers-and-cloud-computing/csp/azure/) in the same region and zone as your K8s cluster for best results. If you aren't seeing at least 70K+ requests/s to either K8s service endpoint, switching to a better located system is advised. +{{% /notice %}} + +Running two btop terminals, one for each pod, is a convenient way to view performance in real time. + +To bring up btop on both Arm and Intel pods: + +1. Open two new terminal windows +2. In one terminal, run `login arm` from the nginx utility script to enter the pod +3. In the second terminal, run `login intel` from the nginx utility script to enter the pod +4. Once inside each pod, run btop to see real-time system monitoring + +The commands are shown below. + +For the Arm terminal: + +```bash +./nginx_util.sh login arm +``` + +For the Intel terminal: + +```bash +./nginx_util.sh login intel +``` + +In both terminals run: + +```bash +btop --utf-force +``` + +You should now see something similar to the image below, with one terminal for each Arm and Intel pod running btop: + +![Project Overview](images/btop_idle.png) + +To visualize performance with btop against the Arm and Intel pods via the load balancer service endpoints, you can use the `nginx_util.sh` wrapper to generate load to both simultaneously: + +```bash +./nginx_util.sh wrk both +``` + +This runs wrk with predefined settings (1 thread, 50 simultaneous connections) to generate load to the K8s architecture-specific endpoints. + +While it runs (for a default of 30s), you can observe some performance characteristics from the btop outputs: + +![Project Overview](images/under_load.png) + +Of particular interest is memory and CPU resource usage per pod. For Intel, red marker 1 shows memory usage for the process, and red marker 2 shows total CPU usage. + +Red markers 3 and 4 show the same metrics for Arm. + +![Project Overview](images/mem_and_cpu.png) + +In addition to the visual metrics, the script also returns runtime results including requests per second and latencies: + +```output +Running wrk against both architectures in parallel... 
+ +Intel: wrk -t1 -c50 -d30 http://172.193.227.195/ +ARM: wrk -t1 -c50 -d30 http://20.252.73.72/ + +======================================== + +INTEL RESULTS: +Running 30s test @ http://172.193.227.195/ + 1 threads and 50 connections + Thread Stats Avg Stdev Max +/- Stdev + Latency 752.40us 1.03ms 28.95ms 94.01% + Req/Sec 84.49k 12.14k 103.08k 73.75% + 2528743 requests in 30.10s, 766.88MB read +Requests/sec: 84010.86 +Transfer/sec: 25.48MB + +ARM RESULTS: +Running 30s test @ http://20.252.73.72/ + 1 threads and 50 connections + Thread Stats Avg Stdev Max +/- Stdev + Latency 621.56us 565.90us 19.75ms 95.43% + Req/Sec 87.54k 10.22k 107.96k 82.39% + 2620567 requests in 30.10s, 789.72MB read +Requests/sec: 87062.21 +Transfer/sec: 26.24MB + +======================================== +Both tests completed +``` + +### Customize load testing parameters + +The `nginx_util.sh` script shows the results of the load generation, as well as the command lines used to generate them. + +```output +... +Intel: wrk -t1 -c50 -d30 http://172.193.227.195/ +ARM: wrk -t1 -c50 -d30 http://20.252.73.72/ +... +``` + + +Feel free to experiment with by increasing and decreasing client threads, connections, and durations to better understand the performance characteristics under different scenarios. + +For example, to generate load using 500 connections across 4 threads to the Arm service for 5 minutes (300s), you can use the following command: + +```bash +wrk -t4 -c500 -d300 http://20.252.73.72/ +``` + +## Next Steps + +You have learned how to run a sample nginx workload on a dual-architecture (Arm and Intel) Azure Kubernetes Service. + +You learned how to generate load with the wrk utility and monitor runtime metrics with btop. + +Here are some ideas for further exploration: + +* What do the performance curves look like between the two architectures as a function of load? +* How do larger instance types scale versus smaller ones? + +You now have the knowledge to experiment with your own workloads on Arm-based AKS nodes to identify performance and efficiency opportunities unique to your own environments. diff --git a/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/create-test-utility.md b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/create-test-utility.md new file mode 100644 index 0000000000..f3cb97605c --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/create-test-utility.md @@ -0,0 +1,52 @@ +--- +title: Create the test utility +weight: 20 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Test utility script + +In this section, you'll create a utility script to test and manage your nginx services across both architectures. The script will be used throughout the Learning Path to test services, apply configurations, and access pods. + +### Script functionality + +The `nginx_util.sh` script provides three main functions: + +- **`curl intel|arm|multiarch`** - Test nginx services and show which pod served the request +- **`put btop`** - Install btop monitoring tool on all pods +- **`login intel|arm`** - Interactive bash access to architecture-specific pods + +The script conveniently bundles test and logging commands into a single place, making it easy to test, troubleshoot, and view services. + +### Download the utility script + +{{% notice Note %}} +The following utility `nginx_util.sh` is provided for your convenience. + +It's a wrapper for kubectl and other commands, utilizing [curl](https://curl.se/). 
Make sure you have curl installed before running. + +You can click on the link below to review the code before downloading. +{{% /notice %}} + +Copy and paste the following commands into a terminal to download and create the `nginx_util.sh` script: + +```bash +curl -o nginx_util.sh https://raw.githubusercontent.com/geremyCohen/nginxOnAKS/refs/heads/main/nginx_util.sh +chmod +x nginx_util.sh +``` + +In the folder you ran the curl command, you should now see the `nginx_util.sh` script. Test it by running: + +```bash +./nginx_util.sh +``` + +The output presents the usage instructions: + +```output +Invalid first argument. Use 'curl', 'wrk', 'put', or 'login'. +``` + +You're now ready to deploy nginx to the Intel nodes in the cluster. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/deploy-arm.md b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/deploy-arm.md new file mode 100644 index 0000000000..db0c57b576 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/deploy-arm.md @@ -0,0 +1,123 @@ +--- +title: Deploy nginx on Arm +weight: 50 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Add the Arm deployment and service + +In this section, you'll add nginx on Arm to your existing cluster, completing your multi-architecture Intel/Arm environment for comprehensive performance comparison. + +When applied, the **arm_nginx.yaml** file creates the following K8s objects: + - **Deployment** (`nginx-arm-deployment`) - Pulls the multi-architecture nginx image from DockerHub, launches a pod on the Arm node, and mounts the shared ConfigMap as `/etc/nginx/nginx.conf` + - **Service** (`nginx-arm-svc`) - Load balancer targeting pods with both `app: nginx-multiarch` and `arch: arm` labels + +Copy and paste the following commands into a terminal to download and apply the Arm deployment and service: + +```bash +curl -o arm_nginx.yaml https://raw.githubusercontent.com/geremyCohen/nginxOnAKS/refs/heads/main/arm_nginx.yaml +kubectl apply -f arm_nginx.yaml +``` + +You will see output similar to: + +```output +deployment.apps/nginx-arm-deployment created +service/nginx-arm-svc created +``` + +### Examining the deployment configuration + +Taking a closer look at the `arm_nginx.yaml` deployment file, you'll see settings optimized for the Arm architecture: + +The `nodeSelector` value of `kubernetes.io/arch: arm64` ensures that the deployment only runs on Arm nodes, utilizing the `arm64` version of the nginx container image. + +```yaml + spec: + nodeSelector: + kubernetes.io/arch: arm64 +``` + +The service selector uses both `app: nginx-multiarch` and `arch: arm` labels to target only Arm pods. This dual-label approach allows for both architecture-specific and multi-architecture service routing. 
+ +```yaml + selector: + app: nginx-multiarch + arch: arm +``` + +### Verify the deployment + +Get the status of nodes, pods and services by running: + +```bash +kubectl get nodes,pods,svc -nnginx +``` + +Your output should be similar to the following, showing two nodes, two pods, and two services: + +```output +NAME STATUS ROLES AGE VERSION +node/aks-arm-56500727-vmss000000 Ready 59m v1.32.7 +node/aks-intel-31372303-vmss000000 Ready 63m v1.32.7 + +NAME READY STATUS RESTARTS AGE +pod/nginx-arm-deployment-5bf8df95db-wznff 1/1 Running 0 36s +pod/nginx-intel-deployment-78bb8885fd-mw24f 1/1 Running 0 9m21s + +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +service/nginx-arm-svc LoadBalancer 10.0.241.154 48.192.64.197 80:30082/TCP 36s +service/nginx-intel-svc LoadBalancer 10.0.226.250 20.80.128.191 80:30080/TCP 9m22s +``` + +You can also verify the shared ConfigMap is available: + +```bash +kubectl get configmap -nnginx +``` + +The output is similar to: + +```output +NAME DATA AGE +nginx-config 1 10m +``` + +When the pods show `Running` and the service shows a valid `External IP`, you're ready to test the nginx Arm service. + +### Test the nginx web service on Arm + +Run the following command to make an HTTP request to the Arm nginx service using the script you created earlier: + +```bash +./nginx_util.sh curl arm +``` + +You get back the HTTP response, as well as information about which pod served it: + +```output +Using service endpoint 48.192.64.197 for curl on arm service +Response: +{ + "message": "nginx response", + "timestamp": "2025-10-24T22:04:59+00:00", + "server": "nginx-arm-deployment-5bf8df95db-wznff", + "request_uri": "/" +} +Served by: nginx-arm-deployment-5bf8df95db-wznff +``` + +If you see similar output, you have successfully added Arm nodes to your cluster running nginx. + +### Compare both architectures + +Now you can test both architectures and compare their responses: + +```bash +./nginx_util.sh curl intel +./nginx_util.sh curl arm +``` + +Each command will route to its respective architecture-specific service, allowing you to compare performance and verify that your multi-architecture cluster is working correctly. diff --git a/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/deploy-intel.md b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/deploy-intel.md new file mode 100644 index 0000000000..f13d707d0e --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/deploy-intel.md @@ -0,0 +1,154 @@ +--- +title: Deploy nginx on Intel x86 +weight: 30 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Deployment and service + +In this section, you'll add a new namespace, deployment, and service for nginx on Intel x86. The result will be a K8s cluster running nginx accessible via the Internet through a load balancer. 
+
+To better understand the individual components, the configuration is split into three files:
+
+`namespace.yaml` - Creates a new namespace called `nginx`, which contains all your K8s nginx objects
+
+`nginx-configmap.yaml` - Creates a shared ConfigMap (`nginx-config`) containing performance-optimized nginx configuration used by both Intel and Arm deployments
+
+`intel_nginx.yaml` - Creates the following K8s objects:
+  - **Deployment** (`nginx-intel-deployment`) - Pulls a multi-architecture [nginx image](https://hub.docker.com/_/nginx) from DockerHub, launches a pod on the Intel node, and mounts the shared ConfigMap as `/etc/nginx/nginx.conf`
+  - **Service** (`nginx-intel-svc`) - Load balancer targeting pods with both `app: nginx-multiarch` and `arch: intel` labels
+
+Run the following commands to download, create, and apply the namespace, ConfigMap, and Intel nginx deployment and service configuration:
+
+```bash
+curl -o namespace.yaml https://raw.githubusercontent.com/geremyCohen/nginxOnAKS/refs/heads/main/namespace.yaml
+kubectl apply -f namespace.yaml
+
+curl -o nginx-configmap.yaml https://raw.githubusercontent.com/geremyCohen/nginxOnAKS/refs/heads/main/nginx-configmap.yaml
+kubectl apply -f nginx-configmap.yaml
+
+curl -o intel_nginx.yaml https://raw.githubusercontent.com/geremyCohen/nginxOnAKS/refs/heads/main/intel_nginx.yaml
+kubectl apply -f intel_nginx.yaml
+```
+
+You will see output similar to:
+
+```output
+namespace/nginx created
+configmap/nginx-config created
+deployment.apps/nginx-intel-deployment created
+service/nginx-intel-svc created
+```
+
+### Examine the deployment configuration
+
+Taking a closer look at the `intel_nginx.yaml` deployment file, you'll see some settings that ensure the deployment runs on the Intel x86 node.
+
+{{% notice Note %}}
+The `amd64` architecture label represents x86_64 nodes, which can be either AMD or Intel processors. In this tutorial, we're using Intel x64 nodes.
+{{% /notice %}}
+
+The `nodeSelector` value is set to `kubernetes.io/arch: amd64`. This ensures that the deployment only runs on x86_64 nodes, utilizing the amd64 version of the nginx container image.
+
+```yaml
+    spec:
+      nodeSelector:
+        kubernetes.io/arch: amd64
+```
+
+The `sessionAffinity` setting removes sticky connections to the target pods, so requests are not pinned to the same pod.
+
+```yaml
+spec:
+  sessionAffinity: None
+```
+
+The service selector uses both `app: nginx-multiarch` and `arch: intel` labels to target only Intel pods. This dual-label approach allows for both architecture-specific and multi-architecture service routing.
+
+```yaml
+  selector:
+    app: nginx-multiarch
+    arch: intel
+```
+
+Because the final goal is to run nginx on multiple architectures, the deployment uses the standard nginx image from DockerHub. This image supports multiple architectures, including amd64 (Intel) and arm64 (Arm).
+
+```yaml
+      containers:
+      - image: nginx:latest
+        name: nginx
+```
+
+{{% notice Note %}}
+Optionally, you can set the default namespace to `nginx` to simplify future commands by removing the need to specify the `-nnginx` flag each time:
+```bash
+kubectl config set-context --current --namespace=nginx
+```
+{{% /notice %}}
+
+### Verify the deployment is complete
+
+It's time to verify everything is running as expected.
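+
+Optionally, you can first wait for the rollout to finish before listing resources; a minimal check using the deployment name created above:
+
+```bash
+# Blocks until the Intel deployment reports a successful rollout
+kubectl rollout status deployment/nginx-intel-deployment -nnginx
+```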
+ +Confirm the nodes, pods, and services are running: + +```bash +kubectl get nodes,pods,svc -nnginx +``` + +Your output should be similar to the following, showing two nodes, one pod, and one service: + +```output +NAME STATUS ROLES AGE VERSION +node/aks-arm-56500727-vmss000000 Ready 50m v1.32.7 +node/aks-intel-31372303-vmss000000 Ready 55m v1.32.7 + +NAME READY STATUS RESTARTS AGE +pod/nginx-intel-deployment-78bb8885fd-mw24f 1/1 Running 0 38s + +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +service/nginx-intel-svc LoadBalancer 10.0.226.250 20.80.128.191 80:30080/TCP 39s +``` + +You can also verify the ConfigMap was created: + +```bash +kubectl get configmap -nnginx +``` + +```output +NAME DATA AGE +nginx-config 1 51s +``` + +With the pods in a `Ready` state and the service showing a valid `External IP`, you're now ready to test the nginx Intel service. + +### Test the Intel service + +Run the following to make an HTTP request to the Intel nginx service: + +```bash +./nginx_util.sh curl intel +``` + +You get back the HTTP response, as well as information about which pod served it: + +```output +Using service endpoint 20.3.71.69 for curl on intel service +Response: +{ + "message": "nginx response", + "timestamp": "2025-10-24T16:49:29+00:00", + "server": "nginx-intel-deployment-758584d5c6-2nhnx", + "request_uri": "/" +} +Served by: nginx-intel-deployment-758584d5c6-2nhnx +``` + +If you see similar output, you've successfully configured your AKS cluster with an Intel node, running an nginx deployment and service with the nginx multi-architecture container image. + + diff --git a/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/deploy-multiarch.md b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/deploy-multiarch.md new file mode 100644 index 0000000000..2f6b2695b9 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/deploy-multiarch.md @@ -0,0 +1,100 @@ +--- +title: Deploy a nginx multiarch service +weight: 60 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Add a multi-architecture service to your cluster + +You now have nginx running on Intel and Arm nodes with architecture-specific services. In this section, you'll create a multi-architecture service that can route to any available nginx pod regardless of architecture, providing load balancing across both architectures. + +### Create the multiarch service + +The multiarch service targets all pods with the `app: nginx-multiarch` label (all nginx deployments share this label). It uses `sessionAffinity: None` to ensure requests are distributed across all available pods without stickiness, and can route to Intel or Arm pods based on availability and load balancing algorithms. 
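+
+After you apply the service in the next step, you can print its selector to see exactly which labels it matches on; based on the description above, it should carry only the shared `app: nginx-multiarch` label, which is what lets it match pods from both deployments:
+
+```bash
+# Show the label selector the multiarch service uses to pick pods
+kubectl get svc nginx-multiarch-svc -nnginx -o jsonpath='{.spec.selector}'
+```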
+ +Run the following commands to download and apply the multiarch service: + +```bash +curl -sO https://raw.githubusercontent.com/geremyCohen/nginxOnAKS/main/multiarch_nginx.yaml +kubectl apply -f multiarch_nginx.yaml +``` + +You see the following response: + +```output +service/nginx-multiarch-svc created +``` + +Next, get the status of all services by running: + +```bash +kubectl get svc -nnginx +``` + +Your output should be similar to the following, showing three services: + +```output +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +nginx-arm-svc LoadBalancer 10.0.241.154 48.192.64.197 80:30082/TCP 7m52s +nginx-intel-svc LoadBalancer 10.0.226.250 20.80.128.191 80:30080/TCP 16m +nginx-multiarch-svc LoadBalancer 10.0.40.169 20.99.208.140 80:30083/TCP 38s +``` + +Check which pods the multiarch service can route to: + +```bash +kubectl get endpoints nginx-multiarch-svc -nnginx +``` + +You should see both architecture pods listed as endpoints: + +```output +NAME ENDPOINTS AGE +nginx-multiarch-svc 10.244.0.21:80,10.244.1.1:80 47s +``` + +You are ready to test the multiarch service. + +### Test the nginx multiarch service + +Run the following to make HTTP requests to the multiarch nginx service: + +```bash +./nginx_util.sh curl multiarch +``` + +You get back the HTTP response from one of the available pods: + +```output +Using service endpoint 20.99.208.140 for curl on multiarch service +Response: +{ + "message": "nginx response", + "timestamp": "2025-10-24T22:12:23+00:00", + "server": "nginx-arm-deployment-5bf8df95db-wznff", + "request_uri": "/" +} +Served by: nginx-arm-deployment-5bf8df95db-wznff +``` + +Run the command multiple times to see load balancing across architectures: + +```bash +./nginx_util.sh curl multiarch +./nginx_util.sh curl multiarch +./nginx_util.sh curl multiarch +``` + +The responses will show requests being served by different architecture deployments (Intel or Arm), demonstrating that the multiarch service distributes the load across the available pods. + +### Compare architecture-specific versus multiarch routing + +Now you can compare the behavior: + +- **Architecture-specific**: `./nginx_util.sh curl intel` always routes to Intel pods +- **Architecture-specific**: `./nginx_util.sh curl arm` always routes to ARM pods +- **Multiarch**: `./nginx_util.sh curl multiarch` routes to any available pod + +This multiarch service provides high availability and load distribution across your entire multi-architecture cluster. 
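+
+To see the cross-architecture distribution at a glance, you can loop the request and pull out the serving pod from each response. This is a small sketch that assumes the `Served by:` line shown in the output above:
+
+```bash
+# Send 10 requests through the multiarch service and print which pod served each one
+for i in $(seq 1 10); do
+  ./nginx_util.sh curl multiarch | grep 'Served by:'
+done
+```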
diff --git a/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/images/btop_idle.png b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/images/btop_idle.png new file mode 100644 index 0000000000..cc5d83874f Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/images/btop_idle.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/images/mem_and_cpu.png b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/images/mem_and_cpu.png new file mode 100644 index 0000000000..d3091fc1f0 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/images/mem_and_cpu.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/images/under_load.png b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/images/under_load.png new file mode 100644 index 0000000000..66e7bb658e Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/images/under_load.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/spin_up_aks_cluster.md b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/spin_up_aks_cluster.md new file mode 100644 index 0000000000..bab71b30f7 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/multiarch_nginx_on_aks/spin_up_aks_cluster.md @@ -0,0 +1,130 @@ +--- +title: Create the AKS Cluster +weight: 10 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Project Overview + +Arm CPUs are widely used in web server workloads on Kubernetes (k8s). In this Learning Path, you'll learn how to deploy [nginx](https://nginx.org/) on Arm-based CPUs within a heterogeneous (x64 and arm64) K8s cluster on Azure's AKS. + +### Benefits of the multi-architecture approach + +Many developers begin their journey with Arm on K8s by adding Arm nodes to an existing x64-based cluster. This has many advantages: + +1. Since you are already familiar with K8s on x64, you can leverage that knowledge to quickly get the core components up and running. +2. Leveraging the multi-architectural container image of your existing x64 workload expedites the migration to Arm with minimal deployment modifications. +3. With both x64 and Arm workloads running in the same cluster, comparing performance across them is simplified. + +This Learning Path explains how to create an initial AKS environment and install nginx on x64. From there, you'll add Arm-based nodes running the same exact workload. You'll see how to run simple tests to verify functionality, and then run performance testing to better understand the performance characteristics of each architecture. + +### Login to Azure using the Azure CLI + +To begin, login to your Azure account using the Azure CLI: + +```bash +az login +``` + +### Create the cluster and resource + +Once logged in, create the resource group and AKS cluster with two node pools: one with Intel-based nodes (Standard_D2s_v6), and one with Arm-based (Standard_D2ps_v6) nodes. + +{{% notice Note %}} +This tutorial uses the `westus2` region, which supports both Intel and Arm VM sizes. You can choose a different region if you prefer, but ensure it supports both VM types and AKS. +{{% /notice %}} + +Set the environment variables as shown below and run the `az aks` commands on your command line. 
+ +```bash +# Set environment variables +export RESOURCE_GROUP=nginx-on-arm-rg +export LOCATION=westus2 +export CLUSTER_NAME=nginx-on-arm + +# Create resource group +az group create --name $RESOURCE_GROUP --location $LOCATION + +# Create AKS cluster with Intel node pool in zone 2 +az aks create \ + --resource-group $RESOURCE_GROUP \ + --name $CLUSTER_NAME \ + --location $LOCATION \ + --zones 2 \ + --node-count 1 \ + --node-vm-size Standard_D2s_v6 \ + --nodepool-name intel \ + --generate-ssh-keys + +# Add ARM node pool in zone 2 +az aks nodepool add \ + --resource-group $RESOURCE_GROUP \ + --cluster-name $CLUSTER_NAME \ + --name arm \ + --zones 2 \ + --node-count 1 \ + --node-vm-size Standard_D2ps_v6 + +``` + +Each command returns JSON output. Verify that `"provisioningState": "Succeeded"` appears in each response. + +### Connect to the cluster + +Verify `kubectl` is available by running: + +```bash +kubectl version --client +``` + +The output should look similar to: + +```output +Client Version: v1.34.1 +Kustomize Version: v5.7.1 +``` + +If `kubectl` is installed the version information is printed. If you don't see the version information printed refer to the [Azure CLI](/install-guides/azure-cli) and [kubectl](/install-guides/kubectl/) install guides. + +Next, set up your newly-created K8s cluster credentials using the Azure CLI: + +```bash +az aks get-credentials --resource-group $RESOURCE_GROUP --name $CLUSTER_NAME +``` + +You should see: + +```output +Merged "nginx-on-arm" as current context in /home/user/.kube/config +``` + +To verify you're connected to the cluster: + +```bash +kubectl cluster-info +``` + +A message similar to the following should be displayed: + +```output +Kubernetes control plane is running at https://nginx-on-a-nginx-on-arm-rg-dd0bfb-eenbox6p.hcp.westus2.azmk8s.io:443 +``` + +With the cluster running, verify the node pools are ready with the following command: + +```bash +kubectl get nodes -o wide +``` + +You should see output similar to: + +```output +NAME STATUS ROLES AGE VERSION +aks-arm-13087205-vmss000002 Ready 6h8m v1.32.7 +aks-intel-39600573-vmss000002 Ready 6h8m v1.32.7 +``` + + +With all nodes showing `Ready` status, you're ready to continue to the next section. diff --git a/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/_index.md b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/_index.md new file mode 100644 index 0000000000..c8f6b189ac --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/_index.md @@ -0,0 +1,59 @@ +--- +title: Deploy Ruby on Rails on Arm-based Google Cloud C4A virtual machines + +minutes_to_complete: 40 + +who_is_this_for: This is an introductory topic for developers deploying and optimizing Ruby on Rails workloads in Linux Arm64 environments, specifically using Google Cloud C4A virtual machines powered by Axion processors. 
+ +learning_objectives: + - Provision an Arm-based SUSE SLES (SUSE Linux Enterprise Server) virtual machine on Google Cloud (C4A with Axion processors) + - Install Ruby on Rails on a SUSE Arm64 (C4A) instance + - Validate Ruby on Rails functionality using PostgreSQL as the database + - Benchmark Rails performance using the built-in Ruby Benchmark library on Arm64 (Aarch64) architecture + + +prerequisites: + - A [Google Cloud Platform (GCP)](https://cloud.google.com/free) account with billing enabled + - Basic familiarity with Ruby programming, the Rails framework, and the [PostgreSQL Relational Database](https://www.postgresql.org/) + +author: Pareena Verma + +### Tags +skilllevels: Introductory +subjects: Web +cloud_service_providers: Google Cloud + +armips: + - Neoverse + +tools_software_languages: + - Ruby + - Rails + - PostgreSQL + +operatingsystems: + - Linux + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +further_reading: + - resource: + title: Google Cloud documentation + link: https://cloud.google.com/docs + type: documentation + + - resource: + title: Ruby on Rails documentation + link: https://guides.rubyonrails.org/ + type: documentation + + - resource: + title: Ruby built-in Benchmark documentation + link: https://github.com/ruby/benchmark?tab=readme-ov-file#benchmark + type: documentation + +weight: 1 +layout: "learningpathall" +learning_path_main_page: "yes" +--- diff --git a/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/background.md b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/background.md new file mode 100644 index 0000000000..fea7b778b0 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/background.md @@ -0,0 +1,32 @@ +--- +title: Get started with Ruby on Rails on Google Axion C4A + +weight: 2 + +layout: "learningpathall" +--- + +## Learn about Google Axion C4A Arm instances in Google Cloud + +Google Axion C4A is a family of Arm-based virtual machines built on Google's custom Axion CPU, which is based on Arm Neoverse-V2 cores. Designed for high-performance and energy-efficient computing, these virtual machines offer strong performance for modern cloud workloads such as CI/CD pipelines, microservices, media processing, and general-purpose applications. + +The C4A series provides a cost-effective alternative to x86 virtual machines while leveraging the scalability and performance benefits of the Arm architecture in Google Cloud. 
+ +To learn more about Google Axion, refer to the [Introducing Google Axion Processors, our new Arm-based CPUs](https://cloud.google.com/blog/products/compute/introducing-googles-new-arm-based-cpu) blog. + +## Learn about Ruby on Rails + +Ruby on Rails (Rails) is an open-source, server-side web application framework written in Ruby. + +It allows developers to build database-backed web applications quickly using convention over configuration, MVC (Model-View-Controller) architecture, and built-in tools for routing, database migrations, and view rendering. + +Rails is widely used for web applications, APIs, and full-stack development projects. Learn more from the [Ruby on Rails official website](https://rubyonrails.org/) and the [Ruby on Rails Guides](https://guides.rubyonrails.org/). + +## What you've accomplished and what's next + +Understanding these technologies sets the stage for deploying and optimizing Rails workloads on Arm-based cloud infrastructure. Now that you have the foundational context, you’re ready to set up your development environment and begin working with Ruby on Rails on Google Axion C4A instances. + +In the next section, you’ll install the required tools and configure your environment to run Rails applications on Arm-based Google Cloud VMs. This hands-on setup will help you build, test, and deploy Rails projects efficiently on Arm architecture. + +Continue to the installation steps to get started. + diff --git a/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/baseline.md b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/baseline.md new file mode 100644 index 0000000000..4d9e425529 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/baseline.md @@ -0,0 +1,337 @@ +--- +title: Set up Ruby on Rails baseline testing +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +Follow these steps to install PostgreSQL, connect it to a Ruby on Rails app, and verify everything works on a SUSE Arm64 Google Cloud C4A VM. + +## Install PostgreSQL and development headers + +Install PostgreSQL and its development headers on your SUSE system: + +```console +sudo zypper install postgresql-devel postgresql-server +``` + +This installs two packages: +- `postgresql-server` - the PostgreSQL database service +- `postgresql-devel` - development headers needed to compile the `pg` gem that connects Rails to PostgreSQL + +The development headers are essential because Rails uses the `pg` gem to communicate with PostgreSQL, and this gem needs to be compiled during installation. + +Start PostgreSQL and enable it to run at boot: + +```console +sudo systemctl start postgresql +sudo systemctl enable postgresql +systemctl status postgresql +``` +The output is similar to: + +```output +● postgresql.service - PostgreSQL database server + Loaded: loaded (/usr/lib/systemd/system/postgresql.service; enabled; vendor preset: disabled) + Active: active (running) since Tue 2025-11-04 21:25:59 UTC; 18s ago + Main PID: 26997 (postgres) + Tasks: 7 + CPU: 372ms + CGroup: /system.slice/postgresql.service + ├─ 26997 /usr/lib/postgresql15/bin/postgres -D /var/lib/pgsql/data + └─ ... (other postgres processes) +``` +If the Active state is running, PostgreSQL is ready. 
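+
+As an optional extra check, you can confirm that the server is accepting local connections with the `pg_isready` utility, which ships with the PostgreSQL client tools and should already be present after the install above:
+
+```console
+pg_isready -h localhost
+```
+If the server is up, the command reports that the host is accepting connections.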
+ +## Create a PostgreSQL user for Rails + +Create a dedicated PostgreSQL user for your Rails app: + +```console +sudo -u postgres psql -c "CREATE USER gcpuser WITH SUPERUSER PASSWORD 'your_password';" +``` +This command creates a user named `gcpuser` with superuser privileges. You’ll use this user in your Rails configuration. + +## Set environment variables + +Set environment variables for Rails to connect to PostgreSQL: + +```console +export PGUSER=gcpuser +export PGPASSWORD=your_password +export PGHOST=localhost +``` + +These variables tell Rails how to connect to your PostgreSQL database: +- `PGUSER` - the PostgreSQL username you created +- `PGPASSWORD` - the password for that user +- `PGHOST` - tells Rails to connect to the local database server + +## Create a new Rails app with PostgreSQL + +Now you'll create a Rails application configured to use PostgreSQL as its database. + +Generate a new Rails app: + +```console +rails new db_test_rubyapp -d postgresql +``` + +This command creates a new Rails application named `db_test_rubyapp` with PostgreSQL as the default database adapter. + +Navigate to your new app directory: + +```console +cd db_test_rubyapp +``` + +Install the required gems: + +```console +bundle install +``` + +The `bundle install` command downloads and installs all the gem dependencies listed in your `Gemfile`, including the `pg` gem that allows Rails to communicate with PostgreSQL. + +You now have a Rails application ready to connect to your PostgreSQL database. + +{{% notice Note %}} +Check `config/database.yml` and make sure `username` and `password` match your PostgreSQL user (`gcpuser`). +{{% /notice %}} + +## Update Rails database configuration + +Open `config/database.yml` and confirm the credentials: + +```console +sudo vi config/database.yml +``` +Set these fields: + +```output +default: &default + adapter: postgresql + encoding: unicode + username: gcpuser + password: your_password + host: localhost + pool: 5 + +development: + <<: *default +``` + +## Change the Authentication Method +By default, PostgreSQL on many Linux distributions (including SUSE) uses the ident authentication method for local connections. This method maps Linux system usernames directly to PostgreSQL roles. While convenient for local access, it prevents password-based authentication, which is necessary for Rails and most application connections. + +To allow Rails to connect using a username and password, change the authentication method in PostgreSQL’s configuration file `pg_hba.conf` from ident to md5: + + +```console +sudo vi /var/lib/pgsql/data/pg_hba.conf +``` +The file location `/var/lib/pgsql/data/pg_hba.conf` is the default data directory path for PostgreSQL on SUSE Linux. + +Find lines like the following in the file: + +```output +# IPv4 local connections: +host all all 127.0.0.1/32 ident +# IPv6 local connections: +host all all ::1/128 ident +``` +Change `ident` to `md5`: + +```output +# IPv4 local connections: +host all all 127.0.0.1/32 md5 +# IPv6 local connections: +host all all ::1/128 md5 +``` +Restart PostgreSQL: + +```console +sudo systemctl restart postgresql +``` + +Verify the change: +```console +sudo systemctl status postgresql +``` +The service should show as active (running). + +## Create and Initialize the Database +Once PostgreSQL is configured and Rails can authenticate, you can create your application’s development and test databases. +This step verifies that Rails is correctly connected to PostgreSQL and that the pg gem is working on your Arm64 environment. 
+
+Run the following command from inside your Rails app directory:
+```console
+rails db:create
+```
+The expected output is:
+
+```output
+Created database 'db_test_rubyapp_development'
+Created database 'db_test_rubyapp_test'
+```
+This output confirms that Rails successfully connected to the PostgreSQL service using the credentials from `config/database.yml` and created two new databases: one for development and one for testing.
+
+## Generate a Scaffold for Testing
+To verify your Ruby on Rails and PostgreSQL integration, you’ll create a small scaffold application.
+A scaffold is a Rails generator that automatically builds a model, controller, views, and database migration, allowing you to test CRUD (Create, Read, Update, Delete) operations quickly.
+
+For this example, you’ll create a simple Task Tracker app that manages tasks with titles and due dates.
+
+Run the following command inside your Rails project directory:
+
+```console
+rails generate scaffold task title:string due_date:date
+```
+This command generates a model, controller, views, and migration for tasks.
+
+Apply the migration:
+
+```console
+rails db:migrate
+```
+You’ll see output similar to:
+
+```output
+== 20251006101717 CreateTasks: migrating ======================================
+-- create_table(:tasks)
+   -> 0.0127s
+== 20251006101717 CreateTasks: migrated (0.0128s) =============================
+```
+
+## Verify the tasks table in PostgreSQL
+
+To check that the `tasks` table exists, connect to PostgreSQL as the database superuser and inspect the table structure:
+
+```console
+sudo -u postgres psql
+```
+
+In the PostgreSQL shell, connect to your Rails development database and examine the tasks table:
+
+```console
+\c db_test_rubyapp_development
+\d tasks
+\q
+```
+
+This sequence of commands does the following:
+- `sudo -u postgres psql` → Launches the PostgreSQL shell as the superuser `postgres`.
+- `\c db_test_rubyapp_development` → Connects to the Rails app's development database.
+- `\d tasks` → Displays the schema (columns and types) of the `tasks` table.
+- `\q` → Exits the PostgreSQL shell.
+
+You should see output similar to:
+```output
+psql (15.10)
+Type "help" for help.
+
+postgres=# \c db_test_rubyapp_development
+You are now connected to database "db_test_rubyapp_development" as user "postgres".
+db_test_rubyapp_development=# \d tasks
+                                        Table "public.tasks"
+   Column   |              Type              | Collation | Nullable |              Default
+------------+--------------------------------+-----------+----------+-----------------------------------
+ id         | bigint                         |           | not null | nextval('tasks_id_seq'::regclass)
+ title      | character varying              |           |          |
+ due_date   | date                           |           |          |
+ created_at | timestamp(6) without time zone |           | not null |
+ updated_at | timestamp(6) without time zone |           | not null |
+Indexes:
+    "tasks_pkey" PRIMARY KEY, btree (id)
+```
+
+## Configure Google Cloud firewall to allow port 3000
+
+Your Rails app runs on port 3000 by default. To access it from your browser, you need to configure Google Cloud's firewall to allow incoming connections on this port.
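+
+If you prefer the command line, you can create an equivalent rule with the gcloud CLI. This is a sketch that assumes the default VPC network and reuses the rule name from the console steps below:
+
+```console
+gcloud compute firewall-rules create allow-3000-ingress \
+  --network=default \
+  --direction=INGRESS \
+  --allow=tcp:3000 \
+  --source-ranges=0.0.0.0/0
+```
+To use the Google Cloud console instead, follow the steps below.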
+
+Navigate to the firewall settings in your Google Cloud Console:
+
+- Open the **Navigation menu** (hamburger icon in the top left)
+- Go to **VPC network** → **Firewall**
+- Click **Create Firewall Rule**
+
+![Google Cloud Console navigation menu showing VPC network section expanded with Firewall option highlighted, displaying the standard Google Cloud interface with blue sidebar navigation and white background alt-text#center](images/firewall1.png "Create firewall rule")
+
+Fill in the details as shown below:
+
+- Provide a **name** for the rule (for example, `allow-3000-ingress`).
+
+![Google Cloud Console firewall rule creation form showing Name field with allow-3000-ingress entered, Direction of traffic set to Ingress, Action set to Allow, Targets dropdown showing All instances in the network option, and Source IP ranges field containing 0.0.0.0/0 for unrestricted access, displayed in the standard white Google Cloud interface with blue accent colors alt-text#center](images/firewall2.png "Allow-3000-ingress ")
+
+- Set **Direction of Traffic** to **Ingress**.
+
+- Set **Target** to **All Instances in the network**. You can also select **Specific Tags**.
+
+- Set the **Source IPv4 range** to `0.0.0.0/0` for global access.
+
+![Google Cloud Console firewall rule creation form showing Direction of traffic field set to Ingress with a dropdown menu, Target field displaying All instances in the network option, and Source IP ranges field visible below, all within the standard Google Cloud interface featuring white background and blue accent elements alt-text#center](images/firewall3.png "Setting the target")
+
+In the **Protocols and ports** section, select **TCP** and enter `3000` in the port field:
+
+![Google Cloud Console firewall rule configuration page showing Protocols and ports section with TCP checkbox selected and port 3000 entered in the Specified ports field, displaying Allow on match radio button selected, with clean white interface and blue Google Cloud styling alt-text#center](images/firewall4.png "Protocols and ports")
+
+- Select **Create**. Your firewall rule is created and appears in the Firewall policies page:
+
+![Google Cloud Console Firewall Policies page showing a list of firewall rules including the newly created allow-3000-ingress rule with status enabled, displaying rule names, directions, priorities, and actions in a clean white interface with blue Google Cloud branding alt-text#center](images/firewall5.png "Create Firewall rule")
+
+## OS firewall (firewalld) on SUSE
+Once done, go back to your VM and install FirewallD:
+```console
+sudo zypper install firewalld
+```
+Now start FirewallD and run the commands to allow port 3000:
+
+```console
+sudo systemctl start firewalld
+sudo systemctl enable firewalld
+sudo firewall-cmd --permanent --add-port=3000/tcp
+sudo firewall-cmd --reload
+```
+
+## Start Rails
+Now that port 3000 is allowed in your VM’s ingress firewall rules, you can start the Rails server using the following command:
+
+```console
+rails server -b 0.0.0.0
+```
+This command lets you access Rails from your browser using the VM’s external IP.
+
+## Access the Rails application
+Open a web browser on your local machine and enter the following URL in the address bar:
+
+```
+http://[YOUR_VM_EXTERNAL_IP]:3000
+```
+- Replace `[YOUR_VM_EXTERNAL_IP]` with the public IP of your GCP VM.
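+
+If you don't know the external IP of your VM, you can look it up with gcloud. The instance name and zone below are placeholders for your own values:
+
+```console
+gcloud compute instances describe YOUR_VM_NAME \
+  --zone=YOUR_ZONE \
+  --format='get(networkInterfaces[0].accessConfigs[0].natIP)'
+```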
+ +You will see a Rails welcome page in your browser if everything is set up correctly, as shown below: + +![Rails default welcome page displaying Ruby on Rails framework logo with green and red styling, welcome message, and navigation links for About your application environment, getting started guide, and Rails documentation on a clean white background alt-text#center](images/rails-web.png "Ruby/Rails welcome page") + +With port 3000 reachable and the welcome page loading, your Rails stack on SUSE Arm64 (C4A Axion) is verified end-to-end and you can proceed to benchmarking. + +## What you've accomplished + +You set up a Ruby on Rails app with PostgreSQL on a Google Cloud C4A Arm-based VM running SUSE Linux. You installed and configured PostgreSQL, created a database user, connected Rails, verified connectivity, generated a scaffold, and made your app accessible over the network. Your Rails stack is now ready for benchmarking and performance testing on Arm. diff --git a/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/benchmarking.md b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/benchmarking.md new file mode 100644 index 0000000000..fb6df489c3 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/benchmarking.md @@ -0,0 +1,140 @@ +--- +title: Benchmark Ruby on Rails +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + + +## Overview +In this section you will benchmark Ruby on Rails using Ruby’s built-in `Benchmark` library to measure execution time for database inserts, queries, and CPU computations on GCP SUSE VMs, providing insights into performance metrics and bottlenecks. + +## Locate the Rails app folder +Navigate into the folder of your Rails application. This is where Rails expects your application code, models, and database configurations to be located. All commands related to your app should be run from this folder. + +```console +cd ~/db_test_rubyapp +```` + +## Create the benchmark file + +Create a new Ruby file called `benchmark.rb` to measure your Rails application's performance: + +```console +vi benchmark.rb +``` + +This file contains the benchmarking code that tests different aspects of your application's performance. +Copy the following code into `benchmark.rb`. This code measures three different aspects of your Rails application's performance: + + +```ruby +require 'benchmark' + +n = 1000 + +Benchmark.bm do |x| + x.report("DB Insert:") do + n.times do + Task.create(title: "Benchmark Task", due_date: Date.today) + end + end + + x.report("DB Query:") do + n.times do + Task.where(title: "Benchmark Task").to_a + end + end + + x.report("Computation:") do + n.times do + (1..10_000).reduce(:+) + end + end +end +``` +This benchmarking script tests three key areas of your Rails application's performance: + +- The `require 'benchmark'` statement loads Ruby's built-in benchmarking library, which provides precise timing measurements for code execution. +- The variable `n = 1000` sets how many times each test runs - you can adjust this number to simulate lighter or heavier workloads depending on your testing needs. +- The `Benchmark.bm` method creates a benchmarking block that measures and reports the performance of different tasks. Within this block, three different tests run to evaluate your application: + + - The DB Insert test creates 1,000 new `Task` records in your PostgreSQL database. 
This measures how efficiently your application can write data, which is crucial for understanding performance during high-volume data entry operations. + + - The DB Query test retrieves those same `Task` records from the database. This measurement shows how quickly your application can read data, helping you understand performance during data-heavy read operations like report generation or search functionality. + + - The Computation test performs a mathematical calculation (summing numbers 1 through 10,000) repeatedly without any database interaction. This gives you a baseline for pure CPU performance, showing how your application handles processing-intensive tasks that don't involve external resources. + +This code gives you a basic understanding of how your Rails app performs under different types of workloads. + +## Run the benchmark inside Rails +Now that your benchmark file is ready, run it within the Rails environment using the following command: + +```console +rails runner benchmark.rb +``` +`rails runner` runs any Ruby script in the context of your Rails application. + +It automatically loads your Rails environment, including: + - All models (like `Task`) + - Database connections + - Configuration and dependencies + +This ensures that your benchmark can interact with the PostgreSQL database through ActiveRecord, rather than running as a plain Ruby script. + +You should see output similar to: + +```output + user system total real +DB Insert: 2.271645 0.050236 2.321881 ( 2.721631) +DB Query: 3.379849 0.009345 3.389194 ( 3.389613) +Computation: 0.410907 0.000000 0.410907 ( 0.410919) +``` +## Interpret the benchmark results + +The output shows four different timing measurements that help you understand where your application spends its time. + +- The user time measures how long your Ruby code actually ran on the CPU. This represents the pure processing time for your application logic, calculations, and Ruby operations. + +- The system time tracks how long your application spent waiting for system-level operations like database queries, file I/O, and network requests. Higher system time usually indicates bottlenecks in external resources. + +- The total time simply adds user and system time together, giving you the complete CPU processing time your application consumed. + +- The real time shows the actual wall-clock time that passed from start to finish. This includes everything: CPU processing, waiting for the database to respond, network delays, and any other factors that made your application pause. Real time is often higher than total time because your application might wait for resources that are busy with other tasks. + +When real time significantly exceeds total time, it typically indicates that your application is spending considerable time waiting for external resources rather than actively processing data. + +## Benchmark summary on Arm64 + +Here are the performance results from running the benchmark on a `c4a-standard-4` (4 vCPU, 16 GB memory) Arm64 VM in GCP with SUSE: + +| Task | User Time | System Time | Total Time | Real Time | +|------|-----------|-------------|------------|-----------| +| DB Insert | 2.27 sec | 0.05 sec | 2.32 sec | 2.72 sec | +| DB Query | 3.38 sec | 0.01 sec | 3.39 sec | 3.39 sec | +| Computation | 0.41 sec | 0.00 sec | 0.41 sec | 0.41 sec | + +## What these results tell you + +- Database operations (insert and query) take significantly longer than pure computation, with queries being the slowest operation. 
+- System time is minimal across all tasks, indicating efficient system resource usage on Arm64. +- Real time closely matches total time for most operations, showing minimal waiting for external resources. +- Computation tasks run very efficiently, demonstrating strong CPU performance on Axion processors. + +## Key takeaways + +When you analyze the benchmarking results, you'll notice several important patterns on Google Cloud Axion C4A Arm-based instances: + +- Consistent performance - Ruby and PostgreSQL are both natively optimized for Arm, which provides stable and predictable latency across different workloads. +- Database optimization opportunities - the results show that database I/O remains the primary bottleneck. You can improve database-heavy performance using techniques such as: +- Query caching +- Connection pooling +- Asynchronous queries +- Strong compute performance - Axion's Arm cores combined with Ruby's YJIT compiler demonstrate excellent CPU utilization for compute-intensive tasks that don't rely heavily on I/O operations. + +Ruby on Rails runs efficiently on Google Cloud's Axion-based C4A Arm64 instances, making them a solid choice for Rails applications. + +## What you've accomplished + +You’ve benchmarked your Ruby on Rails application on a Google Cloud C4A Arm-based VM using Ruby’s built-in Benchmark library. You measured database insert and query speeds, as well as CPU computation performance, and interpreted the results to identify optimization opportunities. With these insights, you’re equipped to further tune your Rails workloads for Arm and confidently deploy performance-sensitive applications on Arm-based cloud infrastructure. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/firewall1.png b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/firewall1.png new file mode 100644 index 0000000000..63efb19dc3 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/firewall1.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/firewall2.png b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/firewall2.png new file mode 100644 index 0000000000..5419beb2d3 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/firewall2.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/firewall3.png b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/firewall3.png new file mode 100644 index 0000000000..3f9b15187d Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/firewall3.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/firewall4.png b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/firewall4.png new file mode 100644 index 0000000000..20212ec627 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/firewall4.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/firewall5.png b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/firewall5.png new file mode 100644 index 0000000000..1395e748de Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/firewall5.png differ diff --git 
a/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/gcp-vm.png b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/gcp-vm.png new file mode 100644 index 0000000000..0d1072e20d Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/gcp-vm.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/rails-web.png b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/rails-web.png new file mode 100644 index 0000000000..a41f6f85be Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/images/rails-web.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/installation.md b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/installation.md new file mode 100644 index 0000000000..7d7661e4d0 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/installation.md @@ -0,0 +1,112 @@ +--- +title: Install Ruby on Rails on SUSE Linux +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this section, you'll install Ruby, Rails, and essential supporting tools on your Google Cloud C4A instance running SUSE Enterprise Linux. The steps ensure your environment is ready to build, deploy, and optimize Ruby on Rails applications on Arm-based infrastructure. + +## Update system packages +Start by updating your system packages to ensure you have the latest security patches and development tools needed for Ruby installation: + +```console +sudo zypper update +``` +## Install required dependencies +Install essential development libraries and tools that Ruby needs to compile and run properly on your SUSE Arm64 system: + +```console +sudo zypper install git curl gcc make patch libyaml-devel libffi-devel libopenssl-devel readline-devel zlib-devel gdbm-devel bzip2 bzip2-devel +``` + +## Install rbenv +`rbenv` is a lightweight Ruby version manager that enables you to install and manage multiple Ruby versions on the same system. This is particularly useful for developers running different Rails applications that require specific Ruby versions. + +Use rbenv to manage multiple Ruby versions and ensure compatibility across different Rails projects: + +```console +git clone https://github.com/rbenv/rbenv.git ~/.rbenv +echo 'export PATH="$HOME/.rbenv/bin:$PATH"' >> ~/.bashrc +echo 'eval "$(rbenv init -)"' >> ~/.bashrc +source ~/.bashrc +``` +These commands configure rbenv for your environment by doing the following: +- Cloning the rbenv repository to your home directory. +- Adding rbenv to your PATH so the shell can find it. +- Configuring rbenv to initialize automatically in new shell sessions. +## Install ruby-build plugin +Install the `ruby-build` plugin to enable rbenv to compile and install Ruby versions from source: + +```console +git clone https://github.com/rbenv/ruby-build.git "$(rbenv root)"/plugins/ruby-build +``` + +This plugin adds the `rbenv install` command, which you'll use to download, compile, and install specific Ruby versions optimized for your Arm64 architecture. +## Install Ruby + +Now that rbenv and ruby-build are configured, install Ruby 3.4.6 and set it as your default version: + +```console +rbenv install 3.4.6 +rbenv global 3.4.6 +ruby -v +``` + +This process accomplishes several tasks: +- Downloads and compiles Ruby 3.4.6 from source, optimized for your Arm64 architecture. 
+- Sets Ruby 3.4.6 as the default version system-wide for your user account. +- Verifies the installation by displaying the active Ruby version. + +The compilation process can take several minutes as Ruby builds natively for your Arm processor. + +You should see output similar to: +```output +ruby 3.4.6 (2025-09-16 revision dbd83256b1) +PRISM [aarch64-linux] +``` +{{% notice Note %}} +Ruby 3.4.0 and later introduced major performance enhancements, especially in YJIT (Yet Another Ruby JIT), Ruby’s Just-In-Time compiler. These enhancements are particularly beneficial for Arm architectures, as YJIT has been optimized to deliver better performance on such platforms. To leverage these improvements, upgrade to Ruby 3.4.0 or later. +For further information, see the [Ruby 3.4.0 release notes](https://www.ruby-lang.org/en/news/2024/12/25/ruby-3-4-0-released/). + +The [Arm Ecosystem Dashboard](https://developer.arm.com/ecosystem-dashboard/) recommends Ruby version 3.4.0. +{{% /notice %}} + +## Install Bundler +Bundler is Ruby’s dependency management tool. It ensures that all required gems (libraries) for your Rails application are installed and consistent across development, test, and production environments. + +Install Bundler globally: + +```console +gem install bundler +``` +This command installs Bundler for the active Ruby version managed by `rbenv`. +## Install Rails + +Rails is a web framework for Ruby that makes building web applications faster and easier. Install Rails to start creating web applications on your Arm-based system: + +```console +gem install rails +``` + +This command downloads and installs the latest version of Rails, along with all its dependencies, optimized for your Arm64 architecture. +## Verify your Rails installation + +Check that Rails installed correctly and is accessible in your environment: + +```console +rails -v +``` + +The output is similar to: +```output +Rails 8.0.3 +``` + +This confirms Rails is ready to use for building web applications on your Arm-based system. + +## What you've accomplished + +You’ve completed the installation of Ruby and Rails on your Google Cloud C4A Arm-based SUSE Linux VM. Your environment is now ready for Arm-native Rails development, with all dependencies, version management, and performance enhancements in place. You’re prepared to start building, testing, and optimizing Ruby on Rails applications on Arm infrastructure. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/instance.md b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/instance.md new file mode 100644 index 0000000000..b6b0e89cb2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/ruby-on-rails/instance.md @@ -0,0 +1,35 @@ +--- +title: Create a Google Axion C4A Arm virtual machine on GCP +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this section, you will learn how to provision a Google Axion C4A Arm virtual machine on Google Cloud Platform (GCP) using the `c4a-standard-4` (4 vCPUs, 16 GB memory) machine type in Google Cloud Console. + +{{% notice Note %}} +For support on GCP setup, see the Learning Path [Getting started with Google Cloud Platform](https://learn.arm.com/learning-paths/servers-and-cloud-computing/csp/google/). 
+{{% /notice %}} + +## Provision a Google Axion C4A Arm VM in Google Cloud Console + +To create a virtual machine based on the C4A instance type: +- Navigate to [Google Cloud Console](https://console.cloud.google.com/). +- Go to **Compute Engine > VM Instances** and select **Create Instance**. +- Under **Machine configuration**: + - Populate fields such as **Instance name**, **Region**, and **Zone**. + - Set **Series** to `C4A`. + - Select `c4a-standard-4` for machine type, as shown below: + + ![Google Cloud Console machine configuration interface showing C4A series selected and c4a-standard-4 machine type highlighted with 4 vCPUs and 16 GB memory specifications alt-text #center](images/gcp-vm.png "Creating a Google Axion C4A Arm virtual machine in Google Cloud Console") + +- Under **OS and Storage**, select **Change**, then select an Arm64-based OS image. For this Learning Path, use **SUSE Linux Enterprise Server**. Pick the preferred version for your operating system. Ensure you select the **Arm image** variant. Select **Select**. +- Under **Networking**, enable **Allow HTTP traffic**. +- Select **Create** to launch the instance. + +## What you've accomplished + +You've successfully created a Google Axion C4A Arm virtual machine on GCP. Your Arm-based cloud environment is ready for Ruby on Rails development. diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/1_Overview.md b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/1_Overview.md new file mode 100644 index 0000000000..c26a391eaa --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/1_Overview.md @@ -0,0 +1,19 @@ +--- +title: Overview +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Linux kernel profiling with Arm Streamline + +Performance tuning is not limited to user-space applications—kernel modules can also benefit from careful analysis. [Arm Streamline](https://developer.arm.com/Tools%20and%20Software/Streamline%20Performance%20Analyzer) is a powerful software profiling tool that helps developers understand performance bottlenecks, hotspots, and memory usage, even inside the Linux kernel. This learning path explains how to use Arm Streamline to profile a simple kernel module. + +### Why profile a kernel module? + +Kernel modules often operate in performance-critical paths, such as device drivers or networking subsystems. Even a small inefficiency in a module can affect the overall system performance. Profiling enables you to: + +- Identify hotspots (functions consuming most CPU cycles) +- Measure cache and memory behavior +- Understand call stacks for debugging performance issues diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/2_build_kernel_image.md b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/2_build_kernel_image.md new file mode 100644 index 0000000000..03860d4453 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/2_build_kernel_image.md @@ -0,0 +1,71 @@ +--- +title: Build Linux image +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Build a debuggable kernel image + +For this learning path we will be using [Buildroot](https://github.com/buildroot/buildroot) to build a Linux image for Raspberry Pi 3B+ with a debuggable Linux kernel. We will profile Linux kernel modules built out-of-tree and Linux device drivers built in the Linux source code tree. + +1. 
Clone the Buildroot Repository and initialize the build system with the default configurations. + + ```bash + git clone https://github.com/buildroot/buildroot.git + cd buildroot + make raspberrypi3_64_defconfig + make menuconfig + make -j$(nproc) + ``` + +2. Change Buildroot configurations to enable debugging symbols and SSH access. + + ```plaintext + Build options ---> + [*] build packages with debugging symbols + gcc debug level (debug level 3) + [*] build packages with runtime debugging info + gcc optimization level (optimize for debugging) ---> + + System configuration ---> + [*] Enable root login with password + (****) Root password # Choose root password here + + Kernel ---> + Linux Kernel Tools ---> + [*] perf + + Target packages ---> + Networking applications ---> + [*] openssh + [*] server + [*] key utilities + ``` + + You might also need to change your default `sshd_config` file according to your network settings. To do that, you need to modify System configuration→ Root filesystem overlay directories to add a directory that contains your modified `sshd_config` file. + +3. By default the Linux kernel images are stripped so we will need to make the image debuggable as we'll be using it later. + + ```bash + make linux-menuconfig + ``` + + ```plaintext + Kernel hacking ---> + -*- Kernel debugging + Compile-time checks and compiler options ---> + Debug information (Rely on the toolchain's implicit default DWARF version) + [ ] Reduce debugging information #un-check + ``` + +4. Now we can build the Linux image and flash it to the the SD card to run it on the Raspberry Pi. + + ```bash + make -j$(nproc) + ``` + +It will take some time to build the Linux image. When it completes, the output will be in `/output/images/sdcard.img` +For details on flashing the SD card image, see [this helpful article](https://www.ev3dev.org/docs/tutorials/writing-sd-card-image-ubuntu-disk-image-writer/). +Now that we have a target running Linux with a debuggable kernel image, we can start writing our kernel module that we want to profile. diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/3_OOT_module.md b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/3_OOT_module.md new file mode 100644 index 0000000000..420bb00662 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/3_OOT_module.md @@ -0,0 +1,252 @@ +--- +title: Build out-of-tree kernel module +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Creating the Linux Kernel Module + +We will now learn how to create an example Linux kernel module (Character device) that demonstrates a cache miss issue caused by traversing a 2D array in column-major order. This access pattern is not cache-friendly, as it skips over most of the neighboring elements in memory during each iteration. + +To build the Linux kernel module, start by creating a new directory—We will call it **example_module**—in any location of your choice. Inside this directory, add two files: `mychardrv.c` and `Makefile`. 
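+
+For example, assuming you keep the module sources under your home directory, a minimal sketch of the layout looks like this (the directory location and name are your choice):
+
+```bash
+mkdir -p ~/example_module
+cd ~/example_module
+touch mychardrv.c Makefile
+```
+
+The contents of the two files are shown below.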
+
+**Makefile**
+
+```makefile
+obj-m += mychardrv.o
+BUILDROOT_OUT := /opt/rpi-linux/buildroot/output # Change this to your buildroot output directory
+KDIR := $(BUILDROOT_OUT)/build/linux-custom
+CROSS_COMPILE := $(BUILDROOT_OUT)/host/bin/aarch64-buildroot-linux-gnu-
+ARCH := arm64
+
+all:
+	$(MAKE) -C $(KDIR) M=$(PWD) ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) modules
+
+clean:
+	$(MAKE) -C $(KDIR) M=$(PWD) clean
+```
+
+{{% notice Note %}}
+Change **BUILDROOT_OUT** to the correct Buildroot output directory on your host machine.
+{{% /notice %}}
+
+**mychardrv.c**
+
+```c
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cdev.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+// Using fixed major and minor numbers just for demonstration purposes.
+// Major number 42 is for demo/sample uses according to
+// https://www.kernel.org/doc/Documentation/admin-guide/devices.txt
+#define MAJOR_VERSION_NUM 42
+#define MINOR_VERSION_NUM 0
+#define MODULE_NAME "mychardrv"
+#define MAX_INPUT_LEN 64
+
+static struct cdev my_char_dev;
+
+/**
+ * @brief Traverse a 2D matrix and calculate the sum of its elements.
+ *
+ * @size: The size of the matrix (number of rows and columns).
+ *
+ * This function allocates a 2D matrix of integers, initializes it with the sum
+ * of its indices, and then calculates the sum of its elements by accessing them
+ * in a cache-unfriendly column-major order.
+ *
+ * Return: 0 on success, or -ENOMEM if memory allocation fails.
+ */
+int char_dev_cache_traverse(long size) {
+  int i, j;
+  long sum = 0;
+
+  int **matrix;
+
+  // Allocate rows
+  matrix = kmalloc_array(size, sizeof(int *), GFP_KERNEL);
+  if (!matrix)
+    return -ENOMEM;
+
+  // Allocate columns and initialize matrix
+  for (i = 0; i < size; i++) {
+    matrix[i] = kmalloc_array(size, sizeof(int), GFP_KERNEL);
+    if (!matrix[i]) {
+      for (int n = 0; n < i; n++) {
+        kfree(matrix[n]);
+      }
+      kfree(matrix);
+      return -ENOMEM;
+    }
+
+    for (j = 0; j < size; j++)
+      matrix[i][j] = i + j;
+  }
+
+  // Access in cache-UNFRIENDLY column-major order
+  for (j = 0; j < size; j++) {
+    for (i = 0; i < size; i++) {
+      sum += matrix[i][j];
+    }
+  }
+
+  pr_info("Sum: %ld\n", sum);
+
+  // Free memory
+  for (i = 0; i < size; i++)
+    kfree(matrix[i]);
+  kfree(matrix);
+
+  return 0;
+}
+
+/**
+ * @brief Gets the size of the 2D array to be created from user space.
+ * + */ +static ssize_t char_dev_write(struct file *file, const char *buff, + size_t length, loff_t *offset) { + (void)file; + (void)offset; + + ssize_t ret = 0; + char *kbuf; + long size_value; + + // Allocate kernel buffer + kbuf = kmalloc(MAX_INPUT_LEN, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + // copy data from user space to kernel space + if (copy_from_user(kbuf, buff, length)) { + ret = -EFAULT; + goto out; + } + kbuf[length] = '\0'; + + // Convert string to long (Base 10) + ret = kstrtol(kbuf, 10, &size_value); + if (ret) + goto out; + + // Call cache traversal function + ret = char_dev_cache_traverse(size_value); + if (ret) + goto out; + + ret = length; + +out: + kfree(kbuf); + return ret; +} + +static int char_dev_open(struct inode *node, struct file *file) { + (void)file; + pr_info("%s is open - Major(%d) Minor(%d)\n", MODULE_NAME, + MAJOR(node->i_rdev), MINOR(node->i_rdev)); + return 0; +} + +static int char_dev_release(struct inode *node, struct file *file) { + (void)file; + pr_info("%s is released - Major(%d) Minor(%d)\n", MODULE_NAME, + MAJOR(node->i_rdev), MINOR(node->i_rdev)); + return 0; +} + +// File operations structure +static const struct file_operations dev_fops = {.owner = THIS_MODULE, + .open = char_dev_open, + .release = char_dev_release, + .write = char_dev_write}; + +static int __init char_dev_init(void) { + int ret; + // Allocate Major number + ret = register_chrdev_region(MKDEV(MAJOR_VERSION_NUM, MINOR_VERSION_NUM), 1, + MODULE_NAME); + if (ret < 0) + return ret; + + // Initialize cdev structure and add it to kernel + cdev_init(&my_char_dev, &dev_fops); + ret = cdev_add(&my_char_dev, MKDEV(MAJOR_VERSION_NUM, MINOR_VERSION_NUM), 1); + + if (ret < 0) { + unregister_chrdev_region(MKDEV(MAJOR_VERSION_NUM, MINOR_VERSION_NUM), 1); + return ret; + } + + return ret; +} + +static void __exit char_dev_exit(void) { + cdev_del(&my_char_dev); + unregister_chrdev_region(MKDEV(MAJOR_VERSION_NUM, MINOR_VERSION_NUM), 1); +} + +module_init(char_dev_init); +module_exit(char_dev_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yahya Abouelseoud"); +MODULE_DESCRIPTION("A simple char driver with cache misses issue"); +``` + +The module above receives the size of a 2D array as a string through the `char_dev_write()` function, converts it to an integer, and passes it to the `char_dev_cache_traverse()` function. This function then creates the 2D array, initializes it with simple data, traverses it in a column-major (cache-unfriendly) order, computes the sum of its elements, and prints the result to the kernel log. + +## Building and Running the Kernel Module + +1. To compile the kernel module, run make inside the example_module directory. This will generate the output file `mychardrv.ko`. + +2. Transfer the .ko file to the target using scp command and then insert it using insmod command. After inserting the module, we create a character device node using mknod command. Finally, we can test the module by writing a size value (e.g., 10000) to the device file and measuring the time taken for the operation using the `time` command. + + ```bash + scp mychardrv.ko root@:/root/ + ``` + + {{% notice Note %}} + Replace \ with your own target IP address + {{% /notice %}} + +3. 
To run the module on the target, we need to run the following commands on the target: + + ```bash + ssh root@ + + #The following commands should be running on target device + + insmod /root/mychardrv.ko + mknod /dev/mychardrv c 42 0 + ``` + + {{% notice Note %}} + 42 and 0 are the major and minor number we chose in our module code above + {{% /notice %}} + +4. Now if you run dmesg you should see something like: + + ```log + [12381.654983] mychardrv is open - Major(42) Minor(0) + ``` + +5. To make sure it's working as expected you can use the following command: + + ```bash { output_lines = "2-4" } + time echo '10000' > /dev/mychardrv + # real 0m 38.04s + # user 0m 0.00s + # sys 0m 38.03s + ``` + + The command above passes 10000 to the module, which specifies the size of the 2D array to be created and traversed. The **echo** command takes a long time to complete (around 38 seconds) due to the cache-unfriendly traversal implemented in the `char_dev_cache_traverse()` function. + +With the kernel module built, the next step is to profile it using Arm Streamline. We will use it to capture runtime behavior, highlight performance bottlenecks, and help identifying issues such as the cache-unfriendly traversal in our module. diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/4_sl_profile_OOT.md b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/4_sl_profile_OOT.md new file mode 100644 index 0000000000..a5950cd2ac --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/4_sl_profile_OOT.md @@ -0,0 +1,93 @@ +--- +title: Profile out-of-tree kernel module +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Use Streamline to profile an out-of-tree kernel module + +Arm Streamline is a tool that uses sampling to measure system performance. Instead of recording every single event (like instrumentation does, which can slow things down), it takes snapshots of hardware counters and system registers at regular intervals. This gives a statistical view of how the system runs, while keeping the overhead small. + +Streamline tracks many performance metrics such as CPU usage, execution cycles, memory access, cache hits and misses, and GPU activity. By putting this information together, it helps developers see how their code is using the hardware. Captured data is presented on a timeline, so you can see how performance changes as your program runs. This makes it easier to notice patterns, find bottlenecks, and link performance issues to specific parts of your application. + +For more details about Streamline and its features, refer to the [Streamline user guide](https://developer.arm.com/documentation/101816/latest/Getting-started-with-Streamline/Introduction-to-Streamline). + +Streamline is included with Arm Performance Studio, which you can download and use for free from [Arm Performance Studio downloads](https://developer.arm.com/Tools%20and%20Software/Arm%20Performance%20Studio#Downloads). + +For step-by-step guidance on setting up Streamline on your host machine, follow the installation instructions provided in [Streamline installation guide](https://developer.arm.com/documentation/101816/latest/Getting-started-with-Streamline/Install-Streamline). + +### Pushing Gator to the Target and Making a Capture + +Once Streamline is installed on the host machine, you can capture trace data of our Linux kernel module. + +1. 
To communicate with the target, Streamline requires a daemon, called **gatord**, to be installed and running on the target. gatord must be running before you can capture trace data. There are two pre-built gatord binaries available in Streamline's install directory, one for *Armv7 (AArch32)* and one for *Armv8 or later(AArch64)*. Push **gatord** to the target device using **scp**. + + ```bash + scp /streamline/bin/linux/arm64/gatord root@:/root/gatord + # use arm instead of arm64, if your are using an AArch32 target + ``` + +2. Run gator on the target to start system-wide capture mode. + + ```bash + /root/gatord -S yes -a + ``` + + ![Gator command#center](./images/img01_gator_cmd.png) + +3. Open Streamline and choose *TCP mode*. + +4. Enter your target hostname or IP address. +![Streamline TCP settings#center](./images/img02_streamline_tcp.png) + +5. Click on *Select counters* to open the counter configuration dialogue, to learn more about counters and how to configure them please refer to [counter configuration guide](https://developer.arm.com/documentation/101816/latest/Capture-a-Streamline-profile/Counter-Configuration) + +6. Add `L1 data Cache: Refill` and `L1 Data Cache: Access` and enable Event-Based Sampling (EBS) for both of them as shown in the screenshot and click *Save*. + + {{% notice %}} + To learn more about EBS, please refer to [Streamline user guide](https://developer.arm.com/documentation/101816/9-7/Capture-a-Streamline-profile/Counter-Configuration/Setting-up-event-based-sampling) + {{% /notice %}} + + ![Counter configuration#center](./images/img03_counter_config.png) + +7. In the Command section, we will add the same shell command we used earlier to test our Linux module. + + ```bash + sh -c "echo 10000 > /dev/mychardrv" + ``` + + ![Streamline command#center](./images/img04_streamline_cmd.png) + +8. In the Capture settings dialog, select Add image, add your kernel module file `mychardrv.ko` and click Save. +![Capture settings#center](./images/img05_capture_settings.png) + +9. Start the capture and enter a name and location for the capture file. Streamline will start collecting data and the charts will show activity being captured from the target. +![Streamline timeline#center](./images/img06_streamline_timeline.png) + +### Analyze the capture and inspect the code + +Once the capture is stopped, Streamline automatically analyzes the collected data and provides insights to help identify performance issues and bottlenecks. This section describes how to view these insights, starting with locating the functions related to our kernel module and narrowing down to the exact lines of code that may be responsible for the performance problems. + +1. Open the *Functions tab*. In the counters list, select one of the counters we selected earlier in the counter configuration dialog, as shown: + +![Counter selection#center](./images/img07_select_datasource.png) + +2. In the Functions tab, observe that the function `char_dev_cache_traverse()` has the highest L1 Cache refill rate, which we already expected. + Also notice the Image name on the right, which is our module file name `mychardrv.ko`: + +![Functions tab#center](./images/img08_Functions_Tab.png) + +3. To view the call path of this function, right click on the function name and choose *Select in Call Paths*. + +4. You can now see the exact function that called `char_dev_cache_traverse()`. 
In the Locations column, notice that the function calls started in the userspace (echo command) and terminated in the kernel space module `mychardrv.ko`: +![Call paths tab#center](./images/img09_callpaths_tab.png) + +5. Since we compiled our kernel module with debug info, we will be able to see the exact code lines that are causing these cache misses. + To do so, double-click on the function name and the *Code tab* opens. This view shows you how much each code line contributed to the cache misses and in bottom half of the code view, you can also see the disassembly of these lines with the counter values of each assembly instruction: +![Code tab#center](./images/img10_code_tab.png) + +{{% notice Note %}} +You may need to configure path prefix substitution in the Code tab to view the source code correctly. For details on how to set this up and for more information about code analysis, please refer to [Streamline user guide](https://developer.arm.com/documentation/101816/latest/Analyze-your-capture/Analyze-your-code?lang=en) +{{% /notice %}} \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/5_inTree_kernel_driver.md b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/5_inTree_kernel_driver.md new file mode 100644 index 0000000000..cfa99ef04d --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/5_inTree_kernel_driver.md @@ -0,0 +1,60 @@ +--- +title: Build in-tree kernel driver +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Build an in-tree Linux kernel driver + +Now that we have learned how to build and profile an out-of-tree kernel module, we will move on to building a driver statically into the Linux kernel. We will then profile it by adding the kernel’s vmlinux file as an image in Streamline’s capture settings. This allows us to view function calls and call paths as before, and also inspect specific sections of the kernel code that may be contributing to performance issues. + +### Creating an in-tree simple character device driver + +We will use the same example character driver we used earlier `mychardrv` except that this time we will be statically linking it to the kernel. + +1. Go to your kernel source directory, in our case, it's located in Buildroot's output directory in `/output/build/linux-custom`. + +2. Copy the `mychardrv.c` file created earlier to `drivers/char` directory. + + ```bash + cd drivers/char + cp ./mychardrv.c + ``` + +3. Add the following configuration to the bottom of the `Kconfig` file to make the kernel configuration system aware of the the new driver we just added. + + ```plaintext + config MYCHAR_DRIVER + tristate "My Character Driver" + default y + help + A simple character device driver for testing. + endmenu + ``` + +4. We also need to modify the `Makefile` in the current directory to make it build the object file for `mychardrv.c`, so we'll add the following line to it. + + ```Makefile + obj-$(CONFIG_MYCHAR_DRIVER) += mychardrv.o + ``` + +### Rebuild and Run the Linux Image + +You can rebuild the Linux image simply by running the **make** command in your Buildroot directory. This rebuilds the Linux kernel including our new device driver and produce a debuggable `vmlinux` ELF file. 
+ +```bash +cd +make -j$(nproc) +``` + +To verify that our driver was compiled into the kernel, you can run the following command: + +```bash +find -iname "mychardrv.o" +``` + +This should return the full path of the object file produced from compiling our character device driver. + +Now you can flash the new `sdcard.img` file produced to your target's SD card. To learn how to flash the sdcard.img file to your SD card, you can look at [this helpful article](https://www.ev3dev.org/docs/tutorials/writing-sd-card-image-ubuntu-disk-image-writer/). This time our driver will be automatically loaded when Linux is booted. diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/6_sl_profile_inTree.md b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/6_sl_profile_inTree.md new file mode 100644 index 0000000000..18a729bf8c --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/6_sl_profile_inTree.md @@ -0,0 +1,28 @@ +--- +title: Profile in-tree kernel driver +weight: 7 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Use Streamline to profile an in-tree kernel driver + +Profiling in-tree drivers follows almost the same process as profiling an out-of-tree kernel module. The steps include: + +1. Transferring gator to the target device using scp. + +2. Launching Streamline, selecting TCP view, and entering the target’s IP or hostname. + +3. Setting up counters and enabling Event-Based Sampling (EBS). + +The main difference is that, instead of adding the kernel module’s object file as the capture image in Capture settings, we now use the Linux ELF file (vmlinux) generated by Buildroot. + +![Vmlinux capture settings#center](./images/img11_vmlinux_capture_settings.png) + +After clicking Save in Capture settings dialog, you can start the capture and analyze it as we did before. +![Vmlinux function tab#center](./images/img12_vmlinux_function_tab.png) + +Since we used vmlinux image we can view our driver functions as well as all other kernel functions that were sampled during our capture. +You can also view the full Call path of any sampled function within the kernel. +![Vmlinux call paths tab#center](./images/img13_vmlinux_callpaths_tab.png) diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/7_sl_SPE.md b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/7_sl_SPE.md new file mode 100644 index 0000000000..abb5729d4e --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/7_sl_SPE.md @@ -0,0 +1,28 @@ +--- +title: Using Streamline with Statistical Profiling Extension +weight: 8 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Using the Statistical Profiling Extension (SPE) for better analysis + +With periodic sampling, Streamline collects CPU performance data using hardware counters and software interrupts. Hardware counters only give totals, so you can’t see which exact instructions caused the events. At best, you can link the counts to a broad section of code. This makes it harder to pinpoint problems. Sampling the Program Counter (PC) or call stack is also limited, since software timers handle both sampling and unwinding. + +The Statistical Profiling Extension (SPE) removes these limits. It samples the PC in hardware, directly inside the CPU pipeline. This adds almost no overhead, so the sampling rate can be much higher. 
SPE also records extra details about each sampled instruction, giving a much clearer view of how the code runs. For more details on SPE and how it works in Streamline see [this blog post](https://community.arm.com/arm-community-blogs/b/tools-software-ides-blog/posts/introduction-to-statistical-profiling-support-in-streamline). + +To find out if your target supports SPE, please see [Streamline user guide](https://developer.arm.com/documentation/101816/9-7/Capture-a-Streamline-profile/Counter-Configuration/Configure-SPE-counters). + +### Profiling Kernel Module Using SPE + +To profile both in-tree and out-of-tree kernel modules, we can use the same setup steps as before. The only change is to add “Arm Statistical Profiling Extension” to the Events to Collect list in the Counter Configuration dialog. +![SPE counter selection#center](./images/img14_spe_select_counters.png) + +After saving the counter configurations, Click Start capture to begin data collection, then wait for Streamline to analyze results. + +To view SPE counter values, Select SPE in the data source drop-down in the Call paths, Functions, or Code view. + +As shown in the image, SPE provides much more data about the profiled code than Event-Based Sampling (EBS), which provides us with deep insights into the CPU performance bottlenecks with very low overhead. It's also possible to view or hide columns from the table in Call paths or Functions views by menu-clicking on the table header and choosing from the list of columns. + +![SPE function tab#center](./images/img15_spe_function_tab.gif) diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/8_summary.md b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/8_summary.md new file mode 100644 index 0000000000..ef91418e51 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/8_summary.md @@ -0,0 +1,12 @@ +--- +title: Summary +weight: 9 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- +## Summary + +In this learning path, we learned how to build and profile Linux kernel modules step by step. We started with an out-of-tree character driver that had a cache performance issue and then used Arm Streamline to spot where the problem was. Later, we tried the same idea with an in-tree driver and saw how profiling works with the full kernel. Although the example problem was simple, the same methods apply to complex, real-world drivers and scenarios. + +The key takeaway is that profiling isn’t just about making code faster—it’s about understanding how your code talks to the hardware. Streamline gives us a clear picture of what’s happening inside the CPU so we can write better, more efficient drivers. By learning to identify bottlenecks, you will be more confident in fixing them and avoiding common mistakes in kernel programming. diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/_index.md b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/_index.md new file mode 100644 index 0000000000..56f917249c --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/_index.md @@ -0,0 +1,63 @@ +--- +title: Profiling the Linux kernel with Arm Streamline + +draft: true +cascade: + draft: true + +minutes_to_complete: 60 + +who_is_this_for: Software developers and performance engineers interested in profiling Linux kernel performance. 
+ +learning_objectives: + - Understand the importance of profiling Linux kernel modules. + - Learn how to set up and use Arm Streamline for kernel profiling. + - Gain hands-on experience in profiling both out-of-tree and in-tree kernel modules. + - Learn to interpret profiling data to identify performance bottlenecks. + - Understand the benefits of using the Statistical Profiling Extension (SPE) for enhanced profiling. + +prerequisites: + - Basic understanding of Linux kernel development and module programming + - Arm-based Linux target device (such as a Raspberry Pi, BeagleBone, or similar board) with SSH access + - Host machine that meets [Buildroot system requirements](https://buildroot.org/downloads/manual/manual.html#requirement) + +author: Yahya Abouelseoud + +### Tags +skilllevels: Advanced +subjects: Performance and Architecture +armips: + - Cortex-A + - Neoverse +tools_software_languages: + - Arm Streamline + - Arm Performance Studio + - Linux kernel + - Performance analysis +operatingsystems: + - Linux + + + +further_reading: + - resource: + title: Streamline user guide + link: https://developer.arm.com/documentation/101816/latest/Capture-a-Streamline-profile/ + type: documentation + - resource: + title: Arm Performance Studio Downloads + link: https://developer.arm.com/Tools%20and%20Software/Streamline%20Performance%20Analyzer#Downloads + type: website + - resource: + title: Streamline video tutorial + link: https://developer.arm.com/Additional%20Resources/Video%20Tutorials/Arm%20Mali%20GPU%20Training%20-%20EP3-3 + type: website + + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img01_gator_cmd.png b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img01_gator_cmd.png new file mode 100644 index 0000000000..98b1042236 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img01_gator_cmd.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img02_streamline_tcp.png b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img02_streamline_tcp.png new file mode 100644 index 0000000000..25c334860e Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img02_streamline_tcp.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img03_counter_config.png b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img03_counter_config.png new file mode 100644 index 0000000000..c61ef7b6c4 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img03_counter_config.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img04_streamline_cmd.png b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img04_streamline_cmd.png new file mode 100644 index 0000000000..595e7a71fc Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img04_streamline_cmd.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img05_capture_settings.png b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img05_capture_settings.png new file mode 100644 index 0000000000..28788e96a7 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img05_capture_settings.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img06_streamline_timeline.png b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img06_streamline_timeline.png new file mode 100644 index 0000000000..a411bb1d5d Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img06_streamline_timeline.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img07_select_datasource.png b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img07_select_datasource.png new file mode 100644 index 0000000000..4c6231e82e Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img07_select_datasource.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img08_Functions_Tab.png b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img08_Functions_Tab.png new file mode 100644 index 0000000000..cd23986177 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img08_Functions_Tab.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img09_callpaths_tab.png 
b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img09_callpaths_tab.png new file mode 100644 index 0000000000..69d6eff093 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img09_callpaths_tab.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img10_code_tab.png b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img10_code_tab.png new file mode 100644 index 0000000000..78192a3cc5 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img10_code_tab.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img11_vmlinux_capture_settings.png b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img11_vmlinux_capture_settings.png new file mode 100644 index 0000000000..bb84649231 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img11_vmlinux_capture_settings.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img12_vmlinux_function_tab.png b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img12_vmlinux_function_tab.png new file mode 100644 index 0000000000..899502db42 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img12_vmlinux_function_tab.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img13_vmlinux_callpaths_tab.png b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img13_vmlinux_callpaths_tab.png new file mode 100644 index 0000000000..231e7eaa5e Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img13_vmlinux_callpaths_tab.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img14_spe_select_counters.png b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img14_spe_select_counters.png new file mode 100644 index 0000000000..e7dbc5d6b2 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img14_spe_select_counters.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img15_spe_function_tab.gif b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img15_spe_function_tab.gif new file mode 100644 index 0000000000..d5e54d08a7 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/streamline-kernel-module/images/img15_spe_function_tab.gif differ diff --git a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/_index.md b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/_index.md new file mode 100644 index 0000000000..5b4f2f0c0f --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/_index.md @@ -0,0 +1,63 @@ +--- +title: Deploy TypeScript on Google Cloud C4A (Arm-based Axion VMs) + +draft: true +cascade: + draft: true + +minutes_to_complete: 30 + +who_is_this_for: This is an introductory topic for software developers deploying and optimizing TypeScript workloads on Arm64 Linux environments, 
specifically using Google Cloud C4A virtual machines powered by Axion processors. + +learning_objectives: + - Provision an Arm-based SUSE SLES virtual machine on Google Cloud (C4A with Axion processors) + - Install TypeScript on a SUSE Arm64 (C4A) instance + - Validate TypeScript functionality by creating, compiling, and running a simple TypeScript script on the Arm64 VM + - Benchmark TypeScript performance using a JMH-style custom benchmark with perf_hooks on Arm64 architecture + +prerequisites: + - A [Google Cloud Platform (GCP)](https://cloud.google.com/free) account with billing enabled + - Basic familiarity with [TypeScript](https://www.typescriptlang.org/) and Node.js runtime environment + + +author: Pareena Verma + +##### Tags +skilllevels: Introductory +subjects: Web +cloud_service_providers: Google Cloud + +armips: + - Neoverse + +tools_software_languages: + - TypeScript + - node.js + - npm + +operatingsystems: + - Linux + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +further_reading: + - resource: + title: Google Cloud documentation + link: https://cloud.google.com/docs + type: documentation + + - resource: + title: TypeScript documentation + link: https://www.typescriptlang.org/docs/ + type: documentation + + - resource: + title: TypeScript Benchmark documentation + link: https://tech.spiko.io/posts/benchmarking-typescript-type-checking/ + type: documentation + +weight: 1 +layout: "learningpathall" +learning_path_main_page: "yes" +--- diff --git a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/background.md b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/background.md new file mode 100644 index 0000000000..8c07d02012 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/background.md @@ -0,0 +1,23 @@ +--- +title: Getting started with TypeScript on Google Axion C4A (Arm Neoverse-V2) + +weight: 2 + +layout: "learningpathall" +--- + +## Google Axion C4A Arm instances in Google Cloud + +Google Axion C4A is a family of Arm-based virtual machines built on Google’s custom Axion CPU, which is based on Arm Neoverse-V2 cores. Designed for high-performance and energy-efficient computing, these virtual machines offer strong performance for modern cloud workloads such as CI/CD pipelines, microservices, media processing, and general-purpose applications. + +The C4A series provides a cost-effective alternative to x86 virtual machines while leveraging the scalability and performance benefits of the Arm architecture in Google Cloud. 
+
+To learn more about Google Axion, refer to the [Introducing Google Axion Processors, our new Arm-based CPUs](https://cloud.google.com/blog/products/compute/introducing-googles-new-arm-based-cpu) blog.
+
+## TypeScript
+
+TypeScript is an open-source, strongly typed programming language developed and maintained by Microsoft.
+
+It is a superset of JavaScript, which means all valid JavaScript code is also valid TypeScript, but TypeScript adds static typing, interfaces, and advanced tooling to help developers write more reliable and maintainable code.
+
+TypeScript is widely used for web applications, server-side development (Node.js), and large-scale JavaScript projects where type safety and code quality are important. Learn more from the [TypeScript official website](https://www.typescriptlang.org/) and its [handbook and documentation](https://www.typescriptlang.org/docs/).
diff --git a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/baseline.md b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/baseline.md
new file mode 100644
index 0000000000..dbc57adcd9
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/baseline.md
@@ -0,0 +1,102 @@
+---
+title: TypeScript Baseline Testing on Google Axion C4A Arm Virtual Machine
+weight: 5
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## Baseline Setup for TypeScript
+This section walks you through the baseline setup and validation of TypeScript on a Google Cloud C4A (Axion Arm64) virtual machine running SUSE Linux.
+The goal is to confirm that your TypeScript environment is functioning correctly, from initializing a project to compiling and executing a simple TypeScript file, ensuring a solid foundation before the performance benchmarking steps.
+
+### Set Up a TypeScript Project
+Before running any tests, you’ll create a dedicated project directory and initialize a minimal TypeScript environment.
+
+1. Create a project folder
+
+Start by creating a new folder to hold your TypeScript project files:
+
+```console
+mkdir ~/typescript-benchmark
+cd ~/typescript-benchmark
+```
+This creates a workspace named `typescript-benchmark` in your home directory, keeping all TypeScript configuration and source files separate from system files and global modules.
+
+2. Initialize an npm project
+
+Next, initialize a new Node.js project. This creates a `package.json` file that defines your project metadata, dependencies, and scripts.
+
+```console
+npm init -y
+```
+
+The generated `package.json` looks similar to:
+
+```output
+{
+  "name": "typescript-benchmark",
+  "version": "1.0.0",
+  "main": "index.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "keywords": [],
+  "author": "",
+  "license": "ISC",
+  "description": ""
+}
+```
+
+3. Install Node.js type definitions
+
+To enable TypeScript to properly recognize Node.js built-in APIs (like `fs`, `path`, and `process`), install the Node.js type definitions package:
+
+```console
+npm install --save-dev @types/node
+```
+
+This adds `@types/node` to the `devDependencies` section of your `package.json`.
+
+### Baseline Testing
+With the TypeScript environment configured, you’ll now perform a baseline functionality test to confirm that TypeScript compilation and execution work correctly on your Google Cloud SUSE Arm64 VM.
+
+1. Create a simple TypeScript file
+
+Create a file named `hello.ts` with the following content:
+
+```typescript
+const greet = (name: string): string => {
+  return `Hello, ${name}!`;
+};
+
+console.log(greet("GCP SUSE ARM64"));
+```
+This simple function demonstrates TypeScript syntax, type annotations, and basic console output.
+
+2. Compile TypeScript
+
+Use the TypeScript compiler (`tsc`) to transpile the `.ts` file into JavaScript:
+
+```console
+tsc hello.ts
+```
+This generates a new file named `hello.js` in the same directory.
+
+3. Run the compiled JavaScript
+
+Now, execute the compiled JavaScript using Node.js. This step verifies that:
+
+- The TypeScript code was successfully compiled into valid JavaScript.
+- The JavaScript code runs correctly in the Node.js runtime on your GCP SUSE VM.
+
+Execute the compiled JavaScript file:
+
+```console
+node hello.js
+```
+
+You should see output similar to:
+
+```output
+Hello, GCP SUSE ARM64!
+```
+You have successfully verified that your TypeScript environment is working correctly.
+Next, you can proceed to TypeScript performance benchmarking to measure compilation and runtime performance on your Google Cloud Arm64 VM.
diff --git a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/benchmarking.md b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/benchmarking.md
new file mode 100644
index 0000000000..567f959845
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/benchmarking.md
@@ -0,0 +1,119 @@
+---
+title: TypeScript Benchmarking
+weight: 6
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+
+## JMH-style Custom Benchmarking
+
+This section demonstrates how to benchmark TypeScript functions using a JMH-style (Java Microbenchmark Harness) methodology implemented with Node.js's built-in `perf_hooks` module.
+Unlike basic `console.time()` measurements, this approach executes multiple iterations and computes the average runtime, producing stable and repeatable performance data that is useful for evaluating workloads on your Google Cloud C4A (Axion Arm64) VM running SUSE Linux.
+
+### Create the Benchmark Script
+Create a file named `benchmark_jmh.ts` inside your project directory with the content below:
+
+```typescript
+import { performance } from 'perf_hooks';
+
+// Function to benchmark
+const sumArray = (n: number) => {
+  let sum = 0;
+  for (let i = 0; i < n; i++) sum += i;
+  return sum;
+};
+
+// Benchmark parameters
+const iterations = 10;        // Number of repeated runs
+const arraySize = 1_000_000;  // Upper bound of the summation loop
+let totalTime = 0;
+
+// JMH-style repeated runs
+for (let i = 0; i < iterations; i++) {
+  const start = performance.now();
+  sumArray(arraySize);
+  const end = performance.now();
+  const timeTaken = end - start;
+  totalTime += timeTaken;
+  console.log(`Iteration ${i + 1}: ${timeTaken.toFixed(3)} ms`);
+}
+
+// Compute average execution time
+const averageTime = totalTime / iterations;
+console.log(`\nAverage execution time over ${iterations} iterations: ${averageTime.toFixed(3)} ms`);
+```
+Code explanation:
+
+| Component | Description |
+| ----------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
+| **`performance.now()`** | Provides high-resolution timestamps (sub-millisecond precision) for accurate timing. |
+| **`sumArray(n)`** | A simple CPU-bound function that sums the integers from 0 to `n - 1`. This simulates a computational workload suitable for benchmarking raw arithmetic throughput. |
+| **`iterations`** | Defines how many times the test runs. Multiple repetitions reduce noise and help average out one-off delays or GC pauses. |
+| **Loop and averaging** | Each run’s duration is recorded; the mean execution time is then reported, mirroring how JMH computes stable results in Java microbenchmarks. |
+
+
+This JMH-style benchmarking approach provides more accurate and repeatable performance metrics than a single execution, making it ideal for performance testing on Arm-based systems.
+
+### Compile the TypeScript Benchmark
+First, compile the benchmark file from TypeScript to JavaScript using the TypeScript compiler (`tsc`):
+
+```console
+tsc benchmark_jmh.ts
+```
+This command transpiles your TypeScript code into standard JavaScript, generating a file named `benchmark_jmh.js` in the same directory.
+The resulting JavaScript can be executed by Node.js, allowing you to measure performance on your Google Cloud C4A (Arm64) virtual machine.
+
+### Run the Benchmark
+Now, execute the compiled JavaScript file with Node.js:
+
+```console
+node benchmark_jmh.js
+```
+You should see output similar to:
+
+```output
+Iteration 1: 2.286 ms
+Iteration 2: 0.749 ms
+Iteration 3: 1.145 ms
+Iteration 4: 0.674 ms
+Iteration 5: 0.671 ms
+Iteration 6: 0.671 ms
+Iteration 7: 0.672 ms
+Iteration 8: 0.667 ms
+Iteration 9: 0.667 ms
+Iteration 10: 0.673 ms
+
+Average execution time over 10 iterations: 0.888 ms
+```
+
+### Benchmark Metrics Explained
+
+ * Iteration times → Each iteration represents the time taken for one complete execution of the benchmarked function.
+ * Average execution time → Calculated as the total of all iteration times divided by the number of iterations. This gives a stable measure of real-world performance.
+ * Why multiple iterations?
+ A single run can be affected by transient factors such as CPU scheduling, garbage collection, or memory caching.
+ Running multiple iterations and averaging the results smooths out variability, producing more repeatable and statistically meaningful data, similar to Java’s JMH benchmarking methodology.
+
+### Interpretation
+
+The average execution time reflects how efficiently the function executes under steady-state conditions.
+The first iteration often shows higher latency because Node.js performs initial JIT (Just-In-Time) compilation and optimization, a common warm-up behavior in JavaScript/TypeScript benchmarks.
+
+### Benchmark summary on Arm64
+Results from the earlier run on the `c4a-standard-4` (4 vCPU, 16 GB memory) Arm64 VM in GCP (SUSE):
+
+| Iteration | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Average |
+|-----------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|---------|
+| Time (ms) | 2.286 | 0.749 | 1.145 | 0.674 | 0.671 | 0.671 | 0.672 | 0.667 | 0.667 | 0.673 | 0.888 |
+
+### TypeScript performance benchmarking summary on Arm64
+
+When you look at the benchmarking results, you will notice that on the Google Axion C4A Arm-based instances:
+
+- The average execution time on Arm64 (~0.888 ms) shows that CPU-bound TypeScript operations run efficiently on Arm-based VMs.
+- Initial iterations may show slightly higher times due to runtime warm-up and optimization overhead, which is common across architectures.
+- Arm64 demonstrates stable iteration times after the first run, indicating consistent performance for repeated workloads; a sketch for quantifying this spread follows below.
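+
+If you want to quantify this run-to-run spread rather than just reading the iteration list, one possible extension is sketched below. It is not part of the original benchmark: the file name `benchmark_stats.ts` and the reported statistics (minimum, maximum, and standard deviation) are illustrative additions that reuse the same workload as `benchmark_jmh.ts`.
+
+```typescript
+import { performance } from 'perf_hooks';
+
+// Same CPU-bound workload as benchmark_jmh.ts
+const sumArray = (n: number) => {
+  let sum = 0;
+  for (let i = 0; i < n; i++) sum += i;
+  return sum;
+};
+
+const iterations = 10;
+const workloadSize = 1_000_000;
+const samples: number[] = [];
+
+// Record each iteration time instead of only accumulating a total
+for (let i = 0; i < iterations; i++) {
+  const start = performance.now();
+  sumArray(workloadSize);
+  samples.push(performance.now() - start);
+}
+
+// Aggregate statistics across all iterations
+const mean = samples.reduce((a, b) => a + b, 0) / samples.length;
+const min = samples.reduce((a, b) => (b < a ? b : a), Number.POSITIVE_INFINITY);
+const max = samples.reduce((a, b) => (b > a ? b : a), 0);
+const variance = samples.reduce((acc, t) => acc + (t - mean) * (t - mean), 0) / samples.length;
+const stdDev = Math.sqrt(variance);
+
+console.log(`Mean:   ${mean.toFixed(3)} ms`);
+console.log(`Min:    ${min.toFixed(3)} ms`);
+console.log(`Max:    ${max.toFixed(3)} ms`);
+console.log(`StdDev: ${stdDev.toFixed(3)} ms`);
+```
+
+Compile and run it the same way as the main benchmark (`tsc benchmark_stats.ts` followed by `node benchmark_stats.js`). A standard deviation that is small relative to the mean, with the maximum dominated by the first warm-up iteration, supports the stability observations above.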
+
+Overall, these results demonstrate that Google Cloud C4A Arm64 virtual machines provide production-grade stability and throughput for TypeScript workloads, whether used for application logic, scripting, or performance-critical services.
diff --git a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/images/gcp-vm.png b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/images/gcp-vm.png
new file mode 100644
index 0000000000..0d1072e20d
Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/images/gcp-vm.png differ
diff --git a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/installation.md b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/installation.md
new file mode 100644
index 0000000000..a48a49cbf8
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/installation.md
@@ -0,0 +1,70 @@
+---
+title: Install TypeScript
+weight: 4
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## Install TypeScript on GCP VM
+This section walks you through installing TypeScript and its dependencies on a Google Cloud Platform (GCP) SUSE Arm64 virtual machine. You’ll install Node.js, npm, TypeScript, and ts-node, and verify that everything works correctly.
+
+Running TypeScript on Google Cloud C4A instances, powered by Axion Arm64 processors, provides a high-performance and energy-efficient platform for Node.js-based workloads.
+
+### Update SUSE System
+Before installing new packages, refresh the repositories and update existing packages to ensure your environment is current and secure:
+
+```console
+sudo zypper refresh
+sudo zypper update -y
+```
+Keeping your system up to date ensures that the dependencies, libraries, and compilers required for Node.js and TypeScript work seamlessly on the Arm64 architecture.
+
+### Install Node.js and npm
+Node.js provides the JavaScript runtime that powers TypeScript execution, while npm (Node Package Manager) manages project dependencies and global tools.
+
+Install both packages using SUSE’s repositories:
+
+```console
+sudo zypper install -y nodejs npm
+```
+This command installs the Node.js runtime and npm package manager on your Google Cloud SUSE Arm64 VM.
+
+### Install TypeScript globally
+TypeScript (`tsc`) is the compiler that converts `.ts` files into JavaScript.
+`ts-node` lets you run TypeScript files directly without pre-compiling them. It is useful for testing, scripting, and lightweight development workflows.
+
+Install both globally using npm:
+
+```console
+sudo npm install -g typescript ts-node
+```
+The `-g` flag installs packages globally, making `tsc` and `ts-node` available system-wide.
+
+This approach simplifies workflows for developers running multiple TypeScript projects on the same VM.
+
+### Verify installations
+Check that Node.js, npm, TypeScript, and ts-node are all installed correctly:
+
+```console
+node -v
+npm -v
+tsc -v
+ts-node -v
+```
+
+The output is similar to:
+
+```output
+> node -v
+v18.20.5
+> npm -v
+10.8.2
+> tsc -v
+Version 5.9.3
+> ts-node -v
+v10.9.2
+```
+
+Node.js, npm, TypeScript, and ts-node are now installed and verified on your Google Cloud C4A (Arm64) virtual machine.
+You’re ready to create and execute TypeScript scripts for testing, deployment, or performance benchmarking.
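+
+As an optional quick check, you can also run TypeScript directly through `ts-node` without a separate compile step. The snippet below is a suggested smoke test rather than part of the install procedure, and the file name `check.ts` is only an example:
+
+```typescript
+// check.ts: minimal smoke test for the globally installed ts-node
+const double = (n: number): number => n * 2;
+
+// If the toolchain is set up correctly, this prints:
+// ts-node works: double(21) = 42
+console.log(`ts-node works: double(21) = ${double(21)}`);
+```
+
+Run it with `ts-node check.ts`; if the message prints, the global TypeScript toolchain is ready for the baseline test and benchmarking steps, and you can delete `check.ts`.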
diff --git a/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/instance.md b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/instance.md new file mode 100644 index 0000000000..2b93bc950d --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/typescript-on-gcp/instance.md @@ -0,0 +1,31 @@ +--- +title: Create a Google Axion C4A Arm virtual machine on GCP +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this section, you will learn how to provision a Google Axion C4A Arm virtual machine on Google Cloud Platform (GCP) using the `c4a-standard-4` (4 vCPUs, 16 GB memory) machine type in the Google Cloud Console. + +{{% notice Note %}} +For support on GCP setup, see the Learning Path [Getting started with Google Cloud Platform](https://learn.arm.com/learning-paths/servers-and-cloud-computing/csp/google/). +{{% /notice %}} + +## Provision a Google Axion C4A Arm VM in Google Cloud Console + +To create a virtual machine based on the C4A instance type: +- Navigate to the [Google Cloud Console](https://console.cloud.google.com/). +- Go to **Compute Engine > VM Instances** and select **Create Instance**. +- Under **Machine configuration**: + - Populate fields such as **Instance name**, **Region**, and **Zone**. + - Set **Series** to `C4A`. + - Select `c4a-standard-4` for machine type. + + ![Create a Google Axion C4A Arm virtual machine in the Google Cloud Console with c4a-standard-4 selected alt-text#center](images/gcp-vm.png "Creating a Google Axion C4A Arm virtual machine in Google Cloud Console") + +- Under **OS and Storage**, select **Change**, then choose an Arm64-based OS image. For this Learning Path, use **SUSE Linux Enterprise Server**. Pick the preferred version for your Operating System. Ensure you select the **Arm image** variant. Click **Select**. +- Under **Networking**, enable **Allow HTTP traffic**. +- Click **Create** to launch the instance. diff --git a/download_configmaps.sh b/download_configmaps.sh new file mode 100755 index 0000000000..a92d353c08 --- /dev/null +++ b/download_configmaps.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# Download nginx_arm configmap +kubectl get configmap nginx-arm -n nginx -o yaml > nginx_arm_configmap.yaml + +# Download nginx_intel configmap +kubectl get configmap nginx-intel -n nginx -o yaml > nginx_intel_configmap.yaml + +echo "Downloaded configmaps:" +echo "- nginx_arm_configmap.yaml" +echo "- nginx_intel_configmap.yaml" diff --git a/themes/arm-design-system-hugo-theme/layouts/learning-paths/learningpathall.html b/themes/arm-design-system-hugo-theme/layouts/learning-paths/learningpathall.html index 303775fb78..af9da8cedf 100644 --- a/themes/arm-design-system-hugo-theme/layouts/learning-paths/learningpathall.html +++ b/themes/arm-design-system-hugo-theme/layouts/learning-paths/learningpathall.html @@ -47,12 +47,15 @@
+ {{ if ne .Params.hide_from_navpane true }} + {{ if (eq .File.Dir $thisdir)}} + {{ $learningpathfiles = $learningpathfiles | append . }} + + + {{partial "navigation/content-navigation.html" (dict "context" . "thisfile" $thisfile "counter" $counter) }} + {{ $counter = add $counter 1 }} + {{ end }} {{ end }} {{end}} {{end}} @@ -91,4 +94,4 @@