diff --git a/.gitpod.yml b/.gitpod.yml deleted file mode 100644 index a6374e679d..0000000000 --- a/.gitpod.yml +++ /dev/null @@ -1,12 +0,0 @@ -tasks: - - name: Install Hugo - before: brew install hugo - init: echo "Your version of Hugo is `hugo version`" - command: | - hugo - bin/pagefind --site "public" --output-subdir ../static/pagefind - hugo server -D -F --baseURL $(gp url 1313) --liveReloadPort=443 --appendPort=false --bind=0.0.0.0 - -ports: - - port: 1313 - onOpen: open-preview diff --git a/.wordlist.txt b/.wordlist.txt index 79c3f66d0f..9606eb651b 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -4812,4 +4812,141 @@ learnt lof BalenaOS balenaCloud - +MX +ARMFp +AndroidDemo +ApacheBench +ArmHalideAndroidDemo +Autoscheduler +BGR +BVM +BenchmarkBubbleSort +BenchmarkQuickSort +Botspot +BoundaryConditions +BubbleSort +ByteBuffer +DGGML +DNQZJ +DTLB +EPYC +ETag +EVEX +Esc +FuseAll +FuseBlurAndThreshold +GGG +GOPATH +GOROOT +GTK +GetByteArrayElements +Golang +Golang’s +HWC +Halide +Halide’s +ImageParam +Istio +KEDA +Kedify +Kedify’s +LLC +LLE +MPix +NIC’s +Netty +NoRuntime +OpenBMC’s +Parallelization +QCOW +QuickSort +RDom +RGBRGBRGB +RRR +RamFB +Recomputation +ReleaseByteArrayElements +Remmina +Roubalik +SAXPY +ScaledObject +Scaler +SetByteArrayRegion +SoL +Sor +Sysoev +TinyRPS +UFW +VLA +VTOR +VirtualService +WindowsOnArm +XMM +YMM +YUV +ZMM +Zbynek +adaptively +allocs +apiKey +armhalideandroiddemo +autounattend +autowiring +benchmarkHttpResponse +benchmem +blurThresholdImage +bvm +clusterName +coroutine +createBitmapFromGrayBytes +cv +extractGrayScaleBytes +fallbacks +firstlogin +golang +gosort +goweb +halide +httpd +inBytes +inlines +inputBuffer +insturction +jbyteArray +keda +kedify +keypress +kts +llmexport +loadImageFromAssets +microarchitectures +minikube +oOer +orgId +outputArray +outputBuffer +parallelization +parallelize +parallelized +parallelizes +preallocation +precomputing +qcow +recomputation +reconfig +reconversion +refetching +req +scaler +scalers +sprintf +stdev +thresholded +underperformed +underperforms +unvectorized +uop +walkthrough +warmups +xo +yi \ No newline at end of file diff --git a/assets/contributors.csv b/assets/contributors.csv index ef6f06ea90..a149228b12 100644 --- a/assets/contributors.csv +++ b/assets/contributors.csv @@ -102,3 +102,5 @@ Ker Liu,,,,, Rui Chang,,,,, Alejandro Martinez Vicente,Arm,,,, Mohamad Najem,Arm,,,, +Zenon Zhilong Xiu,Arm,,zenon-zhilong-xiu-491bb398,, +Zbynek Roubalik,Kedify,,,, diff --git a/content/install-guides/dcperf.md b/content/install-guides/dcperf.md index 024965bb54..e0d32c80e1 100644 --- a/content/install-guides/dcperf.md +++ b/content/install-guides/dcperf.md @@ -9,7 +9,7 @@ additional_search_terms: - Neoverse test_images: -- ubuntu:22.04 +- ubuntu:latest test_maintenance: false layout: installtoolsall @@ -23,12 +23,12 @@ weight: 1 DCPerf is an open-source benchmarking and microbenchmarking suite originally developed by Meta. It faithfully replicates the characteristics of general-purpose data center workloads, with particular attention to microarchitectural fidelity. DCPerf stands out for accurate simulation of behaviors such as cache misses and branch mispredictions, which are details that many other benchmarking tools overlook. -You can use DCPerf to generate performance data to inform procurement decisions, and for regression testing to detect changes in the environment, such as kernel and compiler changes. 
+You can use DCPerf to generate performance data to inform procurement decisions, and for regression testing to detect changes in the environment, such as kernel and compiler changes. -DCPerf runs on Arm-based servers. The examples below have been tested on an AWS `c7g.metal` instance running Ubuntu 22.04 LTS. +DCPerf runs on Arm-based servers. The examples below have been tested on an AWS `c7g.metal` instance running Ubuntu 22.04 LTS. {{% notice Note %}} -When running on a server provided by a cloud service, you have limited access to some parameters, such as UEFI settings, which can affect performance. +When running on a server provided by a cloud service, you have limited access to some parameters, such as UEFI settings, which can affect performance. {{% /notice %}} ## Install prerequisites @@ -40,7 +40,7 @@ sudo apt update sudo apt install -y python-is-python3 python3-pip python3-venv git ``` -It is recommended that you install Python packages in a Python virtual environment. +It is recommended that you install Python packages in a Python virtual environment. Set up your virtual environment: @@ -48,7 +48,7 @@ Set up your virtual environment: python3 -m venv venv source venv/bin/activate ``` -If requested, restart the recommended services. +If requested, restart the recommended services. Install the required packages: @@ -65,9 +65,9 @@ cd DCPerf ## Running the MediaWiki benchmark -DCPerf offers many benchmarks. See the official documentation for the benchmark of your choice. +DCPerf offers many benchmarks. See the official documentation for the benchmark of your choice. -One example is the MediaWiki benchmark, designed to faithfully reproduce the workload of the Facebook social networking site. +One example is the MediaWiki benchmark, designed to faithfully reproduce the workload of the Facebook social networking site. Install HipHop Virtual Machine (HHVM), a virtual machine used to execute the web application code: @@ -95,14 +95,14 @@ Compiler: 1704922878_080332982 Repo schema: 4239d11395efb06bee3ab2923797fedfee64738e ``` -Confirm security-enhanced Linux (SELinux) is disabled with the following commands: +Confirm security-enhanced Linux (SELinux) is disabled with the following commands: ```bash sudo apt install selinux-utils getenforce ``` -You should see the following response: +You should see the following response: ```output Disabled @@ -181,7 +181,7 @@ The metrics file contains several key performance indicators from the benchmark These metrics help you evaluate the performance and reliability of the system under test. Higher values for successful requests and RPS, and lower response times, generally indicate better performance. The score provides a single value for easy comparison across runs or systems. 
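+
+If you want to compare the score across runs, one option is to keep the report from each run and pull out the score line. The snippet below is only a sketch: it assumes you have saved the benchmark output of two runs to log files and that the summary line contains the word `score` - the file names and the `grep` pattern are placeholders to adapt to your actual DCPerf report format.
+
+```bash
+# Hypothetical log file names; replace them with your own saved outputs
+grep -i "score" run-before.log run-after.log
+```
+
+This prints the matching summary lines from both files side by side, which is usually enough for a quick regression check between two configurations.
+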
-## Next steps +## Next steps These are some activities you might like to try next: diff --git a/content/learning-paths/automotive/_index.md b/content/learning-paths/automotive/_index.md index 97fb52787c..43a8e96e60 100644 --- a/content/learning-paths/automotive/_index.md +++ b/content/learning-paths/automotive/_index.md @@ -12,10 +12,10 @@ title: Automotive weight: 4 subjects_filter: - Containers and Virtualization: 3 -- Performance and Architecture: 5 +- Performance and Architecture: 6 operatingsystems_filter: - Baremetal: 1 -- Linux: 7 +- Linux: 8 - macOS: 1 - RTOS: 1 tools_software_languages_filter: @@ -23,10 +23,11 @@ tools_software_languages_filter: - Arm Zena CSS: 1 - C: 2 - C++: 1 -- Clang: 2 +- Clang: 3 - DDS: 1 - Docker: 2 -- GCC: 2 +- FVP: 1 +- GCC: 3 - Python: 2 - Raspberry Pi: 1 - ROS 2: 3 diff --git a/content/learning-paths/automotive/zenacssdebug/_index.md b/content/learning-paths/automotive/zenacssdebug/_index.md index 9539aaaf9d..9c57db90fa 100644 --- a/content/learning-paths/automotive/zenacssdebug/_index.md +++ b/content/learning-paths/automotive/zenacssdebug/_index.md @@ -1,24 +1,21 @@ --- title: Debug Arm Zena CSS Reference Software Stack with Arm Development Studio -draft: true -cascade: - draft: true minutes_to_complete: 60 -who_is_this_for: This is an introductory topic for software developers who wish to use Arm Development Studio to explore and debug the Arm Zena CSS Reference Software Stack. +who_is_this_for: This introductory topic is for software developers who want to use Arm Development Studio to explore and debug the Arm Zena Compute Subsystem (CSS) Reference Software Stack on a Fixed Virtual Platform (FVP). -learning_objectives: - - Set up debug configuration for the Arm Zena CSS FVP - - Debug Runtime Security Engine (RSE) from boot time - - Debug Safety Island (SI) - - Debug Linux OS on Primary Compute cores +learning_objectives: + - Set up and save a debug configuration for the Arm Zena CSS FVP + - Start Runtime Security Engine (RSE) debug at reset and step through early boot + - Attach to and debug Safety Island (SI) firmware + - Attach to the Linux kernel on the primary compute cores and debug user space processes prerequisites: - - Ubuntu 22.04 host machine - - You will need [Arm Development Studio 2024.1 (or later)](/install-guides/armds) and an appropriate license - - A basic understanding of the Arm Zena CSS software stack and Arm processors + - Ubuntu 22.04 host machine + - Arm Development Studio 2024.1 or later with a valid license - for support see the [Install Guide for ADS](/install-guides/armds) + - Basic understanding of the Arm Zena CSS software stack, Armv8-A/Armv9-A cores, and Linux author: Ronan Synnott @@ -26,24 +23,24 @@ author: Ronan Synnott skilllevels: Introductory subjects: Performance and Architecture armips: - - Cortex-A - - Cortex-R + - Cortex-A + - Cortex-R operatingsystems: - - Linux + - Linux tools_software_languages: - - Arm Development Studio - - Arm Zena CSS - + - Arm Development Studio + - Arm Zena CSS + - FVP further_reading: - - resource: - title: Arm Zena Compute System (CSS) - link: https://developer.arm.com/Compute%20Subsystems/Arm%20Zena%20Compute%20Subsystem - type: website - - resource: - title: Arm Development Studio - link: https://developer.arm.com/Tools%20and%20Software/Arm%20Development%20Studio - type: website + - resource: + title: Arm Zena Compute Subsystem (CSS) + link: https://developer.arm.com/Compute%20Subsystems/Arm%20Zena%20Compute%20Subsystem + type: website + - resource: + title: Arm Development 
Studio + link: https://developer.arm.com/Tools%20and%20Software/Arm%20Development%20Studio + type: website ### FIXED, DO NOT MODIFY diff --git a/content/learning-paths/automotive/zenacssdebug/config.md b/content/learning-paths/automotive/zenacssdebug/config.md index 81e6367f40..b9d6dbf12f 100644 --- a/content/learning-paths/automotive/zenacssdebug/config.md +++ b/content/learning-paths/automotive/zenacssdebug/config.md @@ -1,6 +1,6 @@ --- # User change -title: "Model Configuration" +title: "Configure the model" weight: 4 # 1 is first, 2 is second, etc. @@ -8,56 +8,57 @@ weight: 4 # 1 is first, 2 is second, etc. layout: "learningpathall" --- -# Debug Configuration +## Set up a debug configuration for the Zena CSS FVP -Arm Development Studio requires a `Debug Configuration` of the target that it will connect to. +Now you'll walk through setting up an Arm Development Studio debug configuration for the Zena CSS FVP using the Iris interface. This is a fast, reliable path to a working configuration. -As of Arm Development Studio version 2025.0, there is no such configuration provided 'out-of-the-box' for the Zena CSS FVP. However creating such a configuration is straight forward. +As of Arm Development Studio 2025.0, there is no out-of-the-box configuration for the Zena CSS FVP. Creating one, however, is straightforward. -See the Arm Development Studio [Getting Started Guide](https://developer.arm.com/documentation/101469/latest/Migrating-from-DS-5-to-Arm-Development-Studio/Connect-to-new-or-custom-models) for full instructions, but they are also summarized below. +For full guidance, see the Arm Development Studio [Getting Started Guide](https://developer.arm.com/documentation/101469/latest/Migrating-from-DS-5-to-Arm-Development-Studio/Connect-to-new-or-custom-models). A concise, task-focused version is below. -## Launch FVP +## Launch the FVP (with Iris) -As per previous section, launch FVP with the Iris server enabled: +Launch the FVP with the Iris server enabled: -```command +```bash kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100" ``` -or if connecting to the FVP remotely: +If connecting to the FVP remotely, you can use this command: -```command +```bash kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100 -A" ``` + {{% notice Note %}} -A local connection is assumed for the remainder of this learning path. +This example modeled below uses a local connection for the remaining steps. {{% /notice %}} -## Configuration Database - -Debug Configurations are stored in a configuration database. You must first create a local database in which to store the configuration. - -Navigate to `File` > `New` > `Other`, and then select `Configuration Database` > `Configuration Database` from the drop-down list. +## Create a configuration database in Arm Development Studio -Click `Next`. Give the Database a name, and click `Finish`. +Debug configurations are stored in a configuration database. Create a local database to store your model configuration: -## Debug Configuration +- In Arm Development Studio, go to **File > New > Other**. +- Select **Configuration Database > Configuration Database**. +- Click **Next**, enter a **Name**, then click **Finish**. -Navigate to the same wizard as above, and select `Model Configuration`. +## Create a model configuration for Zena CSS FVP (Iris) -Click `Next`, and you will be prompted to select the above `Configuration Database`. 
Click `Next` again, and you will be prompted to select a Model Interface. +- Open the same wizard (**File > New > Other**), then choose **Configuration Database > Model Configuration**. +- Click **Next**, select the **Configuration Database** you created, then click **Next**. +- For **Model Interface**, choose **Iris**, then click **Next**. +- Choose **Browse for model running on local host**. The debugger detects and interrogates the FVP. +- If connecting remotely, choose **Connect to model running on either local or remote host** and provide the host and port. -Select `Iris` from the pulldown, and click `Next`. +Arm Development Studio generates a `model.mdf` file that enumerates all CPUs in the FVP. -You will then be prompted to locate the model to connect to. +Optionally, update **Manufacturer Name** (for example, `Arm`) and **Platform Name** (for example, `Zena_CSS_FVP`). Then **Save** and **Import** the model into the configuration database. -Select `Browse for model running on local host`. The FVP will be detected and interrogated by the debugger. - -{{% notice Note %}} -Use `Connect to model running on either local or remote host` if connecting remotely. +{{% notice Tip %}} +If the FVP is not detected, verify the Iris server is running on the expected port (`7100` by default) and that your firewall allows local connections. For remote connections, confirm the host is reachable and the port is open. {{% /notice %}} A `model.mdf` file will be created that identifies all CPUs within the FVP. -You can change the `Manufacturer Name` and `Platform Name` to something more meaningful (such as `Arm` and `Zena_CSS_FVP`), then `Save`, and `Import` into the configuration database. +You can change the **Manufacturer Name** and **Platform Name** to something more meaningful (such as `Arm` and `Zena_CSS_FVP`), then **Save**, and **Import** into the configuration database. The debugger is now aware of the FVP and you are ready to debug. diff --git a/content/learning-paths/automotive/zenacssdebug/connect.md b/content/learning-paths/automotive/zenacssdebug/connect.md index 921f80b467..ebedbe8acd 100644 --- a/content/learning-paths/automotive/zenacssdebug/connect.md +++ b/content/learning-paths/automotive/zenacssdebug/connect.md @@ -1,6 +1,6 @@ --- # User change -title: "Debug Connections" +title: "Create debug connections" weight: 5 # 1 is first, 2 is second, etc. @@ -8,68 +8,79 @@ weight: 5 # 1 is first, 2 is second, etc. layout: "learningpathall" --- -## Debug Connections +## Overview You are now ready to create debug connections for each of the sub-systems within Zena CSS. In this section you will create the connections, which will be subsequently enhanced in the following section. You may prefer to fully set up one such connection before moving to others. -Arm Development Studio has full support for Heterogeneous systems such as Zena CSS, and so you can connect to all processors simultaneously. +Arm Development Studio has full support for heterogeneous systems such as Zena CSS, and so you can connect to all processors simultaneously. -### Debug connection project +## Create a project for connection files -First, create a project to store these connections (`.launch` files) in. +First, create a project to store these connections (`.launch` files). -Select `File` > `New...` > `Project` > `General` > `Project`, and give it a meaningful name (`Connections`). +Select **File** > **New...** > **Project** > **General** > **Project**, and give it a meaningful name (for example, `Connections`). 
-### RSE (Cortex-M55) +## Create an RSE (Cortex-M55) model connection -Runtime Security Engine (RSE) is based on [Cortex-M55](https://developer.arm.com/Processors/Cortex-M55) core and is a security subsystem fulfilling the role of Root of Trust. +Runtime Security Engine (RSE) is based on the [Cortex-M55](https://developer.arm.com/Processors/Cortex-M55) core and is a security subsystem fulfilling the role of Root of Trust. -Select `File` > `New` > `Model Connection`. +Select **File** > **New** > **Model Connection**. {{% notice Note %}} -You can also use `File` > `New` > `Other` > `Arm Debugger` > `Model Connection`, or - -`Create a debug connection...` shortcut in the `Debug Control` pane. +You can also use **File** > **New** > **Other** > **Arm Debugger** > **Model Connection**, or the **Create a debug connection**... shortcut in the **Debug Control** pane. {{% /notice %}} -Specify a connection name (`RSE`), and associate with the above `Connections` project. Click `Next`. +Specify a connection name (`RSE`), and associate with the above `Connections` project. Click **Next**. -Locate the FVP based on the name you gave it previously (`Zena_CSS_FVP`). The text filter can help you locate it easily. +Locate the FVP based on the name you gave it previously (`Zena_CSS_FVP`). You can use the text filter to locate it quickly. -You will then be presented with the `Edit configuration` pane. In the `Connection` tab, scroll down to locate `Bare Metal Debug` > `Arm_Cortex-M55`. +You will then be presented with the **Edit configuration** pane. In the **Connection** tab, scroll down to locate **Bare Metal Debug** > **Arm_Cortex-M55**. -As you will be later launching the FVP with the software stack loaded, select `Connect to an already running model`. +As you will be later launching the FVP with the software stack loaded, select **Connect to an already running model**. -Assuming the same host will be running both the FVP and the debugger, specify the `Connection address` as the default `127.0.0.1:7100`. +Assuming the same host will be running both the FVP and the debugger, specify the **Connection address** as the default `127.0.0.1:7100`. {{% notice Note %}} -`127.0.0.1` is the same as `localhost`, that is the same host machine as is running the FVP. +`127.0.0.1` is the same as `localhost`, which targets the host running the FVP. For a remote FVP, specify the remote IP address and start the FVP with `-A`. Port `7100` is the default Iris port and can be adjusted if needed. +{{% /notice %}} + +Arm Development Studio creates `RSE.launch` inside the **Connections** project. + +## Create a Safety Island (Cortex-R82AE) model connection -It is also possible to connect to a remote host by specifying appropriate IP address, and launching FVP with the `-A` option. +The Safety Island is based on the [Cortex-R82AE](https://developer.arm.com/Processors/Cortex-R82AE) core and manages power, clocks, and CMN control. -`7100` is the default port number. You may need to change this if necessary. +Follow the same steps as for RSE, with this change: + +In **Edit configuration**, expand **Bare Metal Debug** and select **Arm_Cortex-R82AE**. + +{{% notice Tip %}} +To save time, copy `RSE.launch` to `SI.launch` and update the CPU selection to **Arm_Cortex-R82AE**. {{% /notice %}} -Click `Apply` to save the connection information, and `Close`. Observe that `RSE.launch` is created inside the `Connections` project. 
+## Create Primary compute (Cortex-A720AE) connections -### Safety Island (Cortex-R82AE) +Primary compute comprises four clusters intended to run a rich OS such as Linux. Each cluster has four [Cortex-A720AE](https://developer.arm.com/Processors/Cortex-A720AE) cores alongside a [DSU-120AE](https://developer.arm.com/Processors/DSU-120AE) DynamIQ Shared Unit. -The Safety Island is a subsystem based on [Cortex-R82AE](https://developer.arm.com/Processors/Cortex-R82AE) core. The software running on the Safety Island is responsible for power, clock and CMN control. +You will create two connections: one for bare-metal initialization and one with Linux kernel awareness for SMP debug. -The procedure to create this connection is very similar to the above, other than to select `Bare Metal Debug` > `Arm_Cortex-R82AE` from the drop-down. +### Primary init (bare metal, CPU0 only) -{{% notice %}} -For convenience you can copy-and-paste `RSE.launch` as `SI.launch` and just modify the CPU. -{{% /notice %}} +Create `Primary_init.launch`: + +- Select **File > New > Model Connection**. +- Select your `Zena_CSS_FVP` model. +- In **Edit configuration**, expand **Bare Metal Debug** and select **ARM_Cortex-A720AE_0** to attach to CPU0 only. This leaves other CPUs running. -### Primary Compute (Cortex-A720AE) +### Primary Linux (SMP, OS awareness) -The Primary Compute consists of four processor clusters to run a rich OS such as Linux. Each processor cluster includes four [Cortex-A720AE](https://developer.arm.com/Processors/Cortex-A720AE) cores and a [DSU-120AE](https://developer.arm.com/Processors/DSU-120AE) DynamIQ Shared Unit. +Create **Primary_Linux.launch** for Linux kernel debug with OS awareness: -The application processors will be debugged in an SMP configuration with Linux Kernel awareness. +- Use **File > New > Model Connection**. +- Select your **Zena_CSS_FVP** model. +- In **Edit configuration**, expand **Linux Kernel Debug** and choose **ARM_Cortex-A720AEx16 SMP Cluster 1**. + This connects to all 16 Cortex-A720AE processors described in the FVP. Only cores 0 to 3 are used by the default Linux configuration. -As shown above, create `Primary_init.launch` connection and scroll to `Bare Metal Debug` > `ARM_Cortex-A720AE_0`. This will connect to just CPU0, leaving the other CPUs free to run. +To learn more about OS awareness in Arm Debugger, see the [OS awareness documentation](https://developer.arm.com/documentation/101470/latest/Debugging-Embedded-Systems/About-OS-awareness). -To debug the Linux kernel you can make use of the [OS awareness](https://developer.arm.com/documentation/101470/latest/Debugging-Embedded-Systems/About-OS-awareness) feature of the Arm Debugger. -Create `Primary_Linux.launch` connection and scroll to `Linux Kernel Debug` > `ARM_Cortex-A720AEx16 SMP Cluster 1`. This will connect to all 16 `Cortex-A720AE` processors present in the FVP, though only cores 0-3 are used. diff --git a/content/learning-paths/automotive/zenacssdebug/launch.md b/content/learning-paths/automotive/zenacssdebug/launch.md index 5aba66e2c8..6422a11389 100644 --- a/content/learning-paths/automotive/zenacssdebug/launch.md +++ b/content/learning-paths/automotive/zenacssdebug/launch.md @@ -1,6 +1,6 @@ --- # User change -title: "Launch FVP" +title: "Launch the FVP" weight: 3 # 1 is first, 2 is second, etc. @@ -8,49 +8,46 @@ weight: 3 # 1 is first, 2 is second, etc. 
layout: "learningpathall" --- -## Launch FVP +## Start the FVP from the build environment -You can now launch the FVP within the virtual environment with the software stack loaded: +You can launch the FVP within the build environment with the software stack loaded: ```command kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose" ``` -Refer to the [documentation](https://arm-auto-solutions.docs.arm.com/en/v2.0/rd-aspen/user_guide/reproduce.html#run-the-fvp) for more details. -While you can continue to use this method to launch the FVP whilst debugging, this command does not enable the Iris debug server inside the model, and so will not be debuggable. -Additional command options are necessary. +See the [Arm Zena CSS User Guide](https://arm-auto-solutions.docs.arm.com/en/v2.0/rd-aspen/user_guide/reproduce.html#run-the-fvp) for further information. -You will use the following. See output of `FVP_RD_Aspen --help` for full list and explanation. Options are case-sensitive. +While you can continue to use this method during debugging, it does not enable the Iris debug server in the model, so the system cannot be debugged from Arm Development Studio. Additional command-line options are required. -| Option | Alias | Notes | -|---------------------- |--------- |---------------------------------------------- | -| `--iris-server` | `-I` | Start Iris Debug Server | -| `--iris-port` | | Specify a port number (default = `7100`) | -| `--run` | `-R` | Run simulation when debug server started | -| `--iris-allow-remote` | `-A` | Allow remote connections (if different hosts) | +You will use the following options (see `FVP_RD_Aspen --help` for the full list). Options are case-sensitive. -### Launch FVP with additional options +| Option | Alias | Notes | +|-------------------------|:-----:|-------------------------------------------------------| +| `--iris-server` | `-I` | Start the Iris debug server | +| `--iris-port ` | | Set the Iris port (default `7100`) | +| `--run` | `-R` | Run the simulation when the debug server starts | +| `--iris-allow-remote` | `-A` | Allow remote connections (only if required) | -To launch the FVP with additional options, modify the above command by adding `--` and then the options. +## Enable the Iris debug server for Arm Development Studio -For example, to launch the model with the debug server and hold at the initial reset condition: +Append `--` to pass model options through `runfvp`. +Start the model with the debug server and hold at reset: ```command kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100" ``` -To launch the model and start running (so that it can start to boot up): - +Start the model with the debug server and begin execution so that boot can progress: ```command kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100 --run" ``` -To launch the model so that remote hosts can access it (not recommended if not needed), using options aliases: - +If required, allow remote debug connections using option aliases: ```command kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- -I -A --iris-port 7100" ``` {{% notice Note %}} -It is recommended to specify the port number used even if it is the default as that must match the debug connection setting (see later). +Even when using the default, specify the Iris port explicitly so it matches your debugger connection settings. 
If you enable remote connections, ensure your firewall allows inbound access to the chosen port. {{% /notice %}} diff --git a/content/learning-paths/automotive/zenacssdebug/primarycompute.md b/content/learning-paths/automotive/zenacssdebug/primarycompute.md index ae48551864..c3c41e4ae3 100644 --- a/content/learning-paths/automotive/zenacssdebug/primarycompute.md +++ b/content/learning-paths/automotive/zenacssdebug/primarycompute.md @@ -1,14 +1,13 @@ --- # User change -title: "Debug Primary Compute and Linux" +title: "Debug primary compute and Linux" weight: 8 # 1 is first, 2 is second, etc. # Do not modify these elements layout: "learningpathall" --- - -## Debug Primary Compute +## Debug primary compute The Primary Compute application processors (`Cortex-A720AE`) are the final processors to be enabled. @@ -16,11 +15,11 @@ As before, you can connect whilst powered down and monitor the point that they a You can debug the initialization code and the final Linux Operating System (OS) threads. -### Connect debugger to target +## Connect debugger to target Use the following debugger commands in the `Primary_init.launch` to load the symbols for the `BL2` initialization code, setting a breakpoint at `bl2_entrypoint`. -Note that an address "offset" is used to specify the exception level that the image is relevant to. If the processor changes exception level, the debug information would need to also be loaded to the corresponding EL address space. +Note that an address offset is used to specify the Exception Level (EL) that the image is relevant to. If the processor changes Exception Level, the debug information would need to also be loaded to the corresponding EL address space. For example the processors start in `EL3` and move to `EL2N` when the Linux kernel is enabled. @@ -29,43 +28,52 @@ stop add-symbol-file /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-a/2.11.0+git/image/firmware/bl2.elf EL3:0x0 tbreak bl2_entrypoint ``` + {{% notice Note %}} -Exact paths may differ for your set up. +Exact paths might differ depending on your build output. {{% /notice %}} -Run the code to the `bl2_entrypoint` and you can debug as expected. +Run to **bl2_entrypoint** and step through as required. -### Debug Linux kernel modules +{{% notice Tip %}} +Symbol loading is Exception Level–aware. If execution changes Exception Level, load symbols into the corresponding EL address space. For example, the processors start in EL3 and transition to EL2N when the Linux kernel is enabled. +{{% /notice %}} -To make use of the OS awareness feature, disconnect `Primary_init` and connect to `Primary_Linux` as created previously. Load the symbols from the `vmlinux` image. +## Debug the Linux kernel with OS awareness (symmetric multiprocessing) -``` text +Switch to the `Primary_Linux.launch` connection you created earlier to enable Arm Development Studio OS awareness for the Linux kernel. Load the kernel symbols and set source mapping if your kernel sources are located outside the default paths: + +```text stop add-symbol-file /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/linux-yocto/6.6.54+git/linux-fvp_rd_aspen-standard-build/vmlinux EL2N:0x0 set substitute-path /usr/src/kernel/ /arm-auto-solutions/build/tmp_baremetal/work-shared/fvp-rd-aspen/kernel-source/ ``` + Run the FVP until the OS prompt appears. {{% notice %}} -If you are only interested in kernel debug, modify the launch command for the FVP to include `--run` to start execution immediately. 
+If you only need kernel debugging, start the model with the debug server **and** begin execution immediately by adding `--run`: -``` command +```command kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100 --run" ``` {{% /notice %}} -You can now enable the `Threads` view in the `Debug Control` pane. +## View Linux threads with OS awareness -Right-click on the connection, and select `Display Threads`. You can also do this by entering `thread` in the `Command` pane. +Enable the **Threads** view to inspect kernel threads instead of raw CPUs: -The view will then change from listing the 16 application processors to the OS threads. +In **Debug Control**, right-click the **Primary_Linux** connection and select **Display Threads** +2. Alternatively, enter `thread` in the **Command** pane. + +The view changes from listing the 16 application processors to the active OS threads. {{% notice Note %}} -A warning of the form: -``` text +You might see a warning like: +```text WARNING(ROS60): Could not enable OS support as the OS does not appear to be initialized. This might be caused by a mismatch between the loaded symbols and the code on the target or because the OS is not up and running. Enabling OS support will be re-attempted when the target next stops. ``` -may be emitted if the OS is not booted when you connect. It can safely be ignored. +This occurs if the OS has not completed boot when you connect; it is safe to ignore and will clear after the next target stop. {{% /notice %}} You have successfully learnt how to use Arm Development Studio to explore and debug the Arm Zena CSS Reference Software Stack. diff --git a/content/learning-paths/automotive/zenacssdebug/rse.md b/content/learning-paths/automotive/zenacssdebug/rse.md index 007a7a17f9..0580a209aa 100644 --- a/content/learning-paths/automotive/zenacssdebug/rse.md +++ b/content/learning-paths/automotive/zenacssdebug/rse.md @@ -8,72 +8,80 @@ weight: 6 # 1 is first, 2 is second, etc. layout: "learningpathall" --- -## Debug RSE from reset +## Overview -Let us start by debugging the initial code that executes on the Cortex-M55 within the RSE block. +You'll now move on to debug the initial code that runs on the Runtime Security Engine (RSE) based on Cortex-M55 in the Zena CSS FVP. You will launch the model with the Iris debug server, connect from Arm Development Studio, load Trusted Firmware-M (TF‑M) symbols, and step from reset. -### Launch FVP +## Launch the FVP and hold at reset -Start a new `tmux` session for the FVP (if necessary): +Start a new `tmux` session for the FVP if needed: ```command tmux new-session -s arm-auto-solutions ``` -and navigate to your code repository. - -To debug from reset, launch the FVP with the Iris server but do not run. This will hold the FVP in the initial reset condition. +Navigate to your code repository, then launch the FVP with Iris **without** running so it stays at reset: ```command kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100" ``` -The FVP will start and generate various informational messages. Once initialized you should see something similar to: +The FVP initializes and prints information messages, for example: ```output ... 
Info: RD_Aspen: RD_Aspen.css.smb.rse_flashloader: FlashLoader: Saved 64MB to file '~/arm-auto-solutions/build/tmp_baremetal/deploy/images/fvp-rd-aspen/rse-flash-image.img' Info: RD_Aspen: RD_Aspen.ros.flash_loader: FlashLoader: Saved 128MB to file '~/arm-auto-solutions/build/tmp_baremetal/deploy/images/fvp-rd-aspen/ap-flash-image.img' ``` - Note that execution has not started. -### Connect the debugger +{{% notice Tip %}} +If you need remote debugging, start the FVP with `-A` and ensure the chosen Iris port (default `7100`) is reachable through your firewall. +{{% /notice %}} -Using the `RSE` connection created in the previous section, connect the debugger to the FVP. Observe that the processor is stopped before the first instruction has been executed. +## Connect the debugger to RSE (Cortex-M55) -In fact, the FVP is configured to have the vector table (`VTOR_S`) start at `0x11000000`, and if you inspect memory at that address the vector table will be populated. However no debug information is visible. Debug information must be loaded. +Use the **RSE** model connection you created earlier to attach the debugger. The processor is stopped before the first instruction. -In the `Debug Pane`, select `Load...` from the pane menu, and select `Add Symbols file`. +The FVP configures the secure vector table (**VTOR_S**) at `0x11000000`. If you inspect memory at that address, the vector table is populated, but source is not visible until you load symbols. -Browse to the `bl1_1.axf` file which is likely at: - -``` bash -/arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/build/bin/bl1_1.axf -``` -Debug symbols will be loaded, but likely no source will be displayed. This is because the build was performed within the virtual environment but the debugger is running outside of that. +Load TF‑M symbols and map sources: -You will be prompted to enter a path substitution to locate the sources. You can refer to the lowest common path so that all subsequent source files will also be located successfully. +- In **Debug Control**, open the pane menu and choose **Load...** +- Select **Add Symbols file**. +- Choose the TF‑M image, for example: + ```bash + /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/build/bin/bl1_1.axf + ``` +When prompted for **substitute path**, map build-time paths to your local sources, for example: + ```bash + /usr/src/debug/trusted-firmware-m/2.1.0/ + /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/git/tfm/ + ``` -``` bash -/usr/src/debug/trusted-firmware-m/2.1.0/ -/arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/git/tfm/" +Step one instruction to fetch the reset handler and stop there: +```text +stepi ``` -Finally, to perform a single instruction step (`stepi`) to allow the processor to fetch the address of the `Reset_Handler` and stop there. - -You can now step through the code, set breakpoints, and inspect the target as the code proceeds. -### Automate setup +You can now step through code, set breakpoints, and inspect the target as the code proceeds. -For convenience, it is possible to automate these actions every time you connect by entering them as `Debugger Commands` in the `.launch` configuration. +{{% notice Note %}} +Paths vary by environment. Use your actual build output and source locations when adding symbols or configuring path substitution. 
+{{% /notice %}} -Open (double-click) the `.launch` file, and navigate to the `Debugger` pane. +## Automate setup with debugger commands -Enable `Execute debugger commands`, and enter the following (note pathing for your setup). You can copy the exact commands from the `Command` or `History` pane whilst performing the above GUI configuration. +Automate the connection steps by adding **Debugger Commands** to the `.launch` configuration so they run on every attach: -It is recommended to have an explicit `stop` command as symbols cannot be loaded whilst the target is running. +- Open (double-click) your **RSE.launch** file. +- Go to the **Debugger** tab. +- Enable **Execute debugger commands**. +- Add commands similar to the following (adjust paths as needed). -``` text +```text stop add-symbol-file /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/build/bin/bl1_1.axf set substitute-path /usr/src/debug/trusted-firmware-m/2.1.0/ /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/trusted-firmware-m/2.1.0/git/tfm/ stepi ``` -![Debugger pane](debugger_commands.png) + +![RSE.launch in Arm Development Studio showing Debugger pane with TF-M symbols loaded and path substitution mapping alt-text#center](debugger_commands.png "RSE Debugger pane with TF-M symbol loading and source path substitution") + diff --git a/content/learning-paths/automotive/zenacssdebug/safetyisland.md b/content/learning-paths/automotive/zenacssdebug/safetyisland.md index 951e973531..02663a4d34 100644 --- a/content/learning-paths/automotive/zenacssdebug/safetyisland.md +++ b/content/learning-paths/automotive/zenacssdebug/safetyisland.md @@ -9,19 +9,25 @@ layout: "learningpathall" --- ## Debug Safety Island code from beginning -The Safety Island (Cortex-R82AE) is released from reset by the RSE code, and so the RSE code must proceed to that point before the Safety Island core can execute. +The Safety Island subsystem based on the Cortex-R82AE is released from reset by RSE code. To debug Safety Island from first instruction, you must let the RSE (Cortex‑M55) code reach the point where it enables Safety Island on the Zena CSS FVP. -### Launch FVP +## Launch the FVP and reconnect RSE -If necessary, restart the FVP in the reset state as before, and reconnect `RSE`. +If necessary, start (or restart) the FVP held at reset and reconnect the RSE model connection in Arm Development Studio: ```command kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose -- --iris-server --iris-port 7100" ``` -Set up the `SI` connection in a similar way as the `RSE` connection. Use the following commands in the `Debugger` pane. This will load debug symbols and perform the necessary path substitution. You can then set a breakpoint on the entry of the `SI` code, `arch_exception_reset`. +{{% notice Tip %}} +For remote debugging, add `-A` and ensure the chosen Iris port (default `7100`) is reachable. +{{% /notice %}} + +## Connect the debugger to Safety Island (Cortex-R82AE) -``` text +Configure the **SI** model connection similarly to **RSE**. 
Add the following **Debugger commands** to load symbols, set up source path substitution, and break at the Safety Island reset entry (`arch_exception_reset`): + +```text stop add-symbol-file /arm-auto-solutions/build/tmp_baremetal/deploy/images/fvp-rd-aspen/si0_ramfw.elf set substitute-path /usr/src/debug/scp-firmware/2.14.0/ /arm-auto-solutions/build/tmp_baremetal/work/fvp_rd_aspen-poky-linux/scp-firmware/2.14.0/git/ @@ -29,22 +35,22 @@ b arch_exception_reset ``` {{% notice Note %}} -Exact paths may differ for your set up. +Paths vary by environment. Use your actual build output and source locations when adding symbols or configuring path substitution. {{% /notice %}} -### Start execution +## Start execution to release Safety Island -Select the `RSE` connection in the `Debug Control` pane, and start execution (this will be unavailable in the `SI` connection, as that is currently powered down). +In **Debug Control**, select the **RSE** connection and start execution (run). The **SI** connection remains unavailable to run until Safety Island is powered up. -The `RSE` code will run until the point that the `SI` is enabled. This is reflected in the output log. +When RSE enables Safety Island, you will see a log message like: -``` output +```output [INF] BL2: SI CL0 post load start ``` -#### Full output log +## Full output log -The full output lof is shown here for your reference: +The full output log is shown here for your reference: ``` output Trying ::1... diff --git a/content/learning-paths/automotive/zenacssdebug/zena.md b/content/learning-paths/automotive/zenacssdebug/zena.md index fffc489bbb..a6a8edb1e9 100644 --- a/content/learning-paths/automotive/zenacssdebug/zena.md +++ b/content/learning-paths/automotive/zenacssdebug/zena.md @@ -8,66 +8,66 @@ weight: 2 # 1 is first, 2 is second, etc. layout: "learningpathall" --- -# Arm Zena Compute Subsystem +## Arm Zena Compute Subsystem -The Arm Zena Compute Subsystem (CSS) consists of a high-performance Arm Cortex-A720AE Application Processor (Primary Compute) system augmented with an Arm Cortex-R82AE based Safety Island (SI) and real-time domain to host additional system safety monitoring and real-time services. +The Arm Zena Compute Subsystem (CSS) consists of a high-performance Arm Cortex-A720AE application processor system (primary compute), augmented with an Arm Cortex-R82AE–based Safety Island (SI) and a real-time domain to host additional system-safety monitoring and real-time services. -The system additionally includes a Runtime Security Engine (RSE) used for the secure boot of the system elements and the runtime secure services. +The system also includes a Runtime Security Engine (RSE), which is used for secure boot of the system elements and to provide runtime secure services. -The Arm Zena CSS software stack provides an open-source, integrated solution running on a Fixed Virtual Platform (FVP). +The Arm Zena CSS Reference Software Stack provides an open-source, integrated solution running on a Fixed Virtual Platform (FVP). Both the reference software stack and the FVP are freely available. -The reference software stack and the FVP are freely available. +For more information, see [Arm Zena Compute Subsystem (CSS)](https://developer.arm.com/Compute%20Subsystems/Arm%20Zena%20Compute%20Subsystem). -For more information, see [Arm Zena Compute Subsystem (CSS)](https://developer.arm.com/Compute%20Subsystems/Arm%20Zena%20Compute%20Subsystem) and associated links. 
+## Build the software stack -## Build software stack +Follow the steps to download and build the software stack in the [Arm Zena CSS User Guide](https://arm-auto-solutions.docs.arm.com/en/v2.0/rd-aspen/user_guide/reproduce.html). -Follow the steps to download and build the software stack in the [User Guide](https://arm-auto-solutions.docs.arm.com/en/v2.0/rd-aspen/user_guide/reproduce.html). - -The default `Arm Automotive Solutions Demo` build is used. +Here the default **Arm Automotive Solutions Demo** build is used. {{% notice Note %}} -The focus of this Learning Path is to demonstrate the **debug** of the software stack. +The primary focus of this Learning Path is to demonstrate how to debug the software stack. {{% /notice %}} -## Verify correct build and execution +## Verify the build and execution -Once the software stack has been built, you can verify that it runs successfully with the command: +After you build the software stack, verify that it runs successfully: -``` command +```bash kas shell -c "../layers/meta-arm/scripts/runfvp -t tmux --verbose" ``` -The system will run through the boot process until a Linux prompt is available (in `terminal_ns_uart0`). +The system runs through the boot process until a Linux prompt is available (in `terminal_ns_uart0`). -Use `Ctrl+C` on the command terminal to terminate. +Press **Ctrl+C** in the command terminal to terminate the process. ## Install FVP (optional) -The FVP is downloaded and installed as part of the build process above. +The FVP is downloaded and installed as part of the build process. -The `Arm-Zena-CSS-FVP` can also be independently downloaded from the Arm Developer [website](https://developer.arm.com/Tools%20and%20Software/Fixed%20Virtual%20Platforms/Automotive%20FVPs). +You can also separately download Arm-Zena-CSS-FVP from the Arm Developer [website](https://developer.arm.com/Tools%20and%20Software/Fixed%20Virtual%20Platforms/Automotive%20FVPs). See also the Arm Ecosystem FVPs and Architecture Envelope Models [Install Guide](/install-guides/fm_fvp/eco_fvp/). {{% notice Note %}} -For legacy reasons the FVP is named is `FVP_RD_Aspen`. +For legacy reasons, the FVP is named `FVP_RD_Aspen`. {{% /notice %}} -# Arm Development Studio +## Arm Development Studio + +Arm Development Studio is a software development environment with multicore debug support for Arm CPUs. It provides early support for the latest processors and works seamlessly with FVPs. -Arm Development Studio is a software development solution with support of multicore debug for Arm CPUs. It provides the earliest support for the latest processors. +The CPUs implemented within Arm Zena CSS are supported by Arm Development Studio 2024.0 and later; however, 2024.1 or later is recommended for Linux OS debug support. At the time of writing, the latest version is 2025.0, which is used for this Learning Path. -The CPUs implemented within Arm Zena CSS are supported by Arm Development Studio 2024.0 and later, though 2024.1 or later is recommended for appropriate Linux OS support. At time of writing the latest version available is 2025.0, and that is the version used for this learning path. +For more information, see [Arm Development Studio](https://developer.arm.com/Tools%20and%20Software/Arm%20Development%20Studio). -For more information see [Arm Development Studio](https://developer.arm.com/Tools%20and%20Software/Arm%20Development%20Studio). +Arm Development Studio is a commercial, license-managed product. 
For installation and setup instructions, see the [Arm Development Studio Install Guide](/install-guides/armds/). -Arm Development Studio is a commercial, license managed, product. For installation and set up instructions, see this [Install Guide](/install-guides/armds/). +Launch the IDE and create a new workspace folder. -Launch the IDE. It is recommended to create a new workspace folder. +If you’re prompted by the launcher (this prompt is disabled by default), create a new folder there. -If prompted by the launcher (this is disabled by default) create a new folder there, else select `File` > `Switch Workspace` > `Other...`. +Otherwise, select **File** > **Switch Workspace** > **Other**. {{% notice Note %}} -To enable this prompt by default, navigate to `Window` > `Preferences` > `General` > `Startup and Shutdown` > `Workspaces`, and enable `Prompt for workspace on startup`. +To enable the workspace prompt, go to **Window** > **Preferences** > **General** > **Startup and Shutdown** > **Workspaces**, and enable **Prompt for workspace on startup**. {{% /notice %}} diff --git a/content/learning-paths/cross-platform/intrinsics/_index.md b/content/learning-paths/cross-platform/intrinsics/_index.md index 9f286a16ca..d6eea54443 100644 --- a/content/learning-paths/cross-platform/intrinsics/_index.md +++ b/content/learning-paths/cross-platform/intrinsics/_index.md @@ -23,8 +23,7 @@ author: Jason Andrews test_images: - amd64/ubuntu:latest - arm64v8/ubuntu:latest -test_link: https://github.com/armflorentlebeau/arm-learning-paths/actions/runs/4312122327 -test_maintenance: true +test_maintenance: false ### Tags skilllevels: Advanced diff --git a/content/learning-paths/cross-platform/topdown-compare/1-top-down.md b/content/learning-paths/cross-platform/topdown-compare/1-top-down.md new file mode 100644 index 0000000000..de65d5cd6f --- /dev/null +++ b/content/learning-paths/cross-platform/topdown-compare/1-top-down.md @@ -0,0 +1,197 @@ +--- +title: Top-down performance analysis +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## What are the differences between Arm and x86 PMU counters? + +This is a common question from software developers and performance engineers. + +Both Arm and x86 CPUs provide sophisticated Performance Monitoring Units (PMUs) with hundreds of hardware counters. Instead of trying to list all available counters and compare microarchitecture, it makes more sense to focus on the performance methodologies they enable and the calculations used for performance metrics. + +While the specific counter names and formulas differ between architectures, both have converged on top-down performance analysis methodologies that categorize performance bottlenecks into four buckets: Retiring, Bad Speculation, Frontend Bound, and Backend Bound. + +This Learning Path provides a comparison of how Arm and x86 processors implement top-down +analysis, highlighting the similarities in approach while explaining the architectural differences in counter events and formulas. + +## Introduction to top-down performance analysis + +Top-down methodology makes performance analysis easier by shifting focus from individual performance +counters to pipeline slot utilization. Instead of trying to interpret dozens of seemingly unrelated metrics, you can systematically identify bottlenecks by attributing each CPU pipeline slot to one of four categories. 
+ +- Retiring: pipeline slots that successfully complete useful work +- Bad Speculation: slots wasted on mispredicted branches +- Frontend Bound: slots stalled due to instruction fetch/decode limitations +- Backend Bound: slots stalled due to execution resource constraints + +The methodology uses a hierarchical approach that allows you to drill down only into the dominant bottleneck category, and avoid the complexity of analyzing all possible performance issues at the same time. + +The next sections compare the Intel x86 methodology with the Arm top-down methodology. AMD also has an equivalent top-down methodology which is similar to Intel, but uses different counters and calculations. + +## Intel x86 top-down methodology + +Intel uses a slot-based accounting model where each CPU cycle provides multiple issue slots. A slot is a hardware resource needed to process operations. More slots means more work can be done. The number of slots depends on the design but current processor designs have 4, 6, or 8 slots. + +### Hierarchical Structure + +Intel uses a multi-level hierarchy that typically extends to 4 levels of detail. + +**Level 1 (Top-Level):** + +At Level 1, all pipeline slots are attributed to one of four categories, providing a high-level view of whether the CPU is doing useful work or stalling. + +- Retiring = `UOPS_RETIRED.RETIRE_SLOTS / SLOTS` +- Bad Speculation = `(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + N * RECOVERY_CYCLES) / SLOTS` +- Frontend Bound = `IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS` +- Backend Bound = `1 - (Frontend + Bad Spec + Retiring)` + +Where `SLOTS = 4 * CPU_CLK_UNHALTED.THREAD` on most Intel cores. + +**Level 2 breakdown:** + +Level 2 drills into each of these to identify broader causes, such as distinguishing between frontend latency and bandwidth limits, or between memory and core execution stalls in the backend. + +- Frontend Bound covers frontend latency vs. frontend bandwidth +- Backend Bound covers memory bound vs. core bound +- Bad Speculation covers branch mispredicts vs. machine clears +- Retiring covers base vs. microcode sequencer + +**Level 3 breakdown:** + +Level 3 provides fine-grained attribution, pinpointing specific bottlenecks like DRAM latency, cache misses, or port contention, which makes it possible to identify the exact root cause and apply targeted optimizations. + +- Memory Bound includes L1 Bound, L2 Bound, L3 Bound, DRAM Bound, Store Bound +- Core Bound includes Divider, Ports Utilization +- And many more specific categories + +**Level 4 breakdown:** + +Level 4 provides the specific microarchitecture events that cause the inefficiencies. 
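+
+As an illustration of how the Level 1 formulas fit together, the short C program below computes the four categories from a set of raw counter values. The counter numbers are hypothetical placeholders (roughly the magnitudes you might read back from `perf stat` for a backend-bound program), not measurements; only the formulas come from the description above, with `N = 4` used as the issue width in the Bad Speculation term.
+
+```C
+#include <stdio.h>
+
+int main(void) {
+    /* Hypothetical raw counter readings for one measurement interval */
+    double uops_retired_slots = 3.275e9;   /* UOPS_RETIRED.RETIRE_SLOTS   */
+    double uops_issued_any    = 3.279e9;   /* UOPS_ISSUED.ANY             */
+    double idq_uops_not_deliv = 2.8e7;     /* IDQ_UOPS_NOT_DELIVERED.CORE */
+    double recovery_cycles    = 1.0e6;     /* INT_MISC.RECOVERY_CYCLES    */
+    double cpu_clk_unhalted   = 1.92e10;   /* CPU_CLK_UNHALTED.THREAD     */
+
+    /* SLOTS = 4 * CPU_CLK_UNHALTED.THREAD on a 4-wide core */
+    double slots = 4.0 * cpu_clk_unhalted;
+
+    double retiring        = uops_retired_slots / slots;
+    double bad_speculation = (uops_issued_any - uops_retired_slots
+                              + 4.0 * recovery_cycles) / slots;
+    double frontend_bound  = idq_uops_not_deliv / slots;
+    double backend_bound   = 1.0 - (retiring + bad_speculation + frontend_bound);
+
+    printf("Retiring:        %5.1f%%\n", 100.0 * retiring);
+    printf("Bad speculation: %5.1f%%\n", 100.0 * bad_speculation);
+    printf("Frontend bound:  %5.1f%%\n", 100.0 * frontend_bound);
+    printf("Backend bound:   %5.1f%%\n", 100.0 * backend_bound);
+    return 0;
+}
+```
+
+With these placeholder values the program reports a workload that is almost entirely backend bound, which is the pattern you will reproduce with `perf stat` later in this Learning Path.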
+ +### Key Performance Events + +Intel processors expose hundreds of performance events, but top-down analysis relies on a core set: + +| Event Name | Purpose | +| :---------------------------------------------- | :----------------------------------------------------------------------------------- | +| `UOPS_RETIRED.RETIRE_SLOTS` | Count retired micro-operations (Retiring) | +| `UOPS_ISSUED.ANY` | Count issued micro-operations (helps quantify Bad Speculation) | +| `IDQ_UOPS_NOT_DELIVERED.CORE` | Frontend delivery failures (Frontend Bound) | +| `CPU_CLK_UNHALTED.THREAD` | Core clock cycles (baseline for normalization) | +| `BR_MISP_RETIRED.ALL_BRANCHES` | Branch mispredictions (Bad Speculation detail) | +| `MACHINE_CLEARS.COUNT` | Pipeline clears due to memory ordering or faults (Bad Speculation detail) | +| `CYCLE_ACTIVITY.STALLS_TOTAL` | Total stall cycles (baseline for backend breakdown) | +| `CYCLE_ACTIVITY.STALLS_MEM_ANY` | Aggregate stalls from memory hierarchy misses (Backend → Memory Bound) | +| `CYCLE_ACTIVITY.STALLS_L1D_MISS` | Stalls due to L1 data cache misses | +| `CYCLE_ACTIVITY.STALLS_L2_MISS` | Stalls waiting on L2 cache misses | +| `CYCLE_ACTIVITY.STALLS_L3_MISS` | Stalls waiting on last-level cache misses | +| `MEM_LOAD_RETIRED.L1_HIT` / `L2_HIT` / `L3_HIT` | Track where loads are satisfied in the cache hierarchy | +| `MEM_LOAD_RETIRED.L3_MISS` | Loads missing LLC and going to memory | +| `MEM_LOAD_RETIRED.DRAM_HIT` | Loads serviced by DRAM (DRAM Bound detail) | +| `OFFCORE_RESPONSE.*` | Detailed classification of off-core responses (L3 vs. DRAM, local vs. remote socket) | + + +Using the above levels of metrics you can find out which of the 4 top-level categories are causing bottlenecks. + +### Arm top-down methodology + +Arm developed a similar top-down methodology for Neoverse server cores. The Arm architecture uses an 8-slot rename unit for pipeline bandwidth accounting. + +### Two-Stage Approach + +Unlike Intel's hierarchical model, Arm employs a two-stage methodology: + +**Stage 1: Topdown analysis** + +- Identifies high-level bottlenecks using the same four categories +- Uses Arm-specific PMU events and formulas +- Slot-based accounting similar to Intel but with Arm event names + +**Stage 2: Micro-architecture exploration** + +- Resource-specific effectiveness metrics grouped by CPU component +- Industry-standard metrics like MPKI (Misses Per Kilo Instructions) +- Detailed breakdown without strict hierarchical drilling + +### Stage 1 formulas + +Arm uses different top-down metrics based on different events but the concept is similar. 
+ +| Metric | Formula | Purpose | +| :-- | :-- | :-- | +| Backend bound | `100 * (STALL_SLOT_BACKEND / (CPU_CYCLES * 8))` | Backend resource constraints | +| Frontend bound | `100 * ((STALL_SLOT_FRONTEND / (CPU_CYCLES * 8)) - (BR_MIS_PRED / (4 * CPU_CYCLES)))` | Frontend delivery issues | +| Bad speculation | `100 * (1 - (OP_RETIRED/OP_SPEC)) * (1 - (STALL_SLOT/(CPU_CYCLES * 8))) + (BR_MIS_PRED / (4 * CPU_CYCLES))` | Misprediction recovery | +| Retiring | `100 * (OP_RETIRED/OP_SPEC) * (1 - (STALL_SLOT/(CPU_CYCLES * 8)))` | Useful work completed | + +### Stage 2 resource groups + +Instead of hierarchical levels, Arm organizes detailed metrics into effectiveness groups as shown below: + +- Branch Effectiveness: Misprediction rates, MPKI +- ITLB/DTLB Effectiveness: Translation lookaside buffer efficiency +- L1I/L1D/L2/LL Cache Effectiveness: Cache hit ratios and MPKI +- Operation Mix: Breakdown of instruction types (SIMD, integer, load/store) +- Cycle Accounting: Frontend vs. backend stall percentages + +### Key performance events + +Neoverse cores expose approximately 100 hardware events optimized for server workloads, including: + +| Event Name | Purpose / Usage | +| :-------------------- | :--------------------------------------------------------------------------------------- | +| `CPU_CYCLES` | Core clock cycles (baseline for normalization). | +| `OP_SPEC` | Speculatively executed micro-operations (used as slot denominator). | +| `OP_RETIRED` | Retired micro-operations (used to measure useful work). | +| `INST_RETIRED` | Instructions retired (architectural measure; used for IPC, MPKI normalization). | +| `INST_SPEC` | Instructions speculatively executed (needed for operation mix and speculation analysis). | +| `STALL_SLOT` | Total stall slots (foundation for efficiency metrics). | +| `STALL_SLOT_FRONTEND` | Stall slots due to frontend resource constraints. | +| `STALL_SLOT_BACKEND` | Stall slots due to backend resource constraints. | +| `BR_RETIRED` | Branches retired (baseline for branch misprediction ratio). | +| `BR_MIS_PRED_RETIRED` | Mispredicted branches retired (branch effectiveness, speculation waste). | +| `L1I_CACHE_REFILL` | Instruction cache refills (frontend stalls due to I-cache misses). | +| `ITLB_WALK` | Instruction TLB walks (frontend stalls due to translation). | +| `L1D_CACHE_REFILL` | Data cache refills (backend stalls due to L1D misses). | +| `L2D_CACHE_REFILL` | Unified L2 cache refills (backend stalls from L2 misses). | +| `LL_CACHE_MISS_RD` | Last-level/system cache read misses (backend stalls from LLC/memory). | +| `DTLB_WALK` | Data TLB walks (backend stalls due to translation). | +| `MEM_ACCESS` | Total memory accesses (baseline for cache/TLB effectiveness ratios). | + + +## Arm compared to x86 + +### Conceptual similarities + +Both architectures adhere to the same fundamental top-down performance analysis philosophy: + +1. Four-category classification: Retiring, Bad Speculation, Frontend Bound, Backend Bound +2. Slot-based accounting: Pipeline utilization measured in issue or rename slots +3. Hierarchical analysis: Broad classification followed by drill-down into dominant bottlenecks +4. 
Resource attribution: Map performance issues to specific CPU micro-architectural components + +### Key differences + +| Aspect | x86 Intel | Arm Neoverse | +| :-- | :-- | :-- | +| Hierarchy Model | Multi-level tree (Level 1 → Level 2 → Level 3+) | Two-stage: Topdown Level 1 + Resource Groups | +| Slot Width | 4 issue slots per cycle (typical) | 8 rename slots per cycle (Neoverse V1) | +| Formula Basis | Micro-operation (uop) centric | Operation and cycle centric | +| Event Naming | Intel-specific mnemonics | Arm-specific mnemonics | +| Drill-down Strategy | Strict hierarchical descent | Exploration by resource groups | + +### Event mapping examples + +| Performance Question | x86 Intel Events | Arm Neoverse Events | +| :-- | :-- | :-- | +| Frontend bound? | `IDQ_UOPS_NOT_DELIVERED.*` | `STALL_SLOT_FRONTEND` | +| Bad speculation? | `BR_MISP_RETIRED.*` | `BR_MIS_PRED_RETIRED` | +| Memory bound? | `CYCLE_ACTIVITY.STALLS_L3_MISS` | `L1D_CACHE_REFILL`, `L2D_CACHE_REFILL` | +| Cache effectiveness? | `MEM_LOAD_RETIRED.L3_MISS_PS` | Cache refill metrics / Cache access metrics | + +While it doesn't make sense to directly compare PMU counter values between the Arm and x86 architectures, understanding the top-down methodology for each lets you perform effective performance analysis and compare how your code behaves on each architecture. + +Continue to the next step to try a code example. \ No newline at end of file diff --git a/content/learning-paths/cross-platform/topdown-compare/2-code-examples.md b/content/learning-paths/cross-platform/topdown-compare/2-code-examples.md new file mode 100644 index 0000000000..1050cceb5b --- /dev/null +++ b/content/learning-paths/cross-platform/topdown-compare/2-code-examples.md @@ -0,0 +1,268 @@ +--- +title: Performance analysis code example +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Example code + +To compare top-down analysis on Arm and x86, you can run a small example and gain some practical experience. + +You can prepare the application and test it on both x86 and Arm Linux systems. You will need a C compiler, either [GCC](/install-guides/gcc/native/) or Clang, and [Perf](/install-guides/perf/) installed on each system. Refer to the package manager for your Linux distribution for installation information. + +Use a text editor to copy the code below to a file named `test.c`: + +```C +#include <stdio.h> +#include <stdlib.h> + +int main(int argc, char *argv[]) { + if (argc != 2) { + fprintf(stderr, "Usage: %s <num_iterations>\n", argv[0]); + return 1; + } + + long long num_iterations = atoll(argv[1]); + if (num_iterations <= 0) { + fprintf(stderr, "Number of iterations must be a positive integer.\n"); + return 1; + } + + // Using volatile tells the compiler not to optimize this variable away. + // We initialize it to a non-trivial value. + volatile double result = 1.23456789; + + printf("Performing %lld dependent floating-point divisions...\n", num_iterations); + + // This loop creates a long dependency chain of floating-point divisions. + // Division is a high-latency operation. The dependency between iterations + // means the CPU backend will be stalled waiting for the result of the + // previous division before it can start the next one. This creates a + // classic backend-bound scenario, specifically core-bound. + for (long long i = 0; i < num_iterations; ++i) { + result /= 1.00000001; + } + + printf("Done. Final result: %f\n", (double)result); + + return 0; +} +``` + +This program takes a single command-line argument specifying the number of iterations to run.
It performs that many sequential floating-point divisions in a loop, using a volatile variable to prevent compiler optimization, and prints the final result. + +It's a contrived example used to create a dependency chain of high-latency operations (divisions), simulating a CPU-bound workload where each iteration must wait for the previous one to finish. + +Build the application using GCC: + +```console +gcc -O3 -march=native -o test test.c +``` + +You can also use Clang by substituting `clang` instead of `gcc` in the command above. + +Run the application and pin it to one core to make the numbers more consistent: + +```console +taskset -c 1 ./test 1000000000 +``` + +The output is similar to: + +```output +Performing 1000000000 dependent floating-point divisions... +Done. Final result: 0.000056 +``` + +## Collect x86 top-down level 1 metrics + +Linux Perf computes the top-down level 1 breakdown described in the previous section: Retiring, Bad Speculation, Frontend Bound, and Backend Bound. + +Use `perf stat` on the pinned core to collect the metrics: + +```console +taskset -c 1 perf stat -C 1 --topdown ./test 1000000000 +``` + +The output is similar to: + +```output +Performing 1000000000 dependent floating-point divisions... +Done. Final result: 0.000056 + + Performance counter stats for 'CPU(s) 1': + + retiring bad speculation frontend bound backend bound +S0-D0-C1 1 8.5% 0.0% 0.1% 91.4% + + 6.052117775 seconds time elapsed +``` + +You see a very large `backend bound` component for this program. + +You can also run with the `-M topdownl1` argument on Perf. + +```console +taskset -c 1 perf stat -C 1 -M topdownl1 ./test 1000000000 +``` + +The output is similar to: + +```output +Performing 1000000000 dependent floating-point divisions... +Done. Final result: 0.000056 + + Performance counter stats for 'CPU(s) 1': + + 3,278,902,619 uops_issued.any # 0.00 Bad_Speculation (14.30%) + 19,185,808,092 cpu_clk_unhalted.thread # 0.04 Retiring (14.30%) + 3,275,536,897 uops_retired.retire_slots (14.30%) + 1,065,517 int_misc.recovery_cycles (14.30%) + 3,263,874,383 uops_issued.any # 0.96 Backend_Bound (14.33%) + 28,107,558 idq_uops_not_delivered.core (28.64%) + 631,768 int_misc.recovery_cycles (42.90%) + 19,173,526,414 cpu_clk_unhalted.thread (57.17%) + 19,176,373,078 cpu_clk_unhalted.thread # 0.00 Frontend_Bound (42.79%) + 25,090,380 idq_uops_not_delivered.core (42.79%) + cpu_clk_unhalted.thread + + 6.029283206 seconds time elapsed +``` + +Again, the `Backend_Bound` value is very high (0.96). + +If you want to learn more, you can continue with the level 2 and level 3 analysis. + + +## Use the Arm top-down methodology + +Make sure you have the Arm top-down tool installed. + +Use the [Telemetry Solution install guide](/install-guides/topdown-tool/) for information about installing `topdown-tool`. + +Collect instructions per cycle (IPC): + +```console +taskset -c 1 topdown-tool -m General ./test 1000000000 +``` + +The output is similar to: + +```output +Performing 1000000000 dependent floating-point divisions... +Done. Final result: 0.000056 +Stage 2 (uarch metrics) +======================= +[General] +Instructions Per Cycle 0.355 per cycle +``` + +Collect the stage 1 metrics: + +```console +taskset -c 1 topdown-tool -m Cycle_Accounting ./test 1000000000 +``` + +The output is similar to: + +```output +Performing 1000000000 dependent floating-point divisions... +Done.
Final result: 0.000056 +Stage 1 (Topdown metrics) +========================= +[Cycle Accounting] +Frontend Stalled Cycles 0.04% cycles +Backend Stalled Cycles. 88.15% cycles +``` + +This confirms that the example has a high proportion of backend stalls, just as on x86. + +You can continue to use `topdown-tool` for additional microarchitecture exploration. + +For L1 data cache: + +```console +taskset -c 1 topdown-tool -m L1D_Cache_Effectiveness ./test 1000000000 +``` + +The output is similar to: + +```output +Performing 1000000000 dependent floating-point divisions... +Done. Final result: 0.000056 +Stage 2 (uarch metrics) +======================= +[L1 Data Cache Effectiveness] +L1D Cache MPKI............... 0.023 misses per 1,000 instructions +L1D Cache Miss Ratio......... 0.000 per cache access +``` + +To examine the L1 instruction cache, run the same command with the `L1I_Cache_Effectiveness` metric group. + +For last level cache: + +```console +taskset -c 1 topdown-tool -m LL_Cache_Effectiveness ./test 1000000000 +``` + +The output is similar to: + +```output +Performing 1000000000 dependent floating-point divisions... +Done. Final result: 0.000056 +Stage 2 (uarch metrics) +======================= +[Last Level Cache Effectiveness] +LL Cache Read MPKI.............. 0.017 misses per 1,000 instructions +LL Cache Read Miss Ratio........ 0.802 per cache access +LL Cache Read Hit Ratio......... 0.198 per cache access +``` + +For operation mix: + +```console +taskset -c 1 topdown-tool -m Operation_Mix ./test 1000000000 +``` + +The output is similar to: + +```output +Performing 1000000000 dependent floating-point divisions... +Done. Final result: 0.000056 +Stage 2 (uarch metrics) +======================= +[Speculative Operation Mix] +Load Operations Percentage.......... 16.70% operations +Store Operations Percentage......... 16.59% operations +Integer Operations Percentage....... 33.61% operations +Advanced SIMD Operations Percentage. 0.00% operations +Floating Point Operations Percentage 16.45% operations +Branch Operations Percentage........ 16.65% operations +Crypto Operations Percentage........ 0.00% operations +``` + + +## Summary + +Both Arm Neoverse and modern x86 cores expose hardware events that Perf aggregates into the same top-down categories. The names of the PMU counters differ, but the level 1 categories are the same. + +If you work on both architectures, you can use the same framework to systematically identify and resolve performance bottlenecks, with only minor differences between Intel's hierarchical structure and Arm's two-stage resource groups. + diff --git a/content/learning-paths/cross-platform/topdown-compare/_index.md b/content/learning-paths/cross-platform/topdown-compare/_index.md new file mode 100644 index 0000000000..275149ff08 --- /dev/null +++ b/content/learning-paths/cross-platform/topdown-compare/_index.md @@ -0,0 +1,59 @@ +--- +title: "Compare Arm and x86 Top-Down Performance Analysis" + +minutes_to_complete: 30 + +draft: true +cascade: + draft: true + +who_is_this_for: This is an advanced topic for software developers who want to understand the similarities and differences between Arm and x86 top-down performance analysis.
+ +learning_objectives: + - Describe the similarities and differences between top-down performance analysis on x86 and Arm Linux systems. + - Run applications on both architectures and understand how performance analysis is done on each system. + +prerequisites: + - Familiarity with performance analysis on Linux systems using Perf. + - Arm and x86 Linux systems to try code examples. + +author: + - Jason Andrews + +### Tags +skilllevels: Advanced +subjects: Performance and Architecture +armips: + - Neoverse +operatingsystems: + - Linux +tools_software_languages: + - GCC + - Clang + +shared_path: true +shared_between: + - servers-and-cloud-computing + - automotive + +further_reading: + - resource: + title: Arm Neoverse V1 Top-down Methodology for Performance Analysis & Telemetry Specification + link: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/arm-neoverse-v1-top-down-methodology + type: blog + - resource: + title: Performance Analysis and Tuning on Modern CPUs + link: https://www.amazon.com/Performance-Analysis-Tuning-Modern-CPUs/dp/B0DNQZJ92S + type: documentation + - resource: + title: How to use the Arm Performance Monitoring Unit and System Counter + link: https://learn.arm.com/learning-paths/servers-and-cloud-computing/arm_pmu/ + type: website + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/cross-platform/topdown-compare/_next-steps.md b/content/learning-paths/cross-platform/topdown-compare/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/cross-platform/topdown-compare/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/cross-platform/vectorization-comparison/1-vectorization.md b/content/learning-paths/cross-platform/vectorization-comparison/1-vectorization.md index d17480fcbc..b83ed43fbc 100644 --- a/content/learning-paths/cross-platform/vectorization-comparison/1-vectorization.md +++ b/content/learning-paths/cross-platform/vectorization-comparison/1-vectorization.md @@ -1,28 +1,30 @@ --- -title: Migrating SIMD code to the Arm architecture +title: "Migrate SIMD code to the Arm architecture" weight: 3 -### FIXED, DO NOT MODIFY -layout: learningpathall +# FIXED, DO NOT MODIFY +layout: "learningpathall" --- -## Vectorization on x86 vs. Arm +## Vectorization on x86 and Arm -Migrating SIMD (Single Instruction, Multiple Data) code from x86 extensions to Arm extensions is an important task for software developers aiming to optimize performance on Arm platforms.
+Migrating SIMD (Single Instruction, Multiple Data) code from x86 extensions to Arm extensions is a key task for software developers aiming to optimize performance on Arm platforms. -Understanding the mapping between x86 instruction sets like SSE, AVX, and AMX to Arm's NEON, SVE, and SME extensions is essential for ensuring portability and high performance. This Learning Path provides an overview to help you design a migration plan, leveraging Arm features such as scalable vector lengths and advanced matrix operations, to effectively adapt your code. +Understanding the mapping from x86 instruction sets such as SSE, AVX, and AMX to Arm’s NEON, SVE, and SME extensions is essential for achieving portability and high performance. This Learning Path provides an overview to help you design a migration plan in which you can leverage Arm features such as scalable vector lengths and advanced matrix operations to adapt your code effectively. -Vectorization is a key optimization strategy where one instruction processes multiple data elements simultaneously. It drives performance in HPC, AI/ML, signal processing, and data analytics. +Vectorization is a key optimization strategy where one instruction processes multiple data elements simultaneously. It drives performance in High-Performance Computing (HPC), AI and ML, signal processing, and data analytics. -Both x86 and Arm processors offer rich SIMD capabilities, but they differ in philosophy and design. The x86 architecture provides fixed-width vector units of 128, 256, and 512 bits. The Arm architecture offers a mix of fixed-width, for NEON, and scalable vectors for SVE and SME ranging from 128 to 2048 bits. +Both x86 and Arm processors offer rich SIMD capabilities, but they differ in philosophy and design. The x86 architecture provides fixed-width vector units of 128, 256, and 512 bits. The Arm architecture offers fixed-width vectors for NEON and scalable vectors for SVE and SME, ranging from 128 to 2048 bits. -If you are interested in migrating SIMD software to Arm, understanding these differences ensures portable, high-performance code. +If you are migrating SIMD software to Arm, understanding these differences will help you write portable, high-performance code. ## Arm vector and matrix extensions +This section provides some more information about the Arm vector and matrix extensions and shows you when to use each, how they map from SSE/AVX/AMX, and what changes in your programming model (predication, gather/scatter, tiles, streaming mode). + ### NEON -NEON is a 128-bit SIMD extension available across all Armv8 cores, including both mobile and Neoverse platforms. It is particularly well-suited for multimedia processing, digital signal processing (DSP), and packet processing workloads. Conceptually, NEON is equivalent to x86 SSE or AVX-128, making it the primary target for migrating SSE workloads. Compiler support for auto-vectorization to NEON is mature, simplifying the migration process for developers. +NEON is a 128-bit SIMD extension available across Armv8-A cores, including Neoverse and mobile. It is well suited to multimedia, DSP, and packet processing. Conceptually, NEON is closest to x86 SSE and AVX used in 128-bit mode, making it the primary target when migrating many SSE workloads. Compiler auto-vectorization to NEON is mature, reducing the need for manual intrinsics. 
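Where intrinsics are still needed, many SSE operations map one-to-one onto NEON. The sketch below is a minimal illustration of that mapping for a multiply-accumulate loop; the function name is only illustrative, and the SSE intrinsics mentioned in the comments are there for comparison:

```c
#include <arm_neon.h>

/* Compute y[i] += a * x[i], four floats per iteration.
 * An SSE version of the same loop would use _mm_loadu_ps, _mm_mul_ps and
 * _mm_add_ps (or _mm_fmadd_ps with FMA), and _mm_storeu_ps. */
void axpy_f32(float a, const float *x, float *y, int n) {
    int i = 0;
    for (; i + 4 <= n; i += 4) {
        float32x4_t vx = vld1q_f32(x + i);   /* like _mm_loadu_ps */
        float32x4_t vy = vld1q_f32(y + i);
        vy = vfmaq_n_f32(vy, vx, a);         /* fused multiply-add with a scalar broadcast */
        vst1q_f32(y + i, vy);                /* like _mm_storeu_ps */
    }
    for (; i < n; ++i) {                     /* scalar tail for any leftover elements */
        y[i] += a * x[i];
    }
}
```

Header libraries such as sse2neon can perform this kind of mapping automatically for existing SSE code; they are covered in the migration tools section later in this Learning Path.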
### Scalable Vector Extension (SVE) @@ -30,116 +32,100 @@ SVE introduces a revolutionary approach to SIMD with its vector-length agnostic ### Scalable Matrix Extension (SME) -SME is designed to accelerate matrix multiplication and is similar to AMX. Unlike AMX, which relies on dot-product-based operations, SME employs outer-product-based operations, providing greater flexibility for custom AI and HPC kernels. SME integrates seamlessly with SVE, utilizing scalable tiles and a streaming mode to optimize performance. It is particularly well-suited for AI training and inference workloads, as well as dense linear algebra in HPC applications. +SME accelerates matrix multiplication and is similar in intent to AMX. Unlike AMX, which often uses dot-product oriented operations, SME employs outer-product oriented operations. SME integrates with SVE, using scalable tiles and a streaming mode to optimize performance. It is well suited to AI training and inference, as well as dense linear algebra in HPC applications. ## x86 vector and matrix extensions +Here is a brief overview of the x86 families you’ll likely port from: SSE (128-bit), AVX/AVX-512 (256/512-bit with masking), and AMX (tile-based matrix compute). Use this to identify feature equivalents before mapping kernels to NEON, SVE/SVE2, or SME on Arm. + ### Streaming SIMD Extensions (SSE) The SSE instruction set provides 128-bit XMM registers and supports both integer and floating-point SIMD operations. Despite being an older technology, SSE remains a baseline for many libraries due to its widespread adoption. -However, its fixed-width design and limited throughput make it less competitive compared to more modern extensions like AVX. When migrating code from SSE to Arm, developers will find that SSE maps well to Arm NEON, enabling a relatively straightforward transition. +However, its fixed-width design can constrain throughput compared with newer extensions like AVX. When migrating code from SSE to Arm, developers will find that SSE maps well to Arm NEON, enabling a relatively straightforward transition. ### Advanced Vector Extensions (AVX) -The AVX extensions introduce 256-bit YMM registers with AVX and 512-bit ZMM registers with AVX-512, offering significant performance improvements over SSE. Key features include Fused Multiply-Add (FMA) operations, masked operations in AVX-512, and VEX/EVEX encodings that allow for more operands and flexibility. - -Migrating AVX code to Arm requires careful consideration, as AVX maps to NEON for up to 128-bit operations or to SVE for scalable-width operations. Since SVE is vector-length agnostic, porting AVX code often involves refactoring to accommodate this new paradigm. +AVX provides 256-bit YMM registers, and AVX-512 adds 512-bit ZMM registers. Features include FMA, per-lane masking in AVX-512, and VEX or EVEX encodings. When moving AVX workloads to Arm, 128-bit paths often translate to NEON, while algorithms that scale with vector width are good candidates for SVE. Because SVE is vector-length agnostic, refactor for predication and scalable loops to maintain portability and performance. ### Advanced Matrix Extensions (AMX) -AMX is a specialized instruction set designed for accelerating matrix operations using dedicated matrix-tile registers, effectively treating 2D arrays as first-class citizens. It is particularly well-suited for AI workloads, such as convolutions and General Matrix Multiplications (GEMMs). 
- -When migrating AMX workloads to Arm, you can leverage Arm SME, which conceptually aligns with AMX but employs a different programming model based on outer products rather than dot products. This difference requires you to adapt their code to fully exploit SME's capabilities. +AMX accelerates matrix operations with tile registers configured using a tile palette. It suits AI workloads such as GEMM and convolutions. When migrating AMX kernels to Arm, target SME. While both target matrix compute, AMX commonly expresses dot products, while SME focuses on outer products, so porting often entails algorithmic adjustments. ## Comparison tables -## SSE vs. NEON - -| Feature | SSE | NEON | -|-----------------------|---------------------------------------------------------------|----------------------------------------------------------------| -| **Register width** | 128-bit (XMM registers) | 128-bit (Q registers) | -| **Vector length model**| Fixed 128 bits | Fixed 128 bits | -| **Predication / masking**| Minimal predication; SSE lacks full mask registers | Conditional select instructions; no hardware mask registers | -| **Gather / Scatter** | No native gather/scatter (introduced in AVX2 and later) | No native gather/scatter; requires software emulation | -| **Instruction set scope**| Arithmetic, logical, shuffle, blend, conversion, basic SIMD | Arithmetic, logical, shuffle, saturating ops, multimedia, crypto extensions (AES, SHA)| -| **Floating-point support**| Single and double precision floating-point SIMD operations | Single and double precision floating-point SIMD operations | -| **Typical applications**| Legacy SIMD workloads; general-purpose vector arithmetic | Multimedia processing, DSP, cryptography, embedded compute | -| **Extensibility** | Extended by AVX/AVX2/AVX-512 for wider vectors and advanced features| NEON fixed at 128-bit vectors; ARM SVE offers scalable vectors but is separate | -| **Programming model** | Intrinsics supported in C/C++; assembly used for optimization | Intrinsics widely used; inline assembly less common | - - -## AVX vs. SVE (SVE2) - -| Feature | x86: AVX / AVX-512 | ARM: SVE / SVE2 | -|-----------------------|---------------------------------------------------------|---------------------------------------------------------------| -| **Register width** | Fixed: 256-bit (YMM), 512-bit (ZMM) | Scalable: 128 to 2048 bits (in multiples of 128 bits) | -| **Vector length model**| Fixed vector length; requires multiple code paths or compiler dispatch for different widths | Vector-length agnostic; same binary runs on any hardware vector width | -| **Predication / masking**| Mask registers for per-element operations (AVX-512) | Rich predication with per-element predicate registers | -| **Gather/Scatter** | Native gather/scatter support (AVX2 and AVX-512) | Native gather/scatter with efficient implementation across vector widths | -| **Key operations** | Wide SIMD, fused multiply-add (FMA), conflict detection, advanced masking | Wide SIMD, fused multiply-add (FMA), predicated operations, gather/scatter, reduction operations, bit manipulation | -| **Best suited for** | HPC, AI workloads, scientific computing, data analytics | HPC, AI, scientific compute, cloud and scalable workloads | -| **Limitations** | Power and thermal throttling on heavy 512-bit usage; complex software ecosystem | Requires vector-length agnostic programming style; ecosystem and hardware adoption still maturing | +Use these side-by-side tables to pick the right Arm target and plan refactors. 
They compare register width, predication/masking, gather/scatter, key operations, typical workloads, and limitations for SSE ↔ NEON, AVX/AVX-512 ↔ SVE/SVE2, and AMX ↔ SME. -## AMX vs. SME +### A comparison of SSE and NEON -| Feature | x86: AMX | ARM: SME | -|-----------------------|---------------------------------------------------------|------------------------------------------------------------| -| **Register width** | Tile registers with fixed dimensions: 16×16 for BF16, 64×16 for INT8 (about 1 KB total) | Scalable matrix tiles integrated with SVE, implementation-dependent tile dimensions | -| **Vector length model**| Fixed tile dimensions based on data type | Implementation-dependent tile dimensions, scales with SVE vector length | -| **Predication / masking**| No dedicated predication or masking in AMX tiles | Predication integrated through SVE predicate registers | -| **Gather/Scatter** | Not supported within AMX; handled by other instructions | Supported via integration with SVE’s gather/scatter features | -| **Key operations** | Focused on dot-product based matrix multiplication, optimized for GEMM and convolutions | Focus on outer-product matrix multiplication with streaming mode for dense linear algebra | -| **Best suited for** | AI/ML workloads such as training and inference, specifically GEMM and convolution kernels | AI/ML training and inference, scientific computing, dense linear algebra workloads | -| **Limitations** | Hardware and software ecosystem currently limited (primarily Intel Xeon platforms) | Emerging hardware support; compiler and library ecosystem in development | +| Feature | SSE | NEON | +|---|---|---| +| **Register width** | 128-bit (XMM) | 128-bit (Q) | +| **Vector length model** | Fixed 128 bits | Fixed 128 bits | +| **Predication or masking** | Minimal, no dedicated mask registers | No dedicated mask registers; use bitwise selects and conditionals | +| **Gather/scatter** | No native gather/scatter; gather in AVX2 and scatter in AVX-512 | No native gather/scatter; emulate in software | +| **Instruction set scope** | Arithmetic, logical, shuffle, convert, basic SIMD | Arithmetic, logical, shuffle, saturating ops; cryptography via Armv8 Cryptography Extensions (AES and SHA) | +| **Floating-point support** | Single and double precision | Single and double precision | +| **Typical applications** | Legacy SIMD, general vector arithmetic | Multimedia, DSP, cryptography, embedded compute | +| **Extensibility** | Extended by AVX, AVX2, and AVX-512 | Fixed at 128-bit; scalable vectors provided by SVE as a separate extension | +| **Programming model** | Intrinsics in C/C++; assembly for hotspots | Intrinsics widely used; inline assembly less common | +### A comparison of AVX and SVE (SVE2) -## Key Differences for Developers +| Feature | x86: AVX or AVX-512 | Arm: SVE or SVE2 | +|---|---|---| +| **Register width** | Fixed: 256-bit YMM, 512-bit ZMM | Scalable: 128 to 2048 bits in 128-bit steps | +| **Vector length model** | Fixed; often multiple code paths for different widths | Vector-length agnostic; same binary adapts to hardware width | +| **Predication or masking** | Mask registers in AVX-512 | Rich predication via predicate registers | +| **Gather or scatter** | Gather in AVX2 and scatter in AVX-512 | Native gather and scatter across widths | +| **Key operations** | Wide SIMD, FMA, conflict detection, advanced masking | Wide SIMD, FMA, predication, gather or scatter, reductions, bit manipulation | +| **Best suited for** | HPC, AI and ML, scientific computing, 
analytics | HPC, AI and ML, scientific computing, cloud and scalable workloads | +| **Limitations** | Power and thermal headroom under heavy 512-bit use; ecosystem complexity | Requires VLA programming style; SVE or SVE2 hardware availability varies by platform | -When migrating from x86 SIMD extensions to Arm SIMD, there are several important architectural and programming differences for you to consider. +{{% notice Note %}} +SVE2 extends SVE with richer integer and DSP capabilities for general-purpose and media workloads. +{{% /notice %}} -### Vector Length Model +### A comparison of AMX and SME -x86 SIMD extensions such as SSE, AVX, and AVX-512 operate on fixed vector widths, 128, 256, or 512 bits. This often necessitates multiple code paths or compiler dispatch techniques to efficiently exploit available hardware SIMD capabilities. Arm NEON, similar to SSE, uses a fixed 128-bit vector width, making it a familiar, fixed-size SIMD baseline. +| Feature | x86: AMX | Arm: SME | +|---|---|---| +| **Register model** | Tile registers configured via a palette; fixed per type limits | Scalable matrix tiles integrated with SVE; implementation-dependent dimensions | +| **Vector length model** | Fixed tile geometry per configuration | Scales with SVE vector length and streaming mode | +| **Predication or masking** | Predication not inherent to tiles | Predication via SVE predicate registers | +| **Gather or scatter** | Not provided in AMX tiles; handled elsewhere | Via SVE integration with gather or scatter | +| **Key operations** | Dot-product oriented GEMM and convolution | Outer-product matrix multiply; streaming mode for dense linear algebra | +| **Best suited for** | AI and ML training and inference, GEMM and convolution kernels | AI and ML training and inference, scientific and HPC dense linear algebra | +| **Limitations** | Hardware and software availability limited to specific CPUs | Emerging hardware support; compiler and library support evolving | -In contrast, Arm’s Scalable Vector Extension (SVE) and Scalable Matrix Extension (SME) introduce a vector-length agnostic model. This allows vectors to scale from 128 bits up to 2048 bits depending on the hardware, enabling the same binary to run efficiently across different implementations without modification. +## The key differences for developers -### Programming and Intrinsics +The most significant changes when porting include moving from fixed-width SIMD to vector-length-agnostic loop structures, replacing mask-register control with predicate-driven control, and adjusting memory access patterns and compiler flags. Review this section first to minimize rework and preserve portable performance. -x86 offers a comprehensive and mature set of SIMD intrinsics that increase in complexity especially with AVX-512 due to advanced masking and lane-crossing operations. Arm NEON intrinsics resemble SSE intrinsics and are relatively straightforward for porting existing SIMD code. However, Arm SVE and SME intrinsics are designed for a more predicated and vector-length agnostic style of programming. +### Vector length model -When migrating to SVE/SME you are encouraged to leverage compiler auto-vectorization with predication support, moving away from heavy reliance on low-level intrinsics to achieve scalable, portable performance. +x86 SIMD (SSE, AVX, and AVX-512) uses fixed widths of 128, 256, or 512 bits. This often requires multiple code paths or dispatch strategies. Arm NEON is also fixed at 128-bit and is a familiar baseline. 
SVE and SME introduce vector-length agnostic execution from 128 to 2048 bits so the same binary scales across implementations. -### Matrix Acceleration +### Programming and intrinsics -For matrix computation, AMX provides fixed-size tile registers optimized for dot-product operations such as GEMM and convolutions. In comparison, Arm SME extends the scalable vector compute model with scalable matrix tiles designed around outer-product matrix multiplication and novel streaming modes. +x86 intrinsics are extensive, and AVX-512 adds masks and lane controls that increase complexity. NEON intrinsics look familiar to SSE developers. SVE and SME use predication and scalable loops. Prefer auto-vectorization and VLA-friendly patterns over heavy hand-written intrinsics when portability matters. -SME’s flexible, hardware-adaptable tile sizes and tight integration with SVE’s predication model provide a highly adaptable platform for AI training, inference, and scientific computing. +### Matrix acceleration -Both AMX and SME are currently available on limited set of platforms. +AMX provides fixed-geometry tile compute optimized for dot products. SME extends Arm’s scalable model with outer-product math, scalable tiles, and streaming mode. Both AMX and SME are currently available on a limited set of platforms. -### Overall Summary +## Summary -Migrating from x86 SIMD to Arm SIMD entails embracing Arm’s scalable and predicated SIMD programming model embodied by SVE and SME, which supports future-proof, portable code across a wide range of hardware. - -NEON remains important for fixed-width SIMD similar to SSE but may be less suited for emerging HPC and AI workloads that demand scale and flexibility. - -You need to adapt to Arm’s newer vector-length agnostic programming and tooling to fully leverage scalable SIMD and matrix architectures. - -Understanding these key differences in vector models, programming paradigms, and matrix acceleration capabilities helps you migrate and achieve good performance on Arm. +Migrating from x86 SIMD to Arm entails adopting Arm’s scalable and predicated programming model with SVE and SME for forward-portable performance, while continuing to use NEON for fixed-width SIMD similar to SSE. ## Migration tools -There are tools and libraries that help translate SSE intrinsics to NEON intrinsics, which can shorten the migration effort and produce efficient Arm code. These libraries enable many SSE operations to be mapped to NEON equivalents, but some SSE features have no direct NEON counterparts and require workarounds or redesign. - -Overall, NEON is the standard for SIMD on Arm much like SSE for x86, making it the closest analogue for porting SIMD-optimized software from x86 to ARM. - -[sse2neon](https://github.com/DLTcollab/sse2neon) is an open-source header library that provides a translation layer from Intel SSE2 intrinsics to Arm NEON intrinsics. It enables many SSE2-optimized codebases to be ported to Arm platforms with minimal code modification by mapping familiar SSE2 instructions to their NEON equivalents. - - -[SIMD Everywhere (SIMDe)](https://github.com/simd-everywhere/simde) is a comprehensive, header-only library designed to ease the transition of SIMD code between different architectures. It provides unified implementations of SIMD intrinsics across x86 SSE/AVX, Arm NEON, and other SIMD instruction sets, facilitating portable and maintainable SIMD code. 
SIMDe supports a wide range of SIMD extensions and includes implementations that fall back to scalar code when SIMD is unavailable, maximizing compatibility. +Several libraries help translate or abstract SIMD intrinsics to speed up migration. Coverage varies, and some features have no direct analogue. +Here are some of the tools available and their key features: -[Google Highway](https://github.com/google/highway) is a high-performance SIMD optimized vector hashing and data processing library designed by Google. It leverages platform-specific SIMD instructions, including Arm NEON and x86 AVX, to deliver fast, portable, and scalable hashing functions and vector operations. Highway is particularly well-suited for large-scale data processing, machine learning, and performance-critical applications requiring efficient SIMD usage across architectures. +- Sse2neon: an open-source header that maps many SSE2 intrinsics to NEON equivalents. Good for getting code building quickly. Review generated code for performance. See the [sse2neon GitHub repository](https://github.com/DLTcollab/sse2neon). +- SIMD Everywhere (SIMDe): a header-only portability layer that implements many x86 and Arm intrinsics across ISAs, with scalar fallbacks when SIMD is unavailable. See the [simde-everywhere GitHub repository](https://github.com/simd-everywhere/simde). +- Google Highway (hwy): a portable SIMD library and APIs that target multiple ISAs, including NEON, SVE where supported, and AVX, without per-ISA code paths. See the [Google highway GitHub repository](https://github.com/google/highway). -You can also review [Porting architecture specific intrinsics](/learning-paths/cross-platform/intrinsics/) for more information. \ No newline at end of file +For more on cross-platform intrinsics, see the Learning Path [Porting architecture-specific intrinsics](/learning-paths/cross-platform/intrinsics/). diff --git a/content/learning-paths/cross-platform/vectorization-comparison/2-code-examples.md b/content/learning-paths/cross-platform/vectorization-comparison/2-code-examples.md index 015060804c..5d4881fa2d 100644 --- a/content/learning-paths/cross-platform/vectorization-comparison/2-code-examples.md +++ b/content/learning-paths/cross-platform/vectorization-comparison/2-code-examples.md @@ -1,24 +1,25 @@ --- -title: Vector extension code examples +title: "Explore vector extension code examples" weight: 4 -### FIXED, DO NOT MODIFY -layout: learningpathall +# FIXED, DO NOT MODIFY +layout: "learningpathall" --- -## SAXPY Example code +## SAXPY example code -As a way to provide some hands-on experience, you can study and run example code to better understand the vector extensions. The example used here is SAXPY. +This page walks you through a SAXPY (Single-Precision A·X Plus Y) kernel implemented in plain C and with vector extensions on both Arm (NEON, SVE) and x86 (AVX2, AVX-512). You will see how to build and run each version and how the vector width affects throughput. -SAXPY stands for "Single-Precision A·X Plus Y" and is a fundamental operation in linear algebra. It computes the result of the equation `y[i] = a * x[i] + y[i]` for all elements in the arrays `x` and `y`. +SAXPY computes `y[i] = a * x[i] + y[i]` across arrays `x` and `y`. It is widely used in numerical computing and is an accessible way to compare SIMD behavior across ISAs. -SAXPY is widely used in numerical computing, particularly in vectorized and parallelized environments, due to its simplicity and efficiency. 
+{{% notice Tip %}} +If a library already provides a tuned SAXPY (for example, BLAS), use that over hand-written kernels. These examples are for learning and porting. +{{% /notice %}} -### Reference version -Below is a plain C implementation of SAXPY without any vector extensions. +## Reference C version (no SIMD intrinsics) -This serves as a reference for the optimized examples provided later. +Below is a plain C implementation of SAXPY without any vector extensions which serves as a reference baseline for the optimized examples provided later: ```c #include @@ -56,7 +57,7 @@ int main() { } ``` -Use a text editor to copy the code to a file `saxpy_plain.c` and build and run the code using: +Use a text editor to copy the code to a file called `saxpy_plain.c` and build and run the code using: ```bash gcc -O3 -o saxpy_plain saxpy_plain.c @@ -65,13 +66,11 @@ gcc -O3 -o saxpy_plain saxpy_plain.c You can use Clang for any of the examples by replacing `gcc` with `clang` on the command line. -### Arm NEON version (128-bit SIMD, 4 floats per operation) +## Arm NEON version (128-bit SIMD, 4 floats per operation) -NEON operates on fixed 128-bit registers, able to process 4 single-precision float values simultaneously in every vector instruction. +NEON uses fixed 128-bit registers, processing four `float` values per instruction. It is available on most Armv8-A devices and is excellent for accelerating loops and signal processing tasks in mobile and embedded workloads. -This extension is available on most Arm-based devices and is excellent for accelerating loops and signal processing tasks in mobile and embedded workloads. - -The example below processes 16 floats per iteration using four separate NEON operations to improve instruction-level parallelism and reduce loop overhead. +The example below processes 16 floats per iteration using four separate NEON operations to improve instruction-level parallelism and reduce loop overhead: ```c #include @@ -139,7 +138,13 @@ gcc -O3 -march=armv8-a+simd -o saxpy_neon saxpy_neon.c ./saxpy_neon ``` -### AVX2 (256-bit SIMD, 8 floats per operation) +{{% notice Note %}} +On AArch64, NEON is mandatory; the flag is shown for clarity. +{{% /notice %}} + + + +## x86 AVX2 version (256-bit SIMD, 8 floats per operation) AVX2 doubles the SIMD width compared to NEON, processing 8 single-precision floats at a time in 256-bit registers. @@ -203,7 +208,7 @@ gcc -O3 -mavx2 -mfma -o saxpy_avx2 saxpy_avx2.c ./saxpy_avx2 ``` -### Arm SVE (hardware dependent: 4 to 16+ floats per operation) +## Arm SVE (hardware dependent: 4 to 16+ floats per operation) Arm SVE lets the hardware determine the register width, which can range from 128 up to 2048 bits. This means each operation can process from 4 to 64 single-precision floats at a time, depending on the implementation. @@ -214,6 +219,7 @@ SVE encourages writing vector-length agnostic code: the compiler automatically h ```c #include #include +#include #include #include @@ -270,13 +276,13 @@ gcc -O3 -march=armv8-a+sve -o saxpy_sve saxpy_sve.c ./saxpy_sve ``` -### AVX-512 (512-bit SIMD, 16 floats per operation) +## x86 AVX-512 version (512-bit SIMD, 16 floats per operation) AVX-512 provides the widest SIMD registers of mainstream x86 architectures, processing 16 single-precision floats per 512-bit operation. AVX-512 availability varies across x86 processors. It's found on Intel Xeon server processors and some high-end desktop processors, as well as select AMD EPYC models. 
-For very large arrays and high-performance workloads, AVX-512 delivers extremely high throughput, with additional masking features for efficient tail processing. +For large arrays and high-performance workloads, AVX-512 delivers extremely high throughput, with additional masking features for efficient tail processing. ```c #include @@ -341,7 +347,7 @@ gcc -O3 -mavx512f -o saxpy_avx512 saxpy_avx512.c ./saxpy_avx512 ``` -### Summary +## Summary Wider data lanes mean each operation processes more elements, offering higher throughput on supported hardware. However, actual performance depends on factors like memory bandwidth, the number of execution units, and workload characteristics. @@ -349,4 +355,4 @@ Processors also improve performance by implementing multiple SIMD execution unit Each vector extension requires different intrinsics, compilation flags, and programming approaches. While x86 and Arm vector extensions serve similar purposes and achieve comparable performance gains, you will need to understand the options and details to create portable code. -You should also look for existing libraries that already work across vector extensions before you get too deep into code porting. This is often a good way to leverage the available SIMD capabilities on your target hardware. +You can also look for existing libraries that already work across vector extensions before you get too deep into code porting. This is often a good way to leverage the available SIMD capabilities on your target hardware. diff --git a/content/learning-paths/cross-platform/vectorization-comparison/_index.md b/content/learning-paths/cross-platform/vectorization-comparison/_index.md index d2a54fe293..a925bb0166 100644 --- a/content/learning-paths/cross-platform/vectorization-comparison/_index.md +++ b/content/learning-paths/cross-platform/vectorization-comparison/_index.md @@ -1,21 +1,22 @@ --- -title: "Mapping x86 vector extensions to Arm: a migration overview" - -minutes_to_complete: 30 +title: "Migrate x86-64 SIMD to Arm64" draft: true cascade: draft: true -who_is_this_for: This is an advanced topic for software developers who want to learn how to migrate vectorized code to Arm. +minutes_to_complete: 30 + +who_is_this_for: This is an advanced topic for developers migrating vectorized (SIMD) code from x86-64 to Arm64. learning_objectives: - - Understand how Arm vector extensions, including NEON, Scalable Vector Extension (SVE), and Scalable Matrix Extension (SME) map to vector extensions from other architectures. - - Start planning how to migrate your SIMD code to the Arm architecture. + - Identify how Arm vector extensions including NEON, Scalable Vector Extension (SVE), and Scalable Matrix Extension (SME) map to vector extensions from other architectures + - Plan a migration strategy using autovectorization, intrinsics, or library substitution + prerequisites: - - Familiarity with vector extensions, SIMD programming, and compiler intrinsics. - - Access to Linux systems with NEON and SVE support. 
+ - Familiarity with vector extensions, SIMD programming, and compiler intrinsics + - Access to Linux systems with NEON and SVE support author: - Jason Andrews @@ -40,11 +41,11 @@ shared_between: further_reading: - resource: - title: SVE Programming Examples + title: SVE programming examples link: https://developer.arm.com/documentation/dai0548/latest type: documentation - resource: - title: Port Code to Arm Scalable Vector Extension (SVE) + title: Port code to Arm Scalable Vector Extension (SVE) link: https://learn.arm.com/learning-paths/servers-and-cloud-computing/sve type: website - resource: @@ -64,22 +65,24 @@ further_reading: link: https://developer.arm.com/documentation/109246/latest type: documentation - resource: - title: Compiler Intrinsics + title: Compiler intrinsics (overview) link: https://en.wikipedia.org/wiki/Intrinsic_function type: website - resource: - title: ACLE - Arm C Language Extension + title: ACLE - Arm C Language Extensions link: https://github.com/ARM-software/acle type: website - resource: - title: Application Binary Interface for the Arm Architecture + title: Application Binary Interface for the Arm Architecture (AAPCS64) link: https://github.com/ARM-software/abi-aa type: website - ### FIXED, DO NOT MODIFY # ================================================================================ weight: 1 # _index.md always has weight of 1 to order correctly layout: "learningpathall" # All files under learning paths have this same wrapper learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. --- + + + diff --git a/content/learning-paths/embedded-and-microcontrollers/_index.md b/content/learning-paths/embedded-and-microcontrollers/_index.md index 945b031b43..e5d9c4f74d 100644 --- a/content/learning-paths/embedded-and-microcontrollers/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/_index.md @@ -45,11 +45,12 @@ tools_software_languages_filter: - CMSIS-DSP: 1 - CMSIS-Toolbox: 3 - CNN: 1 +- Computer Vision: 1 - Containerd: 1 - DetectNet: 1 - Docker: 10 - DSTREAM: 2 -- Edge AI: 1 +- Edge AI: 2 - Edge Impulse: 1 - ExecuTorch: 3 - FastAPI: 1 diff --git a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/2-env-setup.md b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/2-env-setup.md index 85b368fbed..6c1ff55547 100644 --- a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/2-env-setup.md +++ b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/2-env-setup.md @@ -44,6 +44,7 @@ From within the Python virtual environment, run the commands below to download t cd $HOME git clone https://github.com/pytorch/executorch.git cd executorch +git checkout 188312844ebfb499f92ab5a02137ed1a4abca782 ``` Run the commands below to set up the ExecuTorch internal dependencies: @@ -70,7 +71,7 @@ pip list | grep executorch ``` ```output -executorch 0.6.0a0+3eea1f1 +executorch 1.1.0a0+1883128 ``` ## Next Steps diff --git a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/3-env-setup-fvp.md b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/3-env-setup-fvp.md index c717770259..c554c0a575 100644 --- a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/3-env-setup-fvp.md +++ 
b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/3-env-setup-fvp.md @@ -16,11 +16,11 @@ The Corstone reference system is provided free of charge, although you will have ## Corstone-320 FVP Setup for ExecuTorch -Navigate to the Arm examples directory in the ExecuTorch repository. Run the following command. +Run the FVP setup script in the ExecuTorch repository. ```bash -cd $HOME/executorch/examples/arm -./setup.sh --i-agree-to-the-contained-eula +cd $HOME/executorch +./examples/arm/setup.sh --i-agree-to-the-contained-eula ``` After the script has finished running, it prints a command to run to finalize the installation. This step adds the FVP executables to your system path. diff --git a/content/learning-paths/embedded-and-microcontrollers/tfm/_index.md b/content/learning-paths/embedded-and-microcontrollers/tfm/_index.md index e23ea09452..b3fb6106d3 100644 --- a/content/learning-paths/embedded-and-microcontrollers/tfm/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/tfm/_index.md @@ -18,8 +18,7 @@ author: Pareena Verma test_images: - armswdev/arm-tools:bare-metal-compilers -test_link: null -test_maintenance: true +test_maintenance: false ### Tags skilllevels: Introductory diff --git a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/_index.md b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/_index.md index 967482a761..ad8a8ade3f 100644 --- a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/_index.md @@ -1,26 +1,23 @@ --- -title: Edge AI with PyTorch & ExecuTorch - Tiny Sentiment Analysis on Arm +title: Edge AI with PyTorch & ExecuTorch - Tiny Rock-Paper-Scissors on Arm -draft: true -cascade: - draft: true +minutes_to_complete: 60 -minutes_to_complete: 90 +who_is_this_for: This learning path is for machine learning developers interested in deploying TinyML models on Arm-based edge devices. You will learn how to train and deploy a machine learning model for the classic game "Rock-Paper-Scissors" on edge devices. You'll use PyTorch and ExecuTorch, frameworks designed for efficient on-device inference, to build and run a small-scale computer vision model. -who_is_this_for: This topic is for machine learning engineers, embedded AI developers, and researchers interested in deploying TinyML models for NLP on Arm-based edge devices using PyTorch and ExecuTorch. learning_objectives: - - Train a custom CNN-based sentiment classification model implemented in PyTorch. - - Optimize and convert the model using ExecuTorch for Arm-based edge devices. - - Deploy and run inference on the Corstone-320 FVP. + - Train a small Convolutional Neural Network (CNN) for image classification using PyTorch. + - Understand how to use synthetic data generation for training a model when real-world data is limited. + - Optimize and convert a PyTorch model into an ExecuTorch program (.pte) for Arm-based devices. + - Run the trained model on a local machine to play an interactive mini-game, demonstrating model inference. -prerequisites: - - Basic knowledge of machine learning concepts. - - It is advised to complete The Learning Path, [Introduction to TinyML on Arm using PyTorch and ExecuTorch](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm) before starting this learning path. - - Familiarity with Python and PyTorch. 
- - A Linux host machine or VM running Ubuntu 22.04 or higher. - - An Arm license to run the examples on the Corstone-320 Fixed Virtual Platform (FVP), for hands-on deployment. +prerequisites: + - A basic understanding of machine learning concepts. + - Familiarity with Python and the PyTorch library. + - Having completed [Introduction to TinyML on Arm using PyTorch and ExecuTorch](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm). + - An x86 Linux host machine or VM running Ubuntu 22.04 or higher. author: Dominica Abena O. Amanfo @@ -28,9 +25,12 @@ author: Dominica Abena O. Amanfo skilllevels: Introductory subjects: ML armips: - - Cortex-A + - Cortex-M + - Ethos-U tools_software_languages: - tinyML + - Computer Vision + - Edge AI - CNN - PyTorch - ExecuTorch @@ -38,7 +38,6 @@ tools_software_languages: operatingsystems: - Linux - further_reading: - resource: title: Run Llama 3 on a Raspberry Pi 5 using ExecuTorch @@ -56,4 +55,4 @@ further_reading: weight: 1 # _index.md always has weight of 1 to order correctly layout: "learningpathall" # All files under learning paths have this same wrapper learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. ---- +--- \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/env-setup-1.md b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/env-setup-1.md index 6f62990675..ac6b5e10a2 100644 --- a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/env-setup-1.md +++ b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/env-setup-1.md @@ -6,17 +6,31 @@ weight: 2 layout: learningpathall --- -## Overview -In this course, you will learn how to train and run inference using a Tiny Sentiment Classifier. You'll deploy the model on the Arm Corstone-320 FVP for sentiment analysis. +## Overview +This learning path (LP) is a direct follow-up to the [Introduction to TinyML on Arm using PyTorch and ExecuTorch](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm) learning path. While the previous one introduced you to the core concepts and the toolchain, this one puts that knowledge into practice with a fun, real-world example. You will move from the simple [Feedforward Neural Network](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/4-build-model) in the previous LP, to a more practical computer vision task: A tiny Rock-Paper-Scissors game, to demonstrate how these tools can be used to solve a tangible problem and run efficiently on Arm-based edge devices. -We will train a lightweight convolutional neural network (CNN)-based sentiment classifier using synthetic text data. This model is optimized for small devices, using embedding layers and 1D convolutions for efficient text classification. +You will train a lightweight CNN to classify images of the letters R, P, and S as "rock," "paper," or "scissors." The script uses a synthetic data renderer to create a large dataset of these images with various transformations and noise, eliminating the need for a massive real-world dataset. +### What is a Convolutional Neural Network (CNN)? +A Convolutional Neural Network (CNN) is a type of deep neural network primarily used for analyzing visual imagery. 
Unlike traditional neural networks, CNNs are designed to process pixel data by using a mathematical operation called **convolution**. This allows them to automatically and adaptively learn spatial hierarchies of features from input images, from low-level features like edges and textures to high-level features like shapes and objects. -## Environment Setup -Setup your development environment for TinyML by following the first 3 chapters of the [Introduction to TinyML on Arm using PyTorch and ExecuTorch](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm) Learning Path (LP). +![Image of a convolutional neural network architecture](image.png) +[Image credits](https://medium.com/@atul_86537/learning-ml-from-first-principles-c-linux-the-rick-and-morty-way-convolutional-neural-c76c3df511f4). + +CNNs are the backbone of many modern computer vision applications, including: + +- **Image Classification:** Identifying the main object in an image, like classifying a photo as a "cat" or "dog". +- **Object Detection:** Locating specific objects within an image and drawing a box around them. +- **Facial Recognition:** Identifying and verifying individuals based on their faces. + +For the Rock-Paper-Scissors game, you'll use a tiny CNN to classify images of the letters R, P, and S as the corresponding hand gestures. -If you just followed the LP above, you should already have your virtual environment activated. If not, activate it using: + +## Environment Setup +To get started, follow the first three chapters of the [Introduction to TinyML on Arm using PyTorch and ExecuTorch](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm) Learning Path. This will set up your development environment and install the necessary tools. Return to this LP once you've run the `./examples/arm/run.sh` script in the ExecuTorch repository. + +If you just followed the LP above, you should already have your virtual environment activated. If not, activate it using: ```console source $HOME/executorch-venv/bin/activate @@ -26,8 +40,7 @@ The prompt of your terminal now has `(executorch-venv)` as a prefix to indicate Run the commands below to install the dependencies. ```bash -pip install argparse json +pip install argparse numpy pillow torch ``` -You are now ready to build the model - +You are now ready to create the model. diff --git a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/fine-tune-2.md b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/fine-tune-2.md index 77c0bd59c2..e9ffd439ec 100644 --- a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/fine-tune-2.md +++ b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/fine-tune-2.md @@ -1,5 +1,5 @@ --- -title: Train and Test the Sentiment Classifier +title: Train and Test the Rock-Paper-Scissors Model weight: 3 ### FIXED, DO NOT MODIFY @@ -14,288 +14,477 @@ Navigate to the Arm examples directory in the ExecuTorch repository. cd $HOME/executorch/examples/arm ``` -Using a file editor of your choice, create a file named tiny_sentiment.py with the code shown below: +Using a file editor of your choice, create a file named `rps_tiny.py`, copy and paste the code shown below: ```python +#!/usr/bin/env python3 +""" +Tiny Rock–Paper–Scissors CNN (PyTorch) + ExecuTorch export + CLI mini-game. 
+ +Usage: + # Train (fast) + export .pte + play + python rps_tiny.py --epochs 8 --export --play + + # Just train (no export) + python rps_tiny.py --epochs 8 + + # Export previously trained weights to .pte + python rps_tiny.py --export + + # Play the mini-game (uses the best weights on disk) + python rps_tiny.py --play + +Outputs: + - rps_best.pt (best PyTorch weights) + - rps_labels.json (label map) + - rps_tiny.pte (ExecuTorch program, if --export) +""" + +import argparse, json, math, os, random, sys +from dataclasses import dataclass +from typing import Tuple, List + +import numpy as np +from PIL import Image,ImageOps,ImageDraw, ImageFont, ImageFilter + import torch import torch.nn as nn import torch.optim as optim -import json -import numpy as np from torch.utils.data import Dataset, DataLoader -from sklearn.model_selection import train_test_split - - -class SentimentDataset(Dataset): - def __init__(self, texts, labels, vocab=None, max_length=50): - self.texts = texts - self.labels = labels - self.max_length = max_length - - if vocab is None: - # Build vocabulary from training data - self.vocab = {'': 0, '': 1} - for text in texts: - for word in text.lower().split(): - if word not in self.vocab: - self.vocab[word] = len(self.vocab) - else: - self.vocab = vocab - - def __len__(self): - return len(self.texts) - - def __getitem__(self, idx): - text = self.texts[idx].lower().split() - # Convert words to indices and pad/truncate to max_length - indices = [self.vocab.get(word, self.vocab['']) for word in text] - if len(indices) < self.max_length: - indices += [self.vocab['']] * (self.max_length - len(indices)) - else: - indices = indices[:self.max_length] - - return torch.tensor(indices), torch.tensor(self.labels[idx]) - - -class SentimentClassifier(nn.Module): - def __init__(self, vocab_size, embed_dim=100, hidden_dim=128, num_classes=2): - super().__init__() - self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0) - self.conv1 = nn.Conv1d(embed_dim, hidden_dim, kernel_size=3, padding=1) - self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1) - self.pool = nn.AdaptiveMaxPool1d(1) - self.fc1 = nn.Linear(hidden_dim, hidden_dim) - self.fc2 = nn.Linear(hidden_dim, num_classes) - self.dropout = nn.Dropout(0.5) - - def forward(self, x): - # x shape: (batch_size, seq_len) - x = self.embedding(x) # (batch_size, seq_len, embed_dim) - x = x.transpose(1, 2) # (batch_size, embed_dim, seq_len) - x = torch.relu(self.conv1(x)) - x = self.dropout(x) - x = torch.relu(self.conv2(x)) - x = self.pool(x).squeeze(-1) - x = torch.relu(self.fc1(x)) - x = self.dropout(x) - x = self.fc2(x) - return x - - -def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=20): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - model = model.to(device) - best_val_acc = 0 - - for epoch in range(num_epochs): - # Training phase - model.train() - train_loss = 0 - train_correct = 0 - train_total = 0 - - for inputs, labels in train_loader: - inputs, labels = inputs.to(device), labels.to(device) - optimizer.zero_grad() - outputs = model(inputs) - loss = criterion(outputs, labels) - loss.backward() - optimizer.step() - - train_loss += loss.item() - _, predicted = outputs.max(1) - train_total += labels.size(0) - train_correct += predicted.eq(labels).sum().item() - - # Validation phase - model.eval() - val_loss = 0 - val_correct = 0 - val_total = 0 - - with torch.no_grad(): - for inputs, labels in val_loader: - inputs, labels = inputs.to(device), 
labels.to(device) - outputs = model(inputs) - loss = criterion(outputs, labels) - - val_loss += loss.item() - _, predicted = outputs.max(1) - val_total += labels.size(0) - val_correct += predicted.eq(labels).sum().item() - - train_acc = 100. * train_correct / train_total - val_acc = 100. * val_correct / val_total - - print(f'Epoch {epoch+1}/{num_epochs}:') - print(f'Train Loss: {train_loss/len(train_loader):.4f}, Train Acc: {train_acc:.2f}%') - print(f'Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {val_acc:.2f}%') - print('-' * 60) - - # Save best model - if val_acc > best_val_acc: - best_val_acc = val_acc - torch.save(model.state_dict(), 'best_sentiment_model.pt') +# --------------------------- +# Config +# --------------------------- +SEED = 7 +random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED) + +LABELS = ["rock", "paper", "scissors"] # indexes: 0,1,2 +IMG_SIZE = 28 +TRAIN_SAMPLES_PER_CLASS = 1000 +VAL_SAMPLES_PER_CLASS = 200 +BATCH = 64 +LR = 2e-3 +EPOCHS_DEFAULT = 6 +WEIGHTS = "rps_best.pt" +LABELS_JSON = "rps_labels.json" +PTE_OUT = "rps_tiny.pte" + + +# --------------------------- +# Synthetic R/P/S renderer +# --------------------------- +def _rand(a, b): + return a + random.random()*(b-a) + +def render_rps(label: str) -> Image.Image: + """ + Render a 28x28 grayscale image for 'rock'/'paper'/'scissors' + using the letters R/P/S with random transforms + noise. + """ + ch = {"rock":"R","paper":"P","scissors":"S"}[label] + img = Image.new("L", (IMG_SIZE, IMG_SIZE), color=0) + d = ImageDraw.Draw(img) + + # Try to get a default truetype; fallback to PIL default bitmap font + font = None + try: + # Use a generic font size that fills the canvas + font = ImageFont.truetype(font="Arial.ttf", size=int(_rand(18,24))) + except Exception: + font = ImageFont.load_default() + + # Random text position + bbox = d.textbbox((0, 0), ch, font=font) # (left, top, right, bottom) + w = bbox[2] - bbox[0] + h = bbox[3] - bbox[1] + x = (IMG_SIZE - w)//2 + int(_rand(-2, 2)) + y = (IMG_SIZE - h)//2 + int(_rand(-2, 2)) + + # Random brightness for foreground + fg = int(_rand(180, 255)) + d.text((x,y), ch, fill=fg, font=font) + + # Slight blur/rotate/shear + if random.random()<0.6: + img = img.filter(ImageFilter.GaussianBlur(radius=_rand(0.0, 0.7))) + if random.random()<0.8: + angle = _rand(-18, 18) + img = img.rotate(angle, resample=Image.BILINEAR, expand=False, fillcolor=0) + + # Add mild elastic-ish jitter by affine + if random.random()<0.5: + dx, dy = _rand(-1.0, 1.0), _rand(-1.0, 1.0) + ax = 1 + _rand(-0.05, 0.05) + img = img.transform( + img.size, + Image.AFFINE, + (ax, _rand(-0.05,0.05), dx, _rand(-0.05,0.05), 1+_rand(-0.05,0.05), dy), + resample=Image.BILINEAR, + fillcolor=0 + ) + + # Salt & pepper noise + if random.random()<0.8: + arr = np.array(img, dtype=np.float32) + noise = np.random.randn(*arr.shape)*_rand(3, 12) + arr = np.clip(arr + noise, 0, 255).astype(np.uint8) + img = Image.fromarray(arr, mode="L") + + return img + + +# --------------------------- +# Dataset +# --------------------------- +@dataclass +class RPSItem: + image: torch.Tensor # [1,28,28] float32 0..1 + label: int + +class RPSDataset(Dataset): + def __init__(self, n_per_class: int, train: bool): + self.items: List[RPSItem] = [] + for idx, name in enumerate(LABELS): + for _ in range(n_per_class): + img = render_rps(name) + # Slightly different augments for train vs val + if train and random.random()<0.15: + img = ImageOps.invert(img) + t = torch.from_numpy(np.array(img, dtype=np.float32)/255.0)[None, ...] 
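+                # Pair the normalized [1, 28, 28] float tensor with its class index (0=rock, 1=paper, 2=scissors)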
+ self.items.append(RPSItem(t, idx)) + random.shuffle(self.items) + + def __len__(self): return len(self.items) + def __getitem__(self, i): + it = self.items[i] + return it.image, torch.tensor(it.label, dtype=torch.long) + + +# --------------------------- +# Model: Tiny CNN (Ethos-friendly) +# --------------------------- +class TinyRPS(nn.Module): + """ + Simple ConvNet: + [B,1,28,28] -> Conv3x3(16) -> ReLU -> Conv3x3(32) -> ReLU + -> MaxPool2d(2) -> Conv3x3(64) -> ReLU -> MaxPool2d(2) + -> flatten -> Linear(128) -> ReLU -> Linear(3) + """ + def __init__(self): + super().__init__() + self.body = nn.Sequential( + nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(inplace=True), + nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(inplace=True), + nn.MaxPool2d(2), + nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(inplace=True), + nn.MaxPool2d(2), + ) + self.head = nn.Sequential( + nn.Flatten(), + nn.Linear(64*7*7, 128), nn.ReLU(inplace=True), + nn.Linear(128, 3) + ) + def forward(self, x): # x: [B,1,28,28] + return self.head(self.body(x)) + +# AOT entry points expected by aot_arm_compiler +ModelUnderTest = TinyRPS() +ModelInputs = (torch.zeros(1, 1, IMG_SIZE, IMG_SIZE, dtype=torch.float32),) + +# --------------------------- +# Train / Eval +# --------------------------- +def run_epoch(dl, model, crit, opt=None): + train = opt is not None + model.train() if train else model.eval() + totl=totc=cnt=0 + with torch.set_grad_enabled(train): + for x,y in dl: + if train: opt.zero_grad(set_to_none=True) + out = model(x) + loss = crit(out, y) + if train: + loss.backward() + opt.step() + totl += float(loss)*x.size(0) + totc += (out.argmax(1)==y).sum().item() + cnt += x.size(0) + return totl/cnt, totc/cnt + + +# --------------------------- +# Export to ExecuTorch (.pte) +# --------------------------- +def export_to_pte(model: nn.Module, out_path=PTE_OUT): + model.eval() + example = torch.zeros(1,1,IMG_SIZE,IMG_SIZE, dtype=torch.float32) + exported = None + try: + try: + from torch.export import export + except Exception: + import torch._export as _export + export = _export.export + exported = export(model, (example,)) + except Exception: + # Fallback: some older builds expose exir.capture + from executorch.exir import capture + exported = capture(model, (example,)) + from executorch import exir + edge = exir.to_edge(exported) + prog = edge.to_executorch() + with open(out_path, "wb") as f: + f.write(prog.buffer) + print(f"[export] wrote {out_path}") + + +# --------------------------- +# CLI mini-game +# --------------------------- +def ascii_show(img: torch.Tensor) -> str: + """Convert [1,28,28] tensor into tiny ASCII block for fun.""" + chars = " .:-=+*#%@" + arr = (img.squeeze(0).numpy()*255).astype(np.uint8) + h, w = arr.shape + lines=[] + for y in range(0,h,2): + row=[] + for x in range(0,w,1): + v = arr[y, x] + row.append(chars[min(len(chars)-1, int(v)*len(chars)//256)]) + lines.append("".join(row)) + return "\n".join(lines) + +def beats(a: int, b: int) -> int: + """Return +1 if a beats b, 0 if tie, -1 if loses.""" + # 0=rock beats 2=scissors, 1=paper beats 0, 2=scissors beats 1 + if a == b: return 0 + if (a==0 and b==2) or (a==1 and b==0) or (a==2 and b==1): return +1 + return -1 + +def play_game(model: nn.Module): + print("\n=== Rock–Paper–Scissors: Play vs Tiny CNN ===") + print("Type one of: rock / paper / scissors / quit\n") + while True: + s = input("Your move> ").strip().lower() + if s in ("quit","q","exit"): break + if s not in LABELS: + print("Invalid. 
Try: rock / paper / scissors / quit") + continue + # Generate an image of YOUR move and one for OPPONENT + your_idx = LABELS.index(s) + your_img = render_rps(s) + opp_idx = random.randint(0,2) + opp_img = render_rps(LABELS[opp_idx]) + + # Classify both with the model on CPU + def to_tensor(im): + return torch.from_numpy(np.array(im, dtype=np.float32)/255.0)[None,None,...] + with torch.no_grad(): + y_logits = model(to_tensor(your_img)) + o_logits = model(to_tensor(opp_img)) + y_pred = int(y_logits.argmax(1).item()) + o_pred = int(o_logits.argmax(1).item()) + y_conf = torch.softmax(y_logits,1)[0,y_pred].item() + o_conf = torch.softmax(o_logits,1)[0,o_pred].item() + + print("\nYou played:", s) + print(ascii_show(to_tensor(your_img)[0])) + print(f"Model thinks you played: {LABELS[y_pred]} ({y_conf*100:.1f}%)") + + print("\nOpponent played (hidden):") + print(ascii_show(to_tensor(opp_img)[0])) + print(f"Model thinks opponent played: {LABELS[o_pred]} ({o_conf*100:.1f}%)") + + outcome = beats(y_pred, o_pred) + if outcome>0: print("\n🎉 You win!") + elif outcome<0: print("\n😅 You lose!") + else: print("\n🤝 It's a tie!") + print("-"*50) + + +# --------------------------- +# Main +# --------------------------- def main(): - # Sample balanced dataset (After successfully completing this LP, you can try your own samples) - texts = [ - "I am very happy today", - "This is wonderful", - "I love this movie", - "Great experience", - "I am feeling fantastic", - "This is awesome", - "I am very sad today", - "This is terrible", - "I hate this movie", - "Worst experience ever", - "I am feeling depressed", - "This is awful" - ] - labels = [1] * 6 + [0] * 6 # 1 for positive, 0 for negative - - # Split dataset - train_texts, val_texts, train_labels, val_labels = train_test_split( - texts, labels, test_size=0.2, random_state=42, stratify=labels - ) - - # Create datasets - train_dataset = SentimentDataset(train_texts, train_labels) - val_dataset = SentimentDataset(val_texts, val_labels, vocab=train_dataset.vocab) - - # Create dataloaders - train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True) - val_loader = DataLoader(val_dataset, batch_size=4) - - # Initialize model and training components - model = SentimentClassifier(len(train_dataset.vocab)) - criterion = nn.CrossEntropyLoss() - optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4) - - # Train model - train_model(model, train_loader, val_loader, criterion, optimizer) - - # Save vocabulary - with open('sentiment_vocab.json', 'w') as f: - json.dump(train_dataset.vocab, f) - - # Test mode - model.eval() - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - model = model.to(device) - - while True: - text = input("Enter text to analyze (or 'quit' to exit): ") - if text.lower() == 'quit': - break - - # Preprocess input - indices = [train_dataset.vocab.get(word.lower(), train_dataset.vocab['']) - for word in text.split()] - if len(indices) < train_dataset.max_length: - indices += [train_dataset.vocab['']] * (train_dataset.max_length - len(indices)) - else: - indices = indices[:train_dataset.max_length] - - # Get prediction - with torch.no_grad(): - input_tensor = torch.tensor(indices).unsqueeze(0).to(device) - output = model(input_tensor) - probabilities = torch.softmax(output, dim=1) - prediction = torch.argmax(output).item() - confidence = probabilities[0][prediction].item() * 100 - - sentiment = "Positive" if prediction == 1 else "Negative" - print(f"Sentiment: {sentiment}") - print(f"Confidence: 
{confidence:.2f}%") + ap = argparse.ArgumentParser() + ap.add_argument("--epochs", type=int, default=EPOCHS_DEFAULT) + ap.add_argument("--no-train", action="store_true", help="skip training (use saved weights)") + ap.add_argument("--export", action="store_true", help="export ExecuTorch .pte after training") + ap.add_argument("--play", action="store_true", help="play the mini-game after (or without) training") + args = ap.parse_args() + + # Always save label map for runners + with open(LABELS_JSON, "w") as f: + json.dump({"labels": LABELS}, f, indent=2) + + model = TinyRPS() + + if not args.no_train: + print("== Building synthetic datasets ==") + tr = RPSDataset(TRAIN_SAMPLES_PER_CLASS, train=True) + va = RPSDataset(VAL_SAMPLES_PER_CLASS, train=False) + train_loader = DataLoader(tr, batch_size=BATCH, shuffle=True, num_workers=0) + val_loader = DataLoader(va, batch_size=BATCH, shuffle=False, num_workers=0) + + print(f"Train size: {len(tr)} | Val size: {len(va)}") + + crit = nn.CrossEntropyLoss() + opt = optim.Adam(model.parameters(), lr=LR, weight_decay=1e-4) + + best = -1.0 + for e in range(1, args.epochs+1): + tl, ta = run_epoch(train_loader, model, crit, opt) + vl, vaa = run_epoch(val_loader, model, crit, None) + print(f"Epoch {e:02d}/{args.epochs} | train {ta*100:5.2f}% | val {vaa*100:5.2f}%") + if vaa > best: + best = vaa + torch.save(model.state_dict(), WEIGHTS) + print(f" ↑ saved {WEIGHTS} (val {vaa*100:.2f}%)") + print("Training done.") + else: + print("--no-train: skipping training") + + # Load best weights if present + if os.path.exists(WEIGHTS): + model.load_state_dict(torch.load(WEIGHTS, map_location="cpu")) + model.eval() + print(f"Loaded weights from {WEIGHTS}") + else: + print(f"[warn] No weights file {WEIGHTS}; using random init.") + + if args.export: + try: + export_to_pte(model, PTE_OUT) + except Exception as e: + print("[export] failed:", e) + + if args.play: + play_game(model) if __name__ == "__main__": - main() + main() ``` -### How This Script Works: -- Generates a synthetic dataset of positive and negative sentiment samples. -- Encodes text into numerical format using an embedding layer. -- Trains a compact CNN model for sentiment classification. -- Saves the trained model and vocabulary for inference. -- Once training is complete, the model is saved as tiny_sentiment.pt, and vocabulary is saved to tiny_sentiment_vocab.json. +### About the Script +The script handles the entire workflow: data generation, model training, and a simple command-line game. +- **Synthetic Data Generation:** The script includes a function `render_rps()` that generates 28x28 grayscale images of the letters 'R', 'P', and 'S' with random rotations, blurs, and noise. This creates a diverse dataset that's used to train the model. +- **Model Architecture:** The model, a TinyRPS class, is a simple Convolutional Neural Network (CNN). It uses a series of 2D convolutional layers, followed by pooling layers to reduce spatial dimensions, and finally, fully connected linear layers to produce a final prediction. This architecture is efficient and well-suited for edge devices. +- **Training:** The script generates synthetic training and validation datasets. It then trains the CNN model using the **Adam optimizer** and **Cross-Entropy Loss**. It tracks validation accuracy and saves the best-performing model to `rps_best.pt`. +- **ExecuTorch Export:** A key part of the script is the `export_to_pte()` function. 
This function uses the `torch.export module` (or a fallback) to trace the trained PyTorch model and convert it into an ExecuTorch program (`.pte`). This compiled program is highly optimized for deployment on any target hardware, for example Cortex-M or Cortex-A CPUs for embedded devices. +- **CLI Mini-Game**: After training, you can play an interactive game. The script generates an image of your move and a random opponent's move. It then uses the trained model to classify both images and determines the winner based on the model's predictions. +### Running the Script: + +To train the model, export it, and play the game, run the following command: -To train and test the model with your own inputs, run: ```bash -python ~/executorch/examples/arm/tiny_sentiment.py +python rps_tiny.py --epochs 8 --export --play ``` +You'll see the training progress, where the model's accuracy rapidly improves on the synthetic data. + +```output +== Building synthetic datasets == +Train size: 3000 | Val size: 600 + totl += float(loss)*x.size(0) +Epoch 01/8 | train 80.03% | val 98.67% + ↑ saved rps_best.pt (val 98.67%) +Epoch 02/8 | train 99.57% | val 100.00% + ↑ saved rps_best.pt (val 100.00%) +Epoch 03/8 | train 99.83% | val 99.83% +Epoch 08/8 | train 100.00% | val 100.00% +Training done. +Loaded weights from rps_best.pt +[export] wrote rps_tiny.pte +``` +After training and export, the game will start. Type rock, paper, or scissors and see the model's predictions and what your opponent played. +```output +=== Rock–Paper–Scissors: Play vs Tiny CNN === +Type one of: rock / paper / scissors / quit -{{% notice Note %}} -The output has been truncated -{{% /notice %}} +Your move> rock -The output should look like: -```bash -=== Sentiment Analysis Classifier === -This program demonstrates text sentiment classification using PyTorch - -Loading dataset... -Total examples: 12 -Positive examples: 6 -Negative examples: 6 - -Building vocabulary from training data... -Vocabulary size: 19 words - -Initializing model... -Starting training... -Training on device: cpu - -Epoch 1/20 -Training: 100%|██████████| 3/3 [00:00<00:00, 62.94it/s, loss=0.2385, acc=44.44%] -Validation: 100%|███████| 1/1 [00:00<00:00, 633.87it/s, loss=0.2302, acc=66.67%] - -Epoch Summary: -Train Loss: 0.7154, Train Acc: 44.44% -Val Loss: 0.6906, Val Acc: 66.67% -New best validation accuracy: 66.67%! Saving model... - -. -. -. -Saving vocabulary... - -=== Interactive Testing Mode === -Enter text to analyze sentiment. Type 'quit' to exit. -================================================== -Enter text to analyze (or 'quit' to exit): I am happy - -Processing text: "I am happy" -Tokenization: i am happy -Padding: Added 47 padding tokens - -Analyzing sentiment... - -Result: -Sentiment: Positive -Confidence: 76.67% -================================================== -Enter text to analyze (or 'quit' to exit): I am sad - -Processing text: "I am sad" -Tokenization: i am sad -Padding: Added 47 padding tokens - -Analyzing sentiment... - -Result: -Sentiment: Negative -Confidence: 63.98% -================================================== -Enter text to analyze (or 'quit' to exit): quit +You played: rock + + + + + + .=##*++=-:. + :**-:-=++**+: + .=#+. :+#=. + :*%%#*++==+**-. + -*+::-+#%*+-. + :+*-. -*+- + -*+: -**: + .. .=*+. + .::. +Model thinks you played: rock (100.0%) + +Opponent played (hidden): + + + + + + ..:--*###**- + -#**--. .:+#*. . + .+#- +#+ + -*+. :+#- + .+#+=**###+-. . + -##=:. . + . .+*: + .-** + . 
:== +Model thinks opponent played: paper (100.0%) + +😅 You lose! +-------------------------------------------------- +Your move> paper + +You played: paper + + + + + + .--:. + .=*+++***+=: + :++. :+*- + -+- .-+- + .=*-.. .=+=. + :**+++**+++- + -*- + .++: + :+- +Model thinks you played: paper (100.0%) + +Opponent played (hidden): + + + . + + + .:::::-:::. + .+*=======+*= + .**. +*- . + . .=+. :++: + .=*#*###**+=: + .=+- :=+-. + .=*: .-+=: + . -#-. :=*= + :*: .-+- +Model thinks opponent played: rock (100.0%) + +🎉 You win! +-------------------------------------------------- +Your move> ``` -Do not forget to type 'quit' once you are done testing the model. You are now ready to optimize and convert the model using ExecuTorch. \ No newline at end of file +Type `quit` to exit the game. In the next chapter, you'll prepare the model to run on the FVP. \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/fvp-3.md b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/fvp-3.md index 0b0a9f1ac2..b26333edb0 100644 --- a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/fvp-3.md +++ b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/fvp-3.md @@ -1,66 +1,84 @@ --- -title: Run the model on Corstone-320 FVP +title: Run the model on Corstone-320 FVP weight: 4 ### FIXED, DO NOT MODIFY layout: learningpathall --- -TODO: Ask Annie to try from her end +This section guides you through the process of compiling your trained Rock-Paper-Scissors model and running it on a simulated Arm-based edge device, the Corstone-320 Fixed Virtual Platform (FVP). This final step demonstrates the end-to-end workflow of deploying a TinyML model for on-device inference. + ## Compile and build the executable -Start by setting some environment variables that are used by ExecuTorch. +First, you'll use the Ahead-of-Time (AOT) Arm compiler to convert your PyTorch model into a format optimized for the Arm architecture and the Ethos-U NPU. This process, known as delegation, offloads parts of the neural network graph that are compatible with the NPU, allowing for highly efficient inference. + +Set up your environment variables by running the following commands in your terminal: ```bash export ET_HOME=$HOME/executorch export executorch_DIR=$ET_HOME/build ``` - -Then, generate a `.pte` file using the Arm examples. The Ahead-of-Time (AoT) Arm compiler will enable optimizations for edge devices like the Raspberry Pi and the Corstone-320 FVP. Run it from the ExecuTorch root directory. - -Navigate to the root directory using: +Use the AOT Arm compiler to generate the optimized `.pte` file. This command delegates the model to the Ethos-U85 NPU, applies quantization to reduce model size and improve performance, and specifies the memory configuration. Run it from the ExecuTorch root directory. ```bash -cd ../../ +cd $ET_HOME +python -m examples.arm.aot_arm_compiler --model_name=examples/arm/rps_tiny.py \ +--delegate --quantize --target=ethos-u85-128 \ +--system_config=Ethos_U85_SYS_DRAM_Mid --memory_mode=Dedicated_Sram ``` -You are now in $HOME/executorch and ready to create the model file for ExecuTorch. 
-```bash -cd $ET_HOME -python -m examples.arm.aot_arm_compiler --model_name=examples/arm/tiny_sentiment.py \ ---delegate --quantize --target=ethos-u85-256 \ ---so_library=cmake-out-aot-lib/kernels/quantized/libquantized_ops_aot_lib.so \ ---system_config=Ethos_U85_SYS_DRAM_Mid --memory_mode=Sram_Only +You should see: + +```output +PTE file saved as rps_tiny_arm_delegate_ethos-u85-128.pte ``` -From the Arm Examples directory, you build an embedded Arm runner with the `.pte` included. This allows you to get the most performance out of your model, and ensures compatibility with the CPU kernels on the FVP. Finally, generate the executable `arm_executor_runner`. +Next, you'll build the **Ethos-U runner**, which is a bare-metal executable that includes the ExecuTorch runtime and your compiled model. This runner is what the FVP will execute. Navigate to the runner's directory and use CMake to configure the build. ```bash cd $HOME/executorch/examples/arm/executor_runner - cmake -DCMAKE_BUILD_TYPE=Release \ --DCMAKE_TOOLCHAIN_FILE=$ET_HOME/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake \ --DTARGET_CPU=cortex-m85 \ --DET_DIR_PATH:PATH=$ET_HOME/ \ --DET_BUILD_DIR_PATH:PATH=$ET_HOME/cmake-out \ --DET_PTE_FILE_PATH:PATH=$ET_HOME/tiny_sentiment_arm_delegate_ethos-u85-256.pte \ --DETHOS_SDK_PATH:PATH=$ET_HOME/examples/arm/ethos-u-scratch/ethos-u \ --DETHOSU_TARGET_NPU_CONFIG=ethos-u85-256 \ --DPYTHON_EXECUTABLE=$HOME/executorch-venv/bin/python3 \ --DSYSTEM_CONFIG=Ethos_U85_SYS_DRAM_Mid \ --B $ET_HOME/examples/arm/executor_runner/cmake-out - -cmake --build $ET_HOME/examples/arm/executor_runner/cmake-out --parallel -- arm_executor_runner + -S "$ET_HOME/examples/arm/executor_runner" \ + -B "$ET_HOME/examples/arm/executor_runner/cmake-out" \ + -DCMAKE_TOOLCHAIN_FILE="$ET_HOME/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake" \ + -DTARGET_CPU=cortex-m85 \ + -DET_DIR_PATH="$ET_HOME" \ + -DET_BUILD_DIR_PATH="$ET_HOME/arm_test/cmake-out" \ + -DET_PTE_FILE_PATH="$ET_HOME/rps_tiny_arm_delegate_ethos-u85-128.pte" \ + -DETHOS_SDK_PATH="$ET_HOME/examples/arm/ethos-u-scratch/ethos-u" \ + -DETHOSU_TARGET_NPU_CONFIG=ethos-u85-128 \ + -DSYSTEM_CONFIG=Ethos_U85_SYS_DRAM_Mid +``` + +You should see output similar to this, indicating a successful configuration: +```bash +-- ******************************************************* +-- PROJECT_NAME : ethos-u-corstone-320 +-- TR_ARENA_SIZE : +-- MESSAGE_HANDLER_ARENA_SIZE : +-- ******************************************************* +-- ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE = 0x200000 +-- ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE = +-- Configuring done (17.1s) +-- Generating done (0.2s) +-- Build files have been written to: ~/executorch/examples/arm/executor_runner/cmake-out ``` -Run the model on the Corstone-320 with the following command: +Now, build the executable with CMake: + +```bash +cmake --build "$ET_HOME/examples/arm/executor_runner/cmake-out" -j --target arm_executor_runner +``` + +### Run the Model on the FVP +With the `arm_executor_runner` executable ready, you can now run it on the Corstone-320 FVP to see the model on a simulated Arm device. 
```bash FVP_Corstone_SSE-320 \ --C mps4_board.subsystem.ethosu.num_macs=256 \ +-C mps4_board.subsystem.ethosu.num_macs=128 \ -C mps4_board.visualisation.disable-visualisation=1 \ -C vis_hdlcd.disable_visualisation=1 \ -C mps4_board.telnetterminal0.start_telnet=0 \ @@ -70,37 +88,30 @@ FVP_Corstone_SSE-320 \ ``` {{% notice Note %}} - The argument `mps4_board.visualisation.disable-visualisation=1` disables the FVP GUI. This can speed up launch time for the FVP. - {{% /notice %}} -Observe that the FVP loads the model file. +Observe the output from the FVP. You'll see messages indicating that the model file has been loaded and the inference is running. This confirms that your ExecuTorch program is successfully executing on the simulated Arm hardware. + ```output telnetterminal0: Listening for serial connection on port 5000 telnetterminal1: Listening for serial connection on port 5001 telnetterminal2: Listening for serial connection on port 5002 telnetterminal5: Listening for serial connection on port 5003 -I [executorch:arm_executor_runner.cpp:412] Model in 0x70000000 $ -I [executorch:arm_executor_runner.cpp:414] Model PTE file loaded. Size: 3360 bytes. +I [executorch:arm_executor_runner.cpp:489 main()] PTE in 0x70000000 $ Size: 433968 bytes +I [executorch:arm_executor_runner.cpp:514 main()] PTE Model data loaded. Size: 433968 bytes. +I [executorch:arm_executor_runner.cpp:527 main()] Model buffer loaded, has 1 methods +I [executorch:arm_executor_runner.cpp:535 main()] Running method forward +I [executorch:arm_executor_runner.cpp:546 main()] Setup Method allocator pool. Size: 62914560 bytes. +I [executorch:arm_executor_runner.cpp:563 main()] Setting up planned buffer 0, size 3920. +I [executorch:EthosUBackend.cpp:116 init()] data:0x70000070 ``` -You can now test the model. - -## Test the Model -Test the model with your own inputs with the following command: - - -TODO: Add commands - -```bash - -``` - - -You've successfully trained and tested a CNN model for sentiment analysis on Arm hardware using Executorch. +{{% notice Note %}} +The inference itself may take a longer to run with a model this size - note that this is not a reflection of actual execution time. +{{% /notice %}} -Experiment with different inputs and data samples. This hands-on course showcases the power of TinyML and NLP on resource-constrained devices. +You've now successfully built, optimized, and deployed a computer vision model on a simulated Arm-based system. This hands-on exercise demonstrates the power and practicality of TinyML and ExecuTorch for resource-constrained devices. -In the next Learning Path, we would compare different model performances and inference times, before and after optimization using ExecuTorch. We would also analyze CPU and memory usage during inference. +In a future learning path, you can explore comparing different model performances and inference times before and after optimization. You could also analyze CPU and memory usage during inference, providing a deeper understanding of how the ExecuTorch framework optimizes your model for edge deployment. 
\ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/image.png b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/image.png new file mode 100644 index 0000000000..b548f79463 Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/training-inference-pytorch/image.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/zephyr/_index.md b/content/learning-paths/embedded-and-microcontrollers/zephyr/_index.md index d1b21d9810..83505cecbd 100644 --- a/content/learning-paths/embedded-and-microcontrollers/zephyr/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/zephyr/_index.md @@ -19,7 +19,7 @@ author: Pareena Verma test_images: - amd64/ubuntu:latest test_link: null -test_maintenance: true +test_maintenance: false ### Tags skilllevels: Introductory diff --git a/content/learning-paths/laptops-and-desktops/_index.md b/content/learning-paths/laptops-and-desktops/_index.md index 25ff68127d..b0c3d43298 100644 --- a/content/learning-paths/laptops-and-desktops/_index.md +++ b/content/learning-paths/laptops-and-desktops/_index.md @@ -9,13 +9,13 @@ maintopic: true operatingsystems_filter: - Android: 2 - ChromeOS: 2 -- Linux: 33 +- Linux: 34 - macOS: 9 -- Windows: 44 +- Windows: 45 subjects_filter: - CI-CD: 5 - Containers and Virtualization: 7 -- Migration to Arm: 28 +- Migration to Arm: 29 - ML: 2 - Performance and Architecture: 27 subtitle: Create and migrate apps for power efficient performance @@ -28,6 +28,7 @@ tools_software_languages_filter: - Arm Performance Libraries: 2 - Arm64EC: 1 - Assembly: 1 +- Bash: 1 - C: 8 - C#: 6 - C++: 11 @@ -48,6 +49,7 @@ tools_software_languages_filter: - Intrinsics: 1 - JavaScript: 2 - Kubernetes: 1 +- KVM: 1 - Linux: 1 - LLM: 1 - LLVM: 2 @@ -61,7 +63,9 @@ tools_software_languages_filter: - OpenCV: 1 - perf: 4 - Python: 6 +- QEMU: 1 - Qt: 2 +- RDP: 1 - Remote.It: 1 - RME: 1 - Runbook: 18 diff --git a/content/learning-paths/laptops-and-desktops/win11-vm-automation/_index.md b/content/learning-paths/laptops-and-desktops/win11-vm-automation/_index.md new file mode 100644 index 0000000000..e6ca39d6f4 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win11-vm-automation/_index.md @@ -0,0 +1,53 @@ +--- +title: Windows on Arm virtual machine creation using Arm Linux, QEMU, and KVM + +draft: true +cascade: + draft: true + +minutes_to_complete: 90 + +who_is_this_for: This is for developers and system administrators who want to automate Windows on Arm virtual machine (VM) creation on Arm Linux systems using QEMU and KVM. + +learning_objectives: + - Understand the process of creating Windows on Arm virtual machine using Bash scripts. + - Run scripts for VM creation and management. + - Troubleshoot common VM setup and runtime issues. + - Use Windows on Arm virtual machines for software development and testing. + +prerequisites: + - An Arm Linux system with KVM support and a minimum of 8GB RAM and 50GB free disk space. 
+ +author: Jason Andrews + +### Tags +skilllevels: Introductory +subjects: Migration to Arm +armips: + - Neoverse + - Cortex-A +operatingsystems: + - Linux + - Windows +tools_software_languages: + - QEMU + - KVM + - Bash + - RDP + +further_reading: + - resource: + title: Linaro Wiki - Windows on Arm + link: https://wiki.linaro.org/LEG/Engineering/Kernel/WindowsOnArm + type: documentation + - resource: + title: Botspot Virtual Machine (BVM) Project + link: https://github.com/Botspot/bvm + type: website + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/laptops-and-desktops/win11-vm-automation/_next-steps.md b/content/learning-paths/laptops-and-desktops/win11-vm-automation/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win11-vm-automation/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/laptops-and-desktops/win11-vm-automation/images/win11arm.png b/content/learning-paths/laptops-and-desktops/win11-vm-automation/images/win11arm.png new file mode 100644 index 0000000000..4f31c8b4f5 Binary files /dev/null and b/content/learning-paths/laptops-and-desktops/win11-vm-automation/images/win11arm.png differ diff --git a/content/learning-paths/laptops-and-desktops/win11-vm-automation/prerequisites-1.md b/content/learning-paths/laptops-and-desktops/win11-vm-automation/prerequisites-1.md new file mode 100644 index 0000000000..47b42a4298 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win11-vm-automation/prerequisites-1.md @@ -0,0 +1,80 @@ +--- +title: System requirements +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +If you are building and testing Windows on Arm software you have a variety of options to run Windows on Arm. You can use local laptops, cloud virtual machines, and CI/CD platforms like GitHub Actions for development tasks. + +You can also use a local Arm Linux server to create virtual machines for Windows on Arm software development tasks. This Learning Path explains how to install and use Windows on Arm virtual machines on an Arm Linux system. Two scripts are provided to create and run Windows on Arm virtual machines to make the process easy. + +Before creating a Windows on Arm virtual machine, ensure your Arm Linux system meets the hardware and software requirements. This section covers everything you need to prepare to create a Windows on Arm virtual machine using QEMU and KVM. + +## Hardware requirements + +You need an Arm Linux system with enough performance, memory, and storage to run a Windows on Arm virtual machine. 
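+
+You can get a quick view of the cores, memory, and disk space available on your system with standard Linux commands (these are not part of the provided scripts):
+
+```console
+nproc
+free -g
+df -h $HOME
+```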
+ +The provided scripts have been tested on a [Thelio Astra](https://system76.com/desktops/thelio-astra-a1.1-n1/configure?srsltid=AfmBOoplXbwXifyxppxFe_oyahYMJHUT0bp2BnIBSH5ADjqgZxB7wW75) running Ubuntu 24.04. + +Thelio Astra is an Arm-based desktop computer designed by System76 for autonomous vehicle development and other general-purpose Arm software development. It uses the Ampere Altra processor, which is based on the Arm Neoverse N1 CPU, and ships with the Ubuntu operating system. + +Other Arm Linux systems and other Linux distributions are possible, but have not been tested. General hardware requirements are listed below. + +The minimum hardware requirements for the Arm Linux system are: + +- 8 cores with hardware virtualization support +- 8 GB RAM +- 50 GB free disk space + +The scripts automatically allocate resources as listed below, but the details can be customized for your system. + +- CPU: half of available cores (minimum 4 cores) +- Memory: half of available RAM (minimum 4 GB) +- Disk: 40 GB VM disk + +## KVM support + +Kernel-based Virtual Machine (KVM) support is required for hardware-accelerated virtualization and good VM performance. + +KVM is a virtualization infrastructure built into the Linux kernel that allows you to run virtual machines with near-native performance. It leverages Arm's hardware virtualization extensions to provide efficient CPU virtualization, while QEMU handles device emulation and management. Without KVM, virtual machines run much slower using software emulation. + +Verify your system supports KVM by running: + +```console +sudo apt install cpu-checker -y +sudo kvm-ok +``` + +If KVM is available, you will see the messages: + +```output +INFO: /dev/kvm exists +KVM acceleration can be used +``` + +This confirms that: +- Your CPU supports hardware virtualization +- The KVM kernel module is loaded +- The `/dev/kvm` device exists + +## Required software + +The scripts require several software packages. + +Install the packages using the Linux package manager. + +```console +sudo apt update +sudo apt install qemu-system-arm qemu-utils genisoimage wget curl jq uuid-runtime -y +``` + +If needed, the [Remmina](https://remmina.org/) remote desktop (RDP) client is automatically installed by the run script so you don't need to install it now, but you can install it using the command below. + +```console +sudo apt install remmina remmina-plugin-rdp -y +``` + +Proceed to the next section to learn about the scripts. + diff --git a/content/learning-paths/laptops-and-desktops/win11-vm-automation/understanding-scripts-2.md b/content/learning-paths/laptops-and-desktops/win11-vm-automation/understanding-scripts-2.md new file mode 100644 index 0000000000..b6103b6733 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win11-vm-automation/understanding-scripts-2.md @@ -0,0 +1,100 @@ +--- +title: Understanding the virtual machine scripts + +weight: 3 + +layout: "learningpathall" +--- + +A GitHub project provides two Bash scripts. Understanding their architecture and design will help you use them effectively and enable you to customize the options for your specific needs. + +Start by cloning the project repository from GitHub to your Arm Linux system. + +```bash +git clone https://github.com/jasonrandrews/win11arm.git +cd win11arm +``` + +The remainder of this section explains the structure of the scripts, and the next section provides details to run the scripts to create a Windows virtual machine. + +## Project overview + +The project includes two Bash scripts. 
+ +- VM create script: `create-win11-vm.sh` handles all VM creation tasks +- VM run script: `run-win11-vm.sh` manages VM execution and connectivity + +All configuration is available using command-line options. + +The VM create script also allows you to perform the entire VM creation with a single command or run each individual step to learn and monitor the process. + +This modular approach allows you to understand each component while maintaining the simplicity of automated execution. + +## Virtual machine creation + +The creation script, `create-win11-vm.sh` is responsible for building a complete Windows 11 on Arm VM from scratch. It handles everything from directory setup to Windows installation, with each step clearly defined and independently executable. + +The script handles resource detection and allocation, provides unattended Windows installation, and has a flexible command line to change default values. + +Virtual machine creation includes the following steps: + +- Download the Windows 11 for Arm ISO from Microsoft +- Configure VirtIO drivers for optimal performance +- Set up automated installation with custom credentials +- Create optimized disk images + +### Virtual machine creation details + +The `create-win11-vm.sh` script implements a four-step process that builds a Windows VM incrementally: + +### Step 1: Create VM directory + +Step 1 initializes the VM directory structure and configuration. It creates the VM directory, copies initial configuration files, and sets up the basic environment. As a result, the VM directory, configuration files, and connection profiles are created. + +### Step 2: Download Windows + +Step 2 downloads the Windows 11 ISO and VirtIO drivers. It downloads the Windows 11 Arm ISO from Microsoft, fetches VirtIO drivers, and prepares unattended installation files. The files created during this step include `installer.iso`, `virtio-win.iso`, and the unattended installation directory. This step takes some time as the Windows ISO download is large, but if you already have the file the script will save time and not repeat the download. + +### Step 3: Prepare VM disk image + +Step 3 creates the VM disk image and finalizes the installation setup. It builds the unattended installation ISO, creates the main VM disk image, and configures all installation media. The files created during this step include `disk.qcow2` and `unattended.iso`. + +{{% notice Note %}} +The product key used in the scripts is a generic key provided by Microsoft, which allows installation. This key is for testing purposes only and does not activate Windows. If you plan to continue using Windows beyond installation, you should replace it with a genuine product key. +{{% /notice %}} + +### Step 4: First Windows boot + +Step 4 executes the Windows installation. It boots the VM with installation media, runs the automated Windows setup, and completes the initial configuration. The result is a fully installed and configured Windows on Arm VM. + +Each step builds on the previous one, and you can run them individually for debugging or customization purposes. + +## Virtual machine execution + +The `run-win11-vm.sh` script runs virtual machines by managing their execution and connectivity. + +The script begins by checking if the VM is already active by validating QEMU processes and PID files. If the VM is running, it skips to establishing an RDP connection; otherwise, it proceeds to start the VM. 
+ +Next, the script launches the VM in headless mode, optimized for RDP access, by configuring QEMU with a headless display, setting up port forwarding, and starting the VM as a background daemon process. + +Once the VM is running, the script waits for the RDP service to become available, configures the Remmina client, and establishes a desktop connection. + +This process ensures seamless access to the VM with proper display scaling and input handling. + +## Automatic resource detection and allocation + +The scripts try to manage resources based on your system. + +For CPU allocation, `/proc/cpuinfo` is used to determine the total number of CPU cores and use half of the available cores for the VM. A minimum of 2 cores for creation and 4 cores for runtime are required. + +For memory allocation, `/proc/meminfo` is used to determine total system RAM and allocate half of the available memory for the VM. A minimum of 2GB is required and memory usage is based on system capacity, with an option to override using a command line parameter. + +For storage, the default VM disk size is 40GB in QCOW2 format. The available disk space is validated before creation. + +All settings are customizable using command line arguments. + +## Script Integration and Workflow + +The create and run scripts share the same configuration files. Separating creation from execution enables you to create a VM once and then use the run script repeatedly. + +The next section explains how to create and run a Windows on Arm virtual machine. \ No newline at end of file diff --git a/content/learning-paths/laptops-and-desktops/win11-vm-automation/vm-creation-3.md b/content/learning-paths/laptops-and-desktops/win11-vm-automation/vm-creation-3.md new file mode 100644 index 0000000000..e5572a56ed --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win11-vm-automation/vm-creation-3.md @@ -0,0 +1,262 @@ +--- +title: "Create a Windows on Arm virtual machine" +weight: 4 +layout: "learningpathall" +--- + +## What is the fastest way to create a new Windows on Arm virtual machine? + +The virtual machine creation script creates a complete Windows 11 on Arm virtual machine with the `all` option. The default values are used for all configurable parameters. The location to store the VM files is also provided as an argument. + +To create a new VM, run the command: + +```console +./create-win11-vm.sh all $HOME/win11-vm +``` + +This single command executes all required virtual machine creation steps as explained in the previous section. + +The VM data is stored in the `$HOME/win11-vm` directory, and Windows will install automatically without any user intervention. + +Once the VM creation is complete, you'll see: + +```output +QEMU closed successfully. +Windows installation should be complete! +You can now use: ./run-win11-vm.sh $HOME/win11-vm +``` + +Your Windows on Arm VM is now ready to use. You can proceed to the next section to run the VM or continue on this page to review additional information about modifying default values, running the individual steps of VM creation, and fixing common errors. + +## Configuration options + +The creation script supports several options to customize your virtual machine setup. + +For example, you can change the Windows user, password, and disk size using the arguments shown below. + +```console +./create-win11-vm.sh all $HOME/win11-vm --username MyUser --password MyPass --disksize 60 +``` + +The table below lists the configuration options and default values. 
+
+| Flag | Description | Default Value | Example |
+|------|-------------|---------------|---------|
+| `--username <name>` | Windows user name | `win11arm` | `--username Admin` |
+| `--password <password>` | Windows user password | `win11arm` | `--password MySecurePass` |
+| `--disksize <size>` | Disk size in GB | `40` | `--disksize 60` |
+| `--rdp-port <port>` | RDP port for remote connections | `3389` | `--rdp-port 3390` |
+| `--language <language>` | Windows language | `"English (United States)"` | `--language "English International"` |
+| `--vm-mem <GB>` | VM memory in GB | half of system RAM | `--vm-mem 8` |
+
+
+### Disk space requirements
+
+The creation script checks available disk space before starting.
+
+An estimate of required disk space is shown in the table below.
+
+| Component | Size | Description |
+|-----------|------|-------------|
+| Windows 11 ISO | ~5GB | Downloaded from Microsoft |
+| VirtIO drivers | ~500MB | Performance drivers |
+| VM disk image | Variable | Default is 40 GB |
+| Temporary files | ~1GB | Installation workspace |
+| Total needed | ~7GB + disk size | Example: 47GB for default 40GB disk |
+
+### Configuration examples
+
+Create a VM with custom disk size and network port:
+
+```console
+./create-win11-vm.sh all $HOME/win11-vm --disksize 60 --rdp-port 3390 --username Admin
+```
+
+Set up a VM with English International language:
+
+```console
+./create-win11-vm.sh all $HOME/win11-vm --language "English International"
+```
+
+## Alternative four-step creation process
+
+The VM creation process consists of four distinct steps that can be run individually. Understanding each step helps with troubleshooting and customization.
+
+### Step 1: Create VM directory structure
+
+```console
+./create-win11-vm.sh create $HOME/win11-vm
+```
+
+Command summary:
+- Creates the VM directory at the specified path
+- Sets up the initial directory structure for VM files
+- Creates a `vm-config.txt` file with your configuration settings for reference
+- Copies the Remmina connection template if available
+
+Files created:
+- `vm-config.txt` - Configuration reference file
+- `connect.remmina` - RDP connection template (if available)
+
+Each VM stores its configuration in `vm-config.txt`:
+
+```bash
+# VM Configuration (for reference)
+# Generated by create-win11-vm.sh v2.0.0
+VM_PATH=$HOME/win11-vm
+USERNAME=win11arm
+PASSWORD=win11arm
+DISKSIZE=40
+RDP_PORT=3389
+LANGUAGE=English (United States)
+VM_MEM=8
+CREATED=Thu Aug 28 10:30:45 UTC 2025
+```
+
+This step is lightweight and completes quickly. It establishes the workspace where all VM files will be stored.
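+
+If you want to confirm what this step produced before continuing, you can list the VM directory and print the configuration file. The exact set of files can vary slightly with the script version:
+
+```console
+ls -l $HOME/win11-vm
+cat $HOME/win11-vm/vm-config.txt
+```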
+ +### Step 2: Download Windows 11 and drivers + +```console +./create-win11-vm.sh download $HOME/win11-vm +``` + +Command summary: +- Downloads the Windows 11 Arm64 ISO directly from Microsoft's servers +- Patches the ISO to boot automatically without requiring a keypress +- Downloads VirtIO drivers for optimal VM performance +- Extracts and organizes drivers for the unattended installation +- Creates unattended installation configuration files +- Sets up the autounattend.xml with your specified username, password, and language + +Files created: +- `installer.iso` - Windows 11 Arm64 installation media +- `unattended/` directory - Contains drivers and installation automation files +- `unattended/autounattend.xml` - Windows unattended installation configuration +- `unattended/firstlogin.ps1` - Post-installation script + +If you already downloaded the Windows 11 installer ISO, you can copy it to your VM directory as `installer.iso` before running this step. The script will detect the existing file and ask if you want to use it or download a fresh copy: + +```output +installer.iso already exists. Delete it and download a fresh copy? [Y/n] +``` + +Choosing 'n' will skip the download and use your existing ISO, saving significant time and bandwidth. + +Download Process Details: +The script uses an automated process to download Windows 11 from Microsoft's official servers: + +1. Parse Microsoft's download page - Extracts product edition information +2. Get language SKU ID - Identifies the correct language variant +3. Obtain download link - Retrieves the direct download URL for Arm64 +4. Download and verify - Downloads the ISO and verifies its integrity + +### Step 3: Prepare VM disk + +```console +./create-win11-vm.sh prepare $HOME/win11-vm +``` + +Command summary: +- Creates the `unattended.iso` containing drivers and installation files +- Sets up the main VM hard drive as a QCOW2 disk image +- Allocates the specified disk space with optimized settings +- Prepares all components needed for the automated installation + +Files created: +- `unattended.iso` - ISO containing drivers and automation scripts +- `disk.qcow2` - Main VM hard drive (empty, ready for Windows installation) + +Disk Creation Details: +The script creates a QCOW2 disk image with these optimizations: +- Cluster size: 2 MB for better performance +- No copy-on-write: Disabled for improved I/O performance +- Metadata preallocation: Reduces fragmentation during VM operation + +Important Note: If `disk.qcow2` already exists, the script will warn you that proceeding will delete the existing VM's hard drive and start over with a clean installation. 
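+
+For reference, the disk settings described above map to `qemu-img` options along these lines. This is an illustrative command, not the script's exact invocation:
+
+```console
+qemu-img create -f qcow2 -o cluster_size=2M,preallocation=metadata,nocow=on $HOME/win11-vm/disk.qcow2 40G
+```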
+ +### Step 4: First boot and Windows installation + +```console +./create-win11-vm.sh firstboot $HOME/win11-vm +``` + +Command summary: +- Launches QEMU with the Windows installer +- Boots from the Windows 11 ISO with unattended installation +- Automatically installs Windows with your specified settings +- Installs VirtIO drivers for optimal performance +- Configures the user account and system settings +- Completes the entire Windows setup process without user intervention + +System Requirements Check: +Before starting, the script verifies: +- Desktop environment is available (DISPLAY or WAYLAND_DISPLAY) +- All required files exist (installer.iso, unattended.iso, disk.qcow2) +- Sufficient system resources are available + +Automatic Resource Allocation: +If you don't specify `--vm-mem`, the script automatically allocates: +- Memory: Half of your system's total RAM (minimum 2GB) +- CPU cores: Half of your system's total cores (minimum 2 cores) + +For example, on a system with 16GB RAM and 8 CPU cores: +- VM gets 8GB RAM and 4 CPU cores +- Host system retains 8GB RAM and 4 CPU cores for other tasks + +The script launches QEMU with these settings: +- Machine type: `virt` with KVM acceleration +- CPU: Host CPU passthrough for best performance +- Graphics: RamFB with GTK display and OpenGL acceleration +- Input: USB keyboard and tablet for proper mouse integration +- Network: User-mode networking with virtio-net for performance +- Storage: VirtIO block device with optimized caching +- Random number generator: Hardware entropy for security + +The installation process performs the following steps: +1. UEFI boot - VM starts with UEFI firmware +2. Windows installer loads - Boots from installer.iso +3. Unattended installation begins - Uses autounattend.xml configuration +4. Driver installation - VirtIO drivers installed automatically +5. User account creation - Your specified username and password +6. System configuration - Language, region, and basic settings +7. First login script - Runs firstlogin.ps1 for final setup + +The entire installation process typically takes 20-30 minutes depending on your system's performance. + +## Troubleshooting common problems + +### Insufficient disk space + +If you see an error about insufficient disk space: +```output +Error: Insufficient free disk space. 40 GB is needed, but you only have 25 GB. +``` + +Use the following options to correct the error: +- Free up disk space on your system +- Use a smaller disk size: `--disksize 30` +- Choose a different location with more space + +### Download failures + +If Windows ISO download fails: + +```output +Error: Failed to download Windows 11 installer.iso from Microsoft +``` + +Use the following options to correct the error: +- Check your internet connection +- Try again later (Microsoft can block frequent automated downloads) +- Manually download the ISO from Microsoft's website and save it as `installer.iso` + +### Memory allocation issues + +If the VM fails to start due to memory issues: + +Use the following options to correct the error: +- Reduce VM memory: `--vm-mem 4` +- Close other applications to free system memory + +You now have a good understanding of virtual machine creation. The next section will cover how to run and connect to your VM using the run script. 
\ No newline at end of file diff --git a/content/learning-paths/laptops-and-desktops/win11-vm-automation/vm-execution-4.md b/content/learning-paths/laptops-and-desktops/win11-vm-automation/vm-execution-4.md new file mode 100644 index 0000000000..e05656d100 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/win11-vm-automation/vm-execution-4.md @@ -0,0 +1,282 @@ +--- +title: "Run a Windows on Arm virtual machine" +weight: 5 +layout: "learningpathall" +--- + +## Basic VM launch command + +After your Windows 11 Arm VM is created, launching it is simple with the unified run script: + +```console +./run-win11-vm.sh $HOME/win11-vm +``` + +This single command handles the entire VM startup and connection process automatically. The script performs three key steps: checks if the VM is already running, starts it in headless mode if needed, and connects you via RDP using Remmina. + +When the virtual machine starts you will see it on your Linux desktop: + +![Windows on Arm VM](./images/win11arm.png) + +## What does the run script do? + +Understanding the run script flow helps you troubleshoot issues and customize the VM runtime behavior. + +### Step 1: Check if VM is already running + +The script first checks if your VM is already running to avoid conflicts. + +Here is a fragment of the code: + +```bash +# Check for existing VM process +if [ -f "$vm_path/qemu.pid" ]; then + local vm_pid=$(cat "$vm_path/qemu.pid" 2>/dev/null) + if process_exists "$vm_pid"; then + status "VM is already running (PID: $vm_pid)" + fi +fi +``` + +The run script looks for the `qemu.pid` file in your VM directory, verifies the process ID is still active, cleans up stale PID files from previous sessions, and skips VM startup if already running. + +If this happens you will see output similar to: + +```output +VM is already running (PID: 12345) +Waiting for RDP service on port 3389... +``` + +### Step 2: Start VM in headless mode + +If the VM isn't running, the script starts it in headless mode (no GUI window) using QEMU. + +The arguments to QEMU are shown below: + +```bash +qemu-system-aarch64 \ + -M virt,accel=kvm \ + -cpu host \ + -m ${vm_mem}G \ + -smp $num_cores \ + -name "Windows on Arm" \ + -pidfile "$vm_path/qemu.pid" \ + -display none \ + -netdev user,id=nic,hostfwd=tcp:127.0.0.1:${rdp_port}-:3389 \ + -device virtio-net-pci,netdev=nic \ + -bios /usr/share/qemu-efi-aarch64/QEMU_EFI.fd \ + -drive file="$vm_path/disk.qcow2",if=virtio,discard=unmap,aio=threads,cache=none \ + -daemonize +``` + +The important arguments to QEMU are: +- `-M virt,accel=kvm` - Uses ARM virtualization with KVM acceleration +- `-cpu host` - Passes through your host CPU features for best performance +- `-display none` - Runs headless (no QEMU window) +- `-daemonize` - Runs QEMU as a background daemon +- `-netdev user,hostfwd=...` - Sets up port forwarding for RDP access +- `-pidfile` - Creates a PID file for process management + +The script automatically detects and allocates CPU and memory resources. + +The code is shown below: + +```bash +# Memory: Half of available RAM (minimum 2GB) +local total_ram_gb=$(awk '/MemTotal/ {print int($2/1048576)}' /proc/meminfo) +local vm_mem=$((total_ram_gb / 2)) +[ "$vm_mem" -lt 2 ] && vm_mem=2 + +# CPU: Half of available cores (minimum 4) +local total_cores=$(grep -c ^processor /proc/cpuinfo) +local num_cores=$((total_cores / 2)) +[ "$num_cores" -lt 4 ] && num_cores=4 +``` + +When the run script executes, you will see the CPU and RAM allocated: + +```output +Starting Windows VM in headless mode... 
+Using 8GB RAM and 4 CPU cores +VM started successfully +``` + +### Step 3: Connect via RDP + +Once the VM is running, the script waits for the RDP service and connects automatically. + +Here is the function which waits for the port to be ready: + +```bash +# Wait for RDP service to be available +wait_for_rdp() { + local port="$1" + local max_attempts=60 + + while [ $attempt -le $max_attempts ]; do + if timeout 3 bash -c "echo >/dev/tcp/localhost/$port" 2>/dev/null; then + return 0 + fi + sleep 2 + attempt=$((attempt + 1)) + done +} +``` + +Once the RDP service is ready, Remmina is started and connects. + +The related output is shown below: + +```output +Waiting for RDP service on port 3389... +RDP service is available! +Connecting to VM via RDP (localhost:3389)... +Username: win11arm +``` + +## Run script options and examples + +The run script supports several options for different use cases: + +### Custom RDP port + +```console +./run-win11-vm.sh /path/to/vm --rdp-port 3390 +``` +Uses a custom RDP port, useful when running multiple VMs or avoiding port conflicts. + +### Help information + +```console +./run-win11-vm.sh --help +``` +Displays usage information and all available options. + +## Remmina integration + +The script uses Remmina as the RDP client and creates a Remmina profile with the connection settings. + +The file name is `connect.remmina` and you can review and edit as needed. + +```ini +[remmina] +name=VM Connect +protocol=RDP +scale=2 +quality=9 +disable_fastpath=0 +glyph-cache=0 +multitransport=0 +relax-order-checks=1 +ignore-tls-errors=1 +cert_ignore=1 +window_width=1024 +window_height=768 +window_maximize=0 +disableautoreconnect=1 +viewmode=1 +network=lan #change viewmode=1 to viewmode=3 for fullscreen +sound=local #to get microphone input working, change to sound=remote, and USB passthrough your m +icrophone to the VM. +colordepth=63 +``` + + +## VM shutdown + +The preferred method is to shut down Windows normally from within the virtual machine. + +1. Click the Start button in Windows +2. Select Power → Shut down +3. Wait for Windows to complete shutdown +4. VM automatically stops when Windows finishes shutting down +5. Remmina exits automatically when the connection closes + +You should avoid killing QEMU directly as it may corrupt the VM disk as well as avoid exiting Remmina as it may leave the VM running in the background. + +## Runtime monitoring and management + +### Checking VM status + +To check if your VM is running without connecting: + +```console +# Check for VM process +ps aux | grep "Windows on Arm" + +# Check PID file +cat $HOME/win11-vm/qemu.pid + +# Test RDP connectivity +timeout 3 bash -c "echo >/dev/tcp/localhost/3389" +``` + +If the RDP connectivity fails the output is: + +```output +bash: connect: Connection refused +bash: line 1: /dev/tcp/localhost/3389: Connection refused +``` + +### Resource usage monitoring + +Monitor VM resource usage while running: + +```console +# CPU and memory usage +top -p $(cat $HOME/win11-vm/qemu.pid) + +# Detailed process information +ps -p $(cat $HOME/win11-vm/qemu.pid) -o pid,ppid,cmd,%cpu,%mem,etime +``` + +### Multiple VM management + +Running multiple VMs requires different RDP ports: + +```console +# First VM (default port 3389) +./run-win11-vm.sh $HOME/vm1 + +# Second VM (custom port 3390) +./run-win11-vm.sh $HOME/vm2 --rdp-port 3390 + +# Third VM (custom port 3391) +./run-win11-vm.sh $HOME/vm3 --rdp-port 3391 +``` + +Each VM needs its own directory and unique RDP port to avoid conflicts. 
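If you start the same group of VMs regularly, a short wrapper can keep the directory-to-port mapping consistent. The sketch below is not part of the provided scripts; it assumes the VM directories were already created with `create-win11-vm.sh` and simply calls `run-win11-vm.sh` with an incrementing RDP port for each one:

```bash
#!/usr/bin/env bash
# Launch several VMs, each with its own directory and unique RDP port.
port=3389
for vm_dir in "$HOME/vm1" "$HOME/vm2" "$HOME/vm3"; do
    if [ -d "$vm_dir" ]; then
        ./run-win11-vm.sh "$vm_dir" --rdp-port "$port" &
        port=$((port + 1))
    else
        echo "Skipping $vm_dir: directory not found"
    fi
done
wait
```

Each invocation opens its own RDP session, so shut each VM down from inside Windows, as described above, when you are finished.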
+ +## Troubleshooting runtime issues + +### RDP connection failures + +If RDP connection fails: + +```output +Error: RDP service did not become available after 120 seconds +``` + +Check VM is actually running: + +```console +ps aux | grep qemu-system-aarch64 +``` + +Verify RDP port: +```console +netstat -tlnp | grep 3389 +``` + +### Known Remmina crash issue + +When disconnecting from RDP, Remmina may crash with: + +```output +./run-win11-vm.sh: line 143: 60433 Aborted (core dumped) remmina -c "$remmina_file" $remmina_flags 2> /dev/null +RDP session ended +``` + +This is a known Remmina issue and does not affect VM functionality. + +You have learned how to create Windows on Arm virtual machines on an Arm Linux system with QEMU and KVM. You can use these virtual machines for software development and testing. You can speedup your development tasks by using an Arm Linux desktop or server with high processor count and plenty of RAM. \ No newline at end of file diff --git a/content/learning-paths/mobile-graphics-and-gaming/_index.md b/content/learning-paths/mobile-graphics-and-gaming/_index.md index aae0dcbb19..0ba3f637ac 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/_index.md @@ -9,7 +9,7 @@ key_ip: - Mali maintopic: true operatingsystems_filter: -- Android: 31 +- Android: 32 - Linux: 30 - macOS: 14 - Windows: 14 @@ -17,7 +17,7 @@ subjects_filter: - Gaming: 6 - Graphics: 6 - ML: 12 -- Performance and Architecture: 34 +- Performance and Architecture: 35 subtitle: Optimize Android apps and build faster games using cutting-edge Arm tech title: Mobile, Graphics, and Gaming tools_software_languages_filter: @@ -26,7 +26,7 @@ tools_software_languages_filter: - Android: 4 - Android NDK: 2 - Android SDK: 1 -- Android Studio: 10 +- Android Studio: 11 - Arm Development Studio: 1 - Arm Mobile Studio: 1 - Arm Performance Studio: 3 @@ -38,6 +38,7 @@ tools_software_languages_filter: - CCA: 1 - Clang: 12 - CMake: 1 +- Coding: 1 - Docker: 1 - ExecuTorch: 1 - Frame Advisor: 1 diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/01.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/01.png new file mode 100644 index 0000000000..98a272f84b Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/01.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/02.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/02.png new file mode 100644 index 0000000000..d0b8df7cb0 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/02.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/03.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/03.png new file mode 100644 index 0000000000..80e41973f2 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/03.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/04.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/04.png new file mode 100644 index 0000000000..d098da4e1a Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/04.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/05.png 
b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/05.png new file mode 100644 index 0000000000..8fa7609f69 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/05.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/06.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/06.png new file mode 100644 index 0000000000..a78e5ee6f7 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/06.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/07.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/07.png new file mode 100644 index 0000000000..5993f29b22 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/07.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/08.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/08.png new file mode 100644 index 0000000000..a01e883efc Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/08.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/09.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/09.png new file mode 100644 index 0000000000..64d714c262 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/09.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/10.png b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/10.png new file mode 100644 index 0000000000..571783c51e Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/android_halide/Figures/10.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/_index.md b/content/learning-paths/mobile-graphics-and-gaming/android_halide/_index.md new file mode 100644 index 0000000000..b351d54846 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/android_halide/_index.md @@ -0,0 +1,56 @@ +--- +title: Halide Essentials From Basics to Android Integration + +draft: true +cascade: + draft: true + +minutes_to_complete: 180 + +who_is_this_for: This is an introductory topic for software developers interested in learning how to use Halide for image processing. + +learning_objectives: + - Understand foundational concepts of Halide and set up your development environment. + - Create a basic real-time image processing pipeline using Halide. + - Optimize image processing workflows by applying operation fusion in Halide. + - Integrate Halide pipelines into Android applications developed with Kotlin. 
+ +prerequisites: + - Basic C++ knowledge + - Android Studio with Android Emulator + +author: Dawid Borycki + +### Tags +skilllevels: Introductory +subjects: Performance and Architecture +armips: + - Cortex-A + - Cortex-X +operatingsystems: + - Android +tools_software_languages: + - Android Studio + - Coding + +further_reading: + - resource: + title: Halide 19.0.0 + link: https://halide-lang.org/docs/index.html + type: website + - resource: + title: Halide GitHub + link: https://github.com/halide/Halide + type: repository + - resource: + title: Halide Tutorials + link: https://halide-lang.org/tutorials/ + type: website + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/_next-steps.md b/content/learning-paths/mobile-graphics-and-gaming/android_halide/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/android_halide/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/android.md b/content/learning-paths/mobile-graphics-and-gaming/android_halide/android.md new file mode 100644 index 0000000000..3bb359a6fa --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/android_halide/android.md @@ -0,0 +1,419 @@ +--- +# User change +title: "Integrating Halide into an Android (Kotlin) Project" + +weight: 6 + +layout: "learningpathall" +--- + +## Objective +In this lesson, we’ll learn how to integrate a high-performance Halide image-processing pipeline into an Android application using Kotlin. + +## Overview of mobile integration with Halide +Android is the world’s most widely-used mobile operating system, powering billions of devices across diverse markets. This vast user base makes Android an ideal target platform for developers aiming to reach a broad audience, particularly in applications requiring sophisticated image and signal processing, such as augmented reality, photography, video editing, and real-time analytics. + +Kotlin, now the preferred programming language for Android development, combines concise syntax with robust language features, enabling developers to write maintainable, expressive, and safe code. It offers seamless interoperability with existing Java codebases and straightforward integration with native code via JNI, simplifying the development of performant mobile applications. + +## Benefits of using Halide on mobile +Integrating Halide into Android applications brings several key advantages: +1. Performance. 
Halide enables significant acceleration of complex image processing algorithms, often surpassing the speed of traditional Java or Kotlin implementations by leveraging optimized code generation. By generating highly optimized native code tailored for ARM CPUs or GPUs, Halide can dramatically increase frame rates and responsiveness, essential for real-time or interactive applications. +2. Efficiency. On mobile devices, resource efficiency translates directly to improved battery life and reduced thermal output. Halide’s scheduling strategies (such as operation fusion, tiling, parallelization, and vectorization) minimize unnecessary memory transfers, CPU usage, and GPU overhead. This optimization substantially reduces overall power consumption, extending battery life and enhancing the user experience by preventing overheating. +3. Portability. Halide abstracts hardware-specific details, allowing developers to write a single high-level pipeline that easily targets different processor architectures and hardware configurations. Pipelines can seamlessly run on various ARM-based CPUs and GPUs commonly found in Android smartphones and tablets, enabling developers to support a wide range of devices with minimal platform-specific modifications. +4. Custom Algorithm Integration. Halide allows developers to easily integrate their bespoke image-processing algorithms that may not be readily available or optimized in common libraries, providing full flexibility and control over application-specific performance and functionality. + +In short, Halide delivers high-performance image processing without sacrificing portability or efficiency, a balance particularly valuable on resource-constrained mobile devices. + +### Android development ecosystem and challenges +While Android presents abundant opportunities for developers, the mobile development ecosystem brings its own set of challenges, especially for performance-intensive applications: +1. Limited Hardware Resources. Unlike desktop or server environments, mobile devices have significant constraints on processing power, memory capacity, and battery life. Developers must optimize software meticulously to deliver smooth performance while carefully managing hardware resource consumption. Leveraging tools like Halide allows developers to overcome these constraints by optimizing computational workloads, making resource-intensive tasks feasible on constrained hardware. +2. Cross-Compilation Complexities. Developing native code for Android requires handling multiple hardware architectures (such as armv8-a, ARM64, and sometimes x86/x86_64). Cross-compilation introduces complexities due to different instruction sets, CPU features, and performance characteristics. Managing this complexity involves careful use of the Android NDK, understanding toolchains, and correctly configuring build systems (e.g., Gradle, CMake). Halide helps mitigate these issues by abstracting away many platform-specific optimizations, automatically generating code optimized for target architectures. +3. Image-Format Conversions (Bitmap ↔ Halide Buffer). Android typically handles images through the Bitmap class or similar platform-specific constructs, whereas Halide expects image data to be in raw, contiguous buffer formats. Developers must bridge the gap between Android-specific image representations (Bitmaps, YUV images from camera APIs, etc.) and Halide’s native buffer format. 
Proper management of these conversions—including considerations for pixel formats, stride alignment, and memory copying overhead—can significantly impact performance and correctness, necessitating careful design and efficient implementation of buffer-handling routines. + +## Project requirements +Before integrating Halide into your Android application, ensure you have the necessary tools and libraries. + +### Tools and prerequisites +1. Android Studio. [Download link](https://developer.android.com/studio). +2. Android NDK (Native Development Kit). Can be easily installed from Android Studio (Tools → SDK Manager → SDK Tools → Android NDK). + +## Setting up the Android project +### Creating the project +1. Open Android Studio. +2. Select New Project > Native C++. +![img4](Figures/04.png) + +### Configure the project +1. Set the project Name to Arm.Halide.AndroidDemo. +2. Choose Kotlin as the language. +3. Set Minimum SDK to API 24. +4. Click Next. +![img5](Figures/05.png) +5. Select C++17 from the C++ Standard dropdown list. +![img6](Figures/06.png) +6. Click Finish. + +## Configuring the Android project +Next, configure your Android project to use the files generated in the previous step. First, copy blur_threshold_android.a and blur_threshold_android.h into ArmHalideAndroidDemo/app/src/main/cpp. Ensure your cpp directory contains the following files: +* native-lib.cpp +* blur_threshold_android.a +* blur_threshold_android.h +* CMakeLists.txt + +Open CMakeLists.txt and modify it as follows (replace /path/to/halide with your Halide installation directory): +```cpp +cmake_minimum_required(VERSION 3.22.1) + +project("armhalideandroiddemo") +include_directories( + /path/to/halide/include +) + +add_library(blur_threshold_android STATIC IMPORTED) +set_target_properties(blur_threshold_android PROPERTIES IMPORTED_LOCATION + ${CMAKE_CURRENT_SOURCE_DIR}/blur_threshold_android.a +) + +add_library(${CMAKE_PROJECT_NAME} SHARED native-lib.cpp) + +target_link_libraries(${CMAKE_PROJECT_NAME} + blur_threshold_android + android + log) +``` + +Open build.gradle.kts and modify it as follows: + +```console +plugins { + alias(libs.plugins.android.application) + alias(libs.plugins.kotlin.android) +} + +android { + namespace = "com.arm.armhalideandroiddemo" + compileSdk = 35 + + defaultConfig { + applicationId = "com.arm.armhalideandroiddemo" + minSdk = 24 + targetSdk = 34 + versionCode = 1 + versionName = "1.0" + ndk { + abiFilters += "arm64-v8a" + } + testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" + externalNativeBuild { + cmake { + cppFlags += "-std=c++17" + } + } + } + + buildTypes { + release { + isMinifyEnabled = false + proguardFiles( + getDefaultProguardFile("proguard-android-optimize.txt"), + "proguard-rules.pro" + ) + } + } + compileOptions { + sourceCompatibility = JavaVersion.VERSION_11 + targetCompatibility = JavaVersion.VERSION_11 + } + kotlinOptions { + jvmTarget = "11" + } + externalNativeBuild { + cmake { + path = file("src/main/cpp/CMakeLists.txt") + version = "3.22.1" + } + } + buildFeatures { + viewBinding = true + } +} + +dependencies { + + implementation(libs.androidx.core.ktx) + implementation(libs.androidx.appcompat) + implementation(libs.material) + implementation(libs.androidx.constraintlayout) + testImplementation(libs.junit) + androidTestImplementation(libs.androidx.junit) + androidTestImplementation(libs.androidx.espresso.core) +} +``` + +Click the Sync Now button at the top. 
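Before building, you can confirm from a terminal that all four files referenced by `CMakeLists.txt` are in place. The path below assumes the project directory is `ArmHalideAndroidDemo` in your current working directory:

```console
ls ArmHalideAndroidDemo/app/src/main/cpp
```

You should see `CMakeLists.txt`, `blur_threshold_android.a`, `blur_threshold_android.h`, and `native-lib.cpp` listed.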
To verify that everything is configured correctly, click Build > Make Project in Android Studio. + +## UI +Now, you'll define the application's User Interface, consisting of two buttons and an ImageView. One button loads the image, the other processes it, and the ImageView displays both the original and processed images. +1. Open the res/layout/activity_main.xml file, and modify it as follows: +```XML + + + + +