diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index e8f320a3cc..f88caac4b2 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -21,6 +21,10 @@ on: required: true HUGO_AUDIO_API: required: true + HUGO_DEV_PROG_SIGNIUP_FORM_MUNCHKIN_ID: + required: true + HUGO_FORM_ID_FOR_PROGRAM_SIGNUP: + required: true env: HUGO_VERSION: 0.130.0 @@ -69,6 +73,8 @@ jobs: HUGO_LLM_API: ${{ secrets.HUGO_LLM_API }} HUGO_RAG_API: ${{ secrets.HUGO_RAG_API }} HUGO_AUDIO_API: ${{ secrets.HUGO_AUDIO_API }} + HUGO_DEV_PROG_SIGNIUP_FORM_MUNCHKIN_ID: ${{ secrets.HUGO_DEV_PROG_SIGNIUP_FORM_MUNCHKIN_ID }} + HUGO_FORM_ID_FOR_PROGRAM_SIGNUP: ${{ secrets.HUGO_FORM_ID_FOR_PROGRAM_SIGNUP }} # Deploys website to AWS S3 and invalidate CloudFront Cache - name: Deploy to S3 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 741d7b92b7..4e3dad94e4 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -24,3 +24,5 @@ jobs: HUGO_LLM_API: ${{ secrets.HUGO_LLM_API }} HUGO_RAG_API: ${{ secrets.HUGO_RAG_API }} HUGO_AUDIO_API: ${{ secrets.HUGO_AUDIO_API }} + HUGO_DEV_PROG_SIGNIUP_FORM_MUNCHKIN_ID: ${{ secrets.HUGO_DEV_PROG_SIGNIUP_FORM_MUNCHKIN_ID }} + HUGO_FORM_ID_FOR_PROGRAM_SIGNUP: ${{ secrets.HUGO_FORM_ID_FOR_PROGRAM_SIGNUP }} diff --git a/.github/workflows/production.yml b/.github/workflows/production.yml index 67287cc764..1fbe6be03b 100644 --- a/.github/workflows/production.yml +++ b/.github/workflows/production.yml @@ -24,3 +24,5 @@ jobs: HUGO_LLM_API: ${{ secrets.HUGO_LLM_API }} HUGO_RAG_API: ${{ secrets.HUGO_RAG_API }} HUGO_AUDIO_API: ${{ secrets.HUGO_AUDIO_API }} + HUGO_DEV_PROG_SIGNIUP_FORM_MUNCHKIN_ID: ${{ secrets.HUGO_DEV_PROG_SIGNIUP_FORM_MUNCHKIN_ID }} + HUGO_FORM_ID_FOR_PROGRAM_SIGNUP: ${{ secrets.HUGO_FORM_ID_FOR_PROGRAM_SIGNUP }} diff --git a/.gitignore b/.gitignore index 90df706b60..284d26700f 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,8 @@ node_modules/ package-lock.json .hugo_build.lock .vscode +.env +startup.sh # macOS files *.DS_Store diff --git a/.wordlist.txt b/.wordlist.txt index eca236986f..5eda687586 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -3688,4 +3688,188 @@ ver vit wav za -zh \ No newline at end of file +zh +ACM +APs +ASG +AutoModelForSpeechSeq +AutoProcessor +Avin +Bioinformatics +CDK +Dropbear +DxeCore +EFI +FFTW +HMMER +ILP +IoC +Jython +Khrustalev +LCP +Maranget +Minimap +NFS +NVPL +OMP +OSR +OneAPI +OpenRNG +OpenSSH +Phttps +RNG +Runbook +SSM +Shrinkwrap +Shrinkwrap's +TSO +TSan +TSan's +ThreadSanitizer +Toolkits +ULP +ULPs +VSL +Yury +Zarlez +ada +armtest +atomicity +autoscaling +cmath +cuBLAS +cuDNN +distro's +dtype +dxecore +foir +fourier +getter +libm +miniconda +msg +nInferencing +oneAPI +oneapi +openai +pseudorandom +quasirandom +reorderings +rootfs +runbook +safetensors +superset +sysroot +testroot +threadsanitizercppmanual +toolchain's +transpiles +vectorstore +vlen +vv +webhook +xE +Baichuan +Checkpointing +Choi +CustomData +Das +DataCollatorForSeq +DingTalk +FFXM +FSR +FastLanguageModel +FeaturedContent +FfxmFsr +FurthestReflectionCaptureDistance +GBuffer +GLES +HLSL +Halton +JITTERED +JKS +Jitter +KHR +LoFTQ +LoRA +MNN +MSM +MTL +MaliManga +Mip +Mipmap +NDC +PEFT +Parichay +QLoRA +Qianwen +RCAS +RLHF +RelaxedPrecision +RenderTargets +SFT +SFTTrainer +ScreenPercentage +ShaderQualityMode +ShareGPT +Shuheng +TAA +Taobao +TemporalUpscaler +ThirdParty +Tmall +Tokenize +TrainingArguments +UVs +Unsloth +Upscaling +UpscalingRatio +VRAM +XD +Xianyu +Youku +Zhipu +aar +adamw +agentic 
+antialiasing +arxiv +csn +docstring +ffx +func +getPipelinePermutationFlags +ggml +hoc +jitter +jittered +jittering +lastestlora +luminance +mipmap +mnn +motionVectorScale +mutators +proc +qwen +raytraced +reactiveness +renderdoc +renderers +sharegpt +sym +tokenization +tokenize +tokenizes +tokenizing +tonemapped +tonemapping +trl +unjittered +unsloth +upscaled +upscalers +upscales +upscaling +vl +webbot \ No newline at end of file diff --git a/assets/css/feedback.css b/assets/css/feedback.css index 67bbd2f590..9e5f0b2cad 100644 --- a/assets/css/feedback.css +++ b/assets/css/feedback.css @@ -1,7 +1,6 @@ #feedback-container { min-height: 200px; - } /****************************************************/ diff --git a/assets/events.csv b/assets/events.csv index c18741a394..c908d8883e 100644 --- a/assets/events.csv +++ b/assets/events.csv @@ -1,10 +1,17 @@ Name,Priority,Start Date,End Date,City,Country,Virtual Option,Description,URL,Categories -SOSS Policy Summit,2,2025-03-04,2025-03-04,Washington,United States,FALSE,"Hosted by the Open Source Security Foundation (OpenSSF), an initiative of the Linux Foundation, this event addresses the security challenges associated with Open Source Software (OSS).",https://events.linuxfoundation.org/openssf-policy-summit-dc/,Servers and Cloud Computing -SCaLE,1,2025-03-06,2025-03-09,Pasadena,United States,FALSE,SCaLE is the largest community-run open-source and free software conference in North America. It is held annually in the greater Los Angeles area.,https://www.socallinuxexpo.org/scale/22x,Servers and Cloud Computing -SUSECon,2,2025-03-10,2025-03-14,Orlando,United States,FALSE,Example description. Example description. Example description. Example description. Example description.,https://www.suse.com/susecon/,Servers and Cloud Computing; AI; IoT -Embedded World,1,2025-03-11,2025-03-13,Nuremburg,Germany,FALSE,"Embedded World offers insight into the world of embedded systems, from components and modules to operating systems, hardware and software design, M2M communication, and more.",https://www.embedded-world.de/en,Embedded and Microcontrollers; Automotive -FOSSAsia,2,2025-03-13,2025-03-15,Bangkok,Thailand,TRUE,Example description. Example description. Example description. Example description. Example description.,https://events.fossasia.org/,Servers and Cloud Computing; AI; IoT -NVIDIA GTC,1,2025-03-17,2025-03-21,San Jose,United States,TRUE,"Nvidia GTC is a global artificial intelligence conference for developers that brings together developers, engineers, researchers, inventors, and IT professionals. ",https://www.nvidia.com/gtc/,ML -GDC,1,2025-03-17,2025-03-21,San Fransisco,United States,FALSE,"The Game Developers Conference (GDC) is the world's premier event for developers who make the games we love. GDC is the destination for creativity, innovation, and excellence.",https://gdconf.com/,"Mobile, Graphics, and Gaming" -ATO AI,2,2025-03-17,2025-03-18,Durham,United States,,Example description. Example description. Example description. Example description. Example description.,https://allthingsopen.ai/,AI -KubeCon EU,1,2025-04-01,2025-04-04,London,United Kingdom,TRUE,"Europe's Cloud Native Computing Foundation's flagship conference, this four-day event focuses on Kubernetes and cloud-native technologies, with keynotes, technical sessions and collaboration opportunities. 
",https://events.linuxfoundation.org/kubecon-cloudnativecon-europe/,Servers and Cloud Computing +Kubecon Europe 2025,1,2025-04-01,2025-04-04,London,United Kingdom,TRUE,"Visit Arm at KubeCon + CloudNativeCon Europe 2025 at booth N161. See demos on how Arm boosts AI, cloud native workloads, and Kubernetes performance. ",https://events.linuxfoundation.org/kubecon-cloudnativecon-europe,Servers and Cloud Computing +Google Next,2,2025-04-09,2025-04-11,Las Vegas,United States,FALSE,"This global exhibition of inspiration, innovation, and education is where decision-makers, developers, and anyone passionate about cloud challenges, solutions, 10x ideas, and game-changing technologies come together.",https://cloud.withgoogle.com/next/25,Servers and Cloud Computing; AI +Microsoft Build,1,2025-05-19,2025-05-22,Seattle,United States,TRUE,"Microsoft Build 2025 is the ultimate developer conference, featuring AI, cloud, and software innovations, hands-on workshops, and keynotes from Microsoft leaders. Connect, learn, and build.",https://build.microsoft.com/en-US/home,Laptops and Desktops; Servers and Cloud Computing; AI +Rust Week,3,2025-05-13,2025-05-17,Utrecht,Netherlands,FALSE,"A dedicated series of events focused on the Rust programming language, bringing together developers to discuss best practices, security, and performance optimizations.",https://dev.events/conferences/rust-week-2025-utcccotp,Embedded and Microcontrollers; Servers and Cloud Computing +Linaro Connect,3,2025-05-14,2025-05-16,Lisbon,Portugal,FALSE,"A key event for the Arm ecosystem, bringing together engineers and industry leaders to collaborate on open-source software development and Arm-based solutions.",https://www.linaro.org/connect/,Laptops and Desktops; Servers and Cloud Computing +Embedded Recipes,3,2025-05-14,2025-05-16,Nice,France,FALSE,"A technical conference dedicated to embedded systems developers, covering low-level programming, Linux on embedded devices, and real-time system challenges.",https://embedded-recipes.org/2025/,Embedded and Microcontrollers +Red Hat Summit ,3,2025-05-19,2025-05-22,Boston,United States,FALSE,"An enterprise open-source conference highlighting innovations in Linux, Kubernetes, automation, and AI, with sessions led by industry experts.",https://www.redhat.com/en/summit,Servers and Cloud Computing; AI +Computex,1,2025-05-19,2025-05-23,Taipei,Taiwan,FALSE,"COMPUTEX 2025 is set to be a game-changer, and Arm is taking center stage. 
Join our CEO Keynote for the latest onArm technology, and how we're bringing AI to everyone, everywhere.",https://computex.arm.com/event/a0f7a41d-a2b5-4840-9894-6bd6e9058c7d/home,"Mobile, Graphics, and Gaming; AI" +Kubecon China 2025,2,2025-06-10,2025-06-11,Hong Kong,China,TRUE,"KubeCon is a conference by CNCF, focusing on Kubernetes and cloud-native technologies, featuring keynotes, technical sessions, and workshops.",https://events.linuxfoundation.org/kubecon-cloudnativecon-china/,Servers and Cloud Computing +BSDCan,3,2025-06-13,2025-06-14,Ottawa,Canada,FALSE,"A conference for BSD operating system developers and enthusiasts, offering technical talks, workshops, and discussions.",https://www.bsdcan.org/2025/,Laptops and Desktops; Servers and Cloud Computing +OSS NA,2,2025-06-23,2025-06-25,Denver,United States,TRUE,"The premier event for open-source developers and contributors, covering topics from Linux kernel development to AI, cloud-native computing, and security.",https://events.linuxfoundation.org/open-source-summit-north-america/,Servers and Cloud Computing +Linux Security Summit,3,2025-06-26,2025-06-27,Denver,United States,TRUE,"A highly technical event focusing on advances in Linux security, including kernel hardening, cryptography, access control, and threat mitigation strategies.",https://events.linuxfoundation.org/linux-security-summit-north-america/,Laptops and Desktops; Servers and Cloud Computing +DebCamp & DebConf,3,2025-07-07,2025-07-20,Nantes,France,FALSE,"Annual conference for the Debian community, featuring collaborative development, workshops, and discussions on the future of the Debian Linux distribution.",https://wiki.debian.org/DebConf/25,Laptops and Desktops; Servers and Cloud Computing +We Are Developers,2,2025-07-09,2025-07-11,Berlin,Germany,TRUE,"One of the largest global developer conferences, offering sessions on software engineering trends, AI, DevOps, and emerging programming languages.",https://www.wearedevelopers.com/world-congress,"Mobile, Graphics, and Gaming" +GUADEC,3,2025-07-24,2025-07-29,Brescia,Italy,FALSE,"The leading event for GNOME contributors, featuring talks, hackfests, and discussions about the future of the GNOME desktop environment.",https://events.gnome.org/event/259/,"Laptops and Desktops; Mobile, Graphics and Gaming" +FOSSY,3,2025-07-31,2025-08-03,Portland,United States,FALSE,"An annual event dedicated to the open-source community, bringing together developers, maintainers, and advocates to discuss sustainability, security, and collaboration in open-source projects.", https://2025.fossy.us/, IoT diff --git a/content/install-guides/_images/wperf-winget-installation.gif b/content/install-guides/_images/wperf-winget-installation.gif new file mode 100644 index 0000000000..254b4ff42c Binary files /dev/null and b/content/install-guides/_images/wperf-winget-installation.gif differ diff --git a/content/install-guides/wperf.md b/content/install-guides/wperf.md index 13d0dc006c..ff9694ed96 100644 --- a/content/install-guides/wperf.md +++ b/content/install-guides/wperf.md @@ -39,6 +39,46 @@ WindowsPerf consists of a kernel-mode driver and a user-space command-line tool. You cannot use WindowsPerf on virtual machines, such as cloud instances. {{% /notice %}} +## Using winget (Recommended) + +### Install + +You can now install WindowsPerf directly from [winget](https://learn.microsoft.com/en-us/windows/package-manager/). 
Open an `Administrator` terminal on PowerShell and type + +```console +winget install WindowsPerf +``` + +The output should look like: + +```output +Found WindowsPerf [Arm.WindowsPerf] Version 4.3.1.0 +This application is licensed to you by its owner. +Microsoft is not responsible for, nor does it grant any licenses to, third-party packages. +Downloading https://developer.arm.com/-/cdn-downloads/permalink/WindowsPerf/Installer/windowsperf-4.3.1.msi + 3.07 MB +Successfully verified installer hash +Starting package install... +Successfully installed +``` + +![Winget installation video](/install-guides/_images/wperf-winget-installation.gif) + +It will install the latest available WindowsPerf along with the [WPA plugins](/learning-paths/laptops-and-desktops/windowsperf_wpa_plugin/). To check that the installation was done correctly open a new terminal tab or window and follow the instructions under the [verify installation section](/install-guides/wperf/#verify-install) + +### Uninstall +If you need to uninstall WindowsPerf, open an `Administrator` terminal on PowerShell and run: +```console +winget uninstall WindowsPerf +``` + +The output from a successful uninstallation will look like: +```output +Found WindowsPerf [Arm.WindowsPerf] +Starting package uninstall... +Successfully uninstalled +``` + ## Visual Studio and the Windows Driver Kit (WDK) WindowsPerf relies on `dll` files installed with Visual Studio, from the Community Edition or higher and, optionally, installers from the Windows Driver Kit extension. @@ -86,7 +126,7 @@ Make sure you are in the `windowsperf-bin-` directory: cd windowsperf-bin-4.0.0 ``` -### Install with wperf-devgen {#devgen_install} +### Install with wperf-devgen Navigate to the `wperf-driver` folder and run the installer: @@ -103,7 +143,7 @@ Install requested. Device installed successfully ``` -## Verify install +## Verify install You can check everything is working by running the `wperf` executable. diff --git a/content/learning-paths/cross-platform/_example-learning-path/appendix-3-test.md b/content/learning-paths/cross-platform/_example-learning-path/appendix-3-test.md index a95c3d1dcc..dfb9eec38a 100644 --- a/content/learning-paths/cross-platform/_example-learning-path/appendix-3-test.md +++ b/content/learning-paths/cross-platform/_example-learning-path/appendix-3-test.md @@ -18,9 +18,7 @@ The framework allows you to parse Learning Path articles and generate instructio 2. [Edit Learning Path pages](#edit-learning-path-pages) 3. [Edit metadata](#edit-metadata) 4. [Run the framework](#run-the-framework) -5. [Result summary](#result-summary) -6. [Visualize results](#visualize-results) - +5. [Advanced usage for embedded development](#advanced-usage-for-embedded-development) ## Install dependencies @@ -279,7 +277,7 @@ In the example above, the summary indicates that for this Learning Path all test ## Advanced usage for embedded development ### Using the Corstone-300 FVP -By default, the framework runs instructions on the Docker images specified by the [metadata](#edit-metadata). For embedded development, it is possible to build software in a container instance and then check its behaviour on the Corstone-300 FVP. +By default, the framework runs instructions on the Docker images specified by the [metadata](#edit-metadata). For embedded development, it is possible to build software in a container instance and then check its behavior on the Corstone-300 FVP. For this, all container instances used by the test framework mount a volume in `/shared`. 
This is where software for the target FVP can be stored. To check the execution, the FVP commands just need to be identified as a `fvp` section for the framework. diff --git a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/Overview-1.md b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/1-overview.md similarity index 93% rename from content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/Overview-1.md rename to content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/1-overview.md index 2013211d3b..68b09f11c7 100644 --- a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/Overview-1.md +++ b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/1-overview.md @@ -8,12 +8,11 @@ layout: learningpathall ## TinyML - -This Learning Path is about TinyML. It is a starting point for learning how innovative AI technologies can be used on even the smallest of devices, making Edge AI more accessible and efficient. You will learn how to set up your host machine and target device to facilitate compilation and ensure smooth integration across devices. +This Learning Path is about TinyML. It is a starting point for learning how innovative AI technologies can be used on even the smallest of devices, making Edge AI more accessible and efficient. You will learn how to set up your host machine to facilitate compilation and ensure smooth integration across devices. This section provides an overview of the domain with real-life use cases and available devices. -TinyML represents a significant shift in Machine Learning deployment. Unlike traditional Machine Learning, which typically depends on cloud-based servers or high-performance hardware, TinyML is tailored to function on devices with limited resources, constrained memory, low power, and fewer processing capabilities. +TinyML represents a significant shift in Machine Learning deployment. Unlike traditional Machine Learning, which typically depends on cloud-based servers or high-performance hardware, TinyML is tailored to function on devices with limited resources, constrained memory, low power, and fewer processing capabilities. TinyML has gained popularity because it enables AI applications to operate in real-time, directly on the device, with minimal latency, enhanced privacy, and the ability to work offline. This shift opens up new possibilities for creating smarter and more efficient embedded systems. @@ -36,7 +35,7 @@ Here are some of the key benefits of TinyML on Arm: TinyML is being deployed across multiple industries, enhancing everyday experiences and enabling groundbreaking solutions. The table below shows some examples of TinyML applications. -| Area | Device, Arm IP | Description | +| Area | Example, Arm IP | Description | | ------ | ------- | ------------ | | Healthcare | Fitbit Charge 5, Cortex-M | To monitor vital signs such as heart rate, detect arrhythmias, and provide real-time feedback. | | Agriculture | OpenAg, Cortex-M | To monitor soil moisture and optimize water usage. | @@ -70,4 +69,4 @@ In addition to hardware, there are software platforms that can help you build Ti Edge Impulse offers a suite of tools for developers to build and deploy TinyML applications on Arm-based devices. It supports devices like Raspberry Pi, Arduino, and STMicroelectronics boards. 
-Now that you have an overview of the subject, you can move on to the next section where you will set up an environment on your host machine. \ No newline at end of file +Now that you have an overview of the subject, you can move on to the next section where you will set up a development environment. \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/env-setup-5.md b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/2-env-setup.md similarity index 62% rename from content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/env-setup-5.md rename to content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/2-env-setup.md index 6330b3b78e..a6b0c6c858 100644 --- a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/env-setup-5.md +++ b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/2-env-setup.md @@ -1,6 +1,6 @@ --- # User change -title: "Environment Setup on Host Machine" +title: "Install ExecuTorch" weight: 3 @@ -8,10 +8,16 @@ weight: 3 layout: "learningpathall" --- -In this section, you will prepare a development environment to compile a Machine Learning model. These instructions have been tested on Ubuntu 22.04, 24.04, and on Windows Subsystem for Linux (WSL). +In this section, you will prepare a development environment to compile a machine learning model. + +## Introduction to ExecuTorch + +ExecuTorch is a lightweight runtime designed for efficient execution of PyTorch models on resource-constrained devices. It enables machine learning inference on embedded and edge platforms, making it well-suited for Arm-based hardware. Since Arm processors are widely used in mobile, IoT, and embedded applications, ExecuTorch leverages Arm’s efficient CPU architectures to deliver optimized performance while maintaining low power consumption. By integrating with Arm’s compute libraries, it ensures smooth execution of AI workloads on Arm-powered devices, from Cortex-M microcontrollers to Cortex-A application processors. ## Install dependencies +These instructions have been tested on Ubuntu 22.04, 24.04, and on Windows Subsystem for Linux (WSL). + Python3 is required and comes installed with Ubuntu, but some additional packages are needed: ```bash @@ -45,7 +51,6 @@ Run the commands below to set up the ExecuTorch internal dependencies: ```bash git submodule sync git submodule update --init -./install_requirements.sh ./install_executorch.sh ``` @@ -70,8 +75,4 @@ executorch 0.6.0a0+3eea1f1 ## Next Steps -Your next steps depend on your hardware. - -If you have the Grove Vision AI Module, proceed to [Set up the Grove Vision AI Module V2 Learning Path](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/setup-7-grove/). - -If you do not have the Grove Vision AI Module, you can use the Corstone-320 FVP instead. See the Learning Path [Set up the Corstone-320 FVP](/learning-paths/microcontrollers/introduction-to-tinyml-on-arm/env-setup-6-fvp/). +Proceed to the next section to learn about and set up the virtualized hardware. 
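+
+As an optional check before the next section, you can export a trivial PyTorch module to an ExecuTorch program. The snippet below is a minimal sketch: the module, example input, and file name are placeholders, and the export API shown here (`torch.export` plus `executorch.exir.to_edge`) can differ slightly between ExecuTorch releases.
+
+```python
+import torch
+from executorch.exir import to_edge
+
+class TinyModel(torch.nn.Module):
+    def forward(self, x):
+        # A trivial computation, just to exercise the export flow
+        return torch.nn.functional.relu(x) + 1
+
+model = TinyModel().eval()
+example_inputs = (torch.randn(1, 4),)
+
+# Capture the PyTorch program, lower it to the Edge dialect,
+# then convert it to an ExecuTorch program.
+exported = torch.export.export(model, example_inputs)
+executorch_program = to_edge(exported).to_executorch()
+
+# Write the .pte file that the ExecuTorch runtime loads on-device.
+with open("tiny_model.pte", "wb") as f:
+    f.write(executorch_program.buffer)
+```
+
+If this runs without errors and produces `tiny_model.pte`, the ExecuTorch Python packages are installed correctly.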
diff --git a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/3-env-setup-FVP.md b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/3-env-setup-FVP.md new file mode 100644 index 0000000000..4a1d992d1b --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/3-env-setup-FVP.md @@ -0,0 +1,40 @@ +--- +# User change +title: "Set up the Corstone-320 FVP" + +weight: 5 # 1 is first, 2 is second, etc. + +# Do not modify these elements +layout: "learningpathall" +--- + +In this section, you will run scripts to set up the Corstone-320 reference package. + +The Corstone-320 Fixed Virtual Platform (FVP) is a pre-silicon software development environment for Arm-based microcontrollers. It provides a virtual representation of hardware, allowing developers to test and optimize software before actual hardware is available. Designed for AI and machine learning workloads, it includes support for Arm’s Ethos-U NPU and Cortex-M processors, making it ideal for embedded AI applications. The FVP accelerates development by enabling early software validation and performance tuning in a flexible, simulation-based environment. + +The Corstone reference system is provided free of charge, although you will have to accept the license in the next step. For more information on Corstone-320, check out the [official documentation](https://developer.arm.com/documentation/109761/0000?lang=en). + +## Corstone-320 FVP Setup for ExecuTorch + +Navigate to the Arm examples directory in the ExecuTorch repository. Run the following command. + +```bash +cd $HOME/executorch/examples/arm +./setup.sh --i-agree-to-the-contained-eula +``` + +After the script has finished running, it prints a command to run to finalize the installation. This step adds the FVP executables to your system path. + +```bash +source $HOME/executorch/examples/arm/ethos-u-scratch/setup_path.sh +``` + +Test that the setup was successful by running the `run.sh` script for Ethos-U85, which is the target device for Corstone-320: + +```bash + ./examples/arm/run.sh --target=ethos-u85-256 +``` + +You will see a number of examples run on the FVP. + +This confirms the installation, so you can now proceed to the Learning Path [Build a Simple PyTorch Model](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/build-model-8/). \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/build-model-8.md b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/4-build-model.md similarity index 96% rename from content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/build-model-8.md rename to content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/4-build-model.md index 7d69457845..5559b9f147 100644 --- a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/build-model-8.md +++ b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/4-build-model.md @@ -106,8 +106,6 @@ FVP_Corstone_SSE-320 \ -a "$ET_HOME/examples/arm/executor_runner/cmake-out/arm_executor_runner" ``` - - {{% notice Note %}} The argument `mps4_board.visualisation.disable-visualisation=1` disables the FVP GUI. This can speed up launch time for the FVP. 
@@ -124,4 +122,4 @@ I [executorch:arm_executor_runner.cpp:412] Model in 0x70000000 $ I [executorch:arm_executor_runner.cpp:414] Model PTE file loaded. Size: 3360 bytes. ``` -You have now set up your environment for TinyML development on Arm, and tested a small PyTorch and ExecuTorch Neural Network. \ No newline at end of file +You have now set up your environment for TinyML development on Arm, and tested a small PyTorch and ExecuTorch Neural Network. In the next Learning Path of this series, you will learn about optimizing neural networks to run on Arm. \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/_index.md b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/_index.md index d323fc0bf3..db39cfc760 100644 --- a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/_index.md @@ -9,12 +9,11 @@ learning_objectives: - Describe what differentiates TinyML from other AI domains. - Describe the benefits of deploying AI models on Arm-based edge devices. - Identify suitable Arm-based devices for TinyML applications. - - Set up and configure a TinyML development environment using ExecuTorch and Corstone-320 FVP. + - Set up and configure a TinyML development environment using ExecuTorch and Corstone-320 Fixed Virtual Platform (FVP). prerequisites: - - Basic knowledge of Machine Learning concepts. - - A Linux host machine or VM running Ubuntu 22.04 or higher. - - A [Grove Vision AI Module](https://wiki.seeedstudio.com/Grove-Vision-AI-Module/) or an Arm license to run the Corstone-320 Fixed Virtual Platform (FVP). + - Basic knowledge of Machine Learning concepts + - A Linux computer author: Dominica Abena O. Amanfo @@ -37,23 +36,21 @@ tools_software_languages: - ExecuTorch - Arm Compute Library - GCC - - Edge Impulse - - Node.js further_reading: - resource: - title: TinyML Brings AI to Smallest Arm Devices + title: TinyML Brings AI to Smallest Arm Devices link: https://newsroom.arm.com/blog/tinyml type: blog - resource: - title: Arm Compiler for Embedded - link: https://developer.arm.com/Tools%20and%20Software/Arm%20Compiler%20for%20Embedded + title: Arm Machine Learning Resources + link: https://www.arm.com/developer-hub/embedded-and-microcontrollers/ml-solutions/getting-started type: documentation - resource: - title: Arm GNU Toolchain - link: https://developer.arm.com/Tools%20and%20Software/GNU%20Toolchain + title: Arm Developers Guide for Cortex-M Processors and Ethos-U NPU + link: https://developer.arm.com/documentation/109267/0101 type: documentation - + diff --git a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/env-setup-6-FVP.md b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/env-setup-6-FVP.md deleted file mode 100644 index 01ed1c5f40..0000000000 --- a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/env-setup-6-FVP.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -# User change -title: "Set up the Corstone-320 FVP" - -weight: 5 # 1 is first, 2 is second, etc. 
- -# Do not modify these elements -layout: "learningpathall" ---- - -## Corstone-320 FVP Setup for ExecuTorch - -Navigate to the Arm examples directory in the ExecuTorch repository: - -```bash -cd $HOME/executorch/examples/arm -./setup.sh --i-agree-to-the-contained-eula -``` - -After the script has finished running, it prints a command to run to finalize the installation. This step adds the FVP executables to your system path. - -```bash -source $HOME/executorch/examples/arm/ethos-u-scratch/setup_path.sh -``` - -Test that the setup was successful by running the `run.sh` script for Ethos-U85, which is the target device for Corstone-320: - -```bash - ./examples/arm/run.sh --target=ethos-u85-256 -``` - -You will see a number of examples run on the FVP. - -This confirms the installation, so you can now proceed to the Learning Path [Build a Simple PyTorch Model](/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/build-model-8/). \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/setup-7-Grove.md b/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/setup-7-Grove.md deleted file mode 100644 index 1153269058..0000000000 --- a/content/learning-paths/embedded-and-microcontrollers/introduction-to-tinyml-on-arm/setup-7-Grove.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -# User change -title: "Set up the Grove Vision AI Module V2" - -weight: 6 # 1 is first, 2 is second, etc. - -# Do not modify these elements -layout: "learningpathall" ---- -## Before you begin - -This section requires the Grove Vision AI Module. Due to its constrained environment, we will focus on lightweight, optimized, tools and models. - -### Compilers - -The examples can be built with Arm Compiler for Embedded or Arm GNU Toolchain. - -Use the install guides to install each compiler on your host machine: -- [Arm Compiler for Embedded](/install-guides/armclang/). -- [Arm GNU Toolchain](/install-guides/gcc/arm-gnu/). - -## Board Setup - -![Hardware Overview #center](Overview.png) - -Hardware overview: [Image credits](https://wiki.seeedstudio.com/grove_vision_ai_v2/). - -1. Download and extract the latest Edge Impulse firmware -Grove Vision V2 [Edge impulse Firmware](https://cdn.edgeimpulse.com/firmware/seeed-grove-vision-ai-module-v2.zip). - -2. Connect the Grove Vision AI Module V2 to your computer using the USB-C cable. - -![Board connection](Connect.png) - -{{% notice Note %}} -Ensure the board is properly connected and recognized by your computer. -{{% /notice %}} - -3. In the extracted Edge Impulse firmware, locate and run the `flash_linux.sh` script to flash your device. - - ```console - ./flash_linux.sh - ``` -You have now set up the board successfully. In the next section, you will learn how to use the functionality in the ExecuTorch repository for TinyML, using a hardware emulator. - -{{% notice Note %}} -In the next Learning Path in this series, you will incorporate the board into the workflow, running workloads on real hardware. -{{% /notice %}} - -Continue to the next page to build a simple PyTorch model. 
diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/1.png b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/1.png
new file mode 100644
index 0000000000..aa05c20050
Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/1.png differ
diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/2.png b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/2.png
new file mode 100644
index 0000000000..1827aa3f5e
Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/2.png differ
diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/3.png b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/3.png
new file mode 100644
index 0000000000..2fd7f4c641
Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/3.png differ
diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/_index.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/_index.md
new file mode 100644
index 0000000000..37c76feba0
--- /dev/null
+++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/_index.md
@@ -0,0 +1,71 @@
+---
+title: LLM Fine-Tuning for Mobile Applications
+
+draft: true
+cascade:
+  draft: true
+
+minutes_to_complete: 60
+
+who_is_this_for: This learning path provides an introduction for developers and data scientists new to fine-tuning large language models (LLMs) and looking to develop a fine-tuned LLM for mobile applications. Fine-tuning involves adapting a pre-trained LLM to specific tasks or domains by training it on domain-specific data and optimizing its responses for accuracy and relevance. For mobile applications, fine-tuning enables personalized interactions, enhanced query handling, and improved contextual understanding, making AI-driven features more effective. This Learning Path covers key concepts, techniques, tools, and best practices, ensuring a structured approach to building a fine-tuned LLM that aligns with real-world mobile application requirements, demonstrated with a mobile application built with Llama, KleidiAI, ExecuTorch, and XNNPACK.
+
+learning_objectives:
+  - Learn the basics of large language models (LLMs) and how fine-tuning enhances model performance for specific use cases, focusing on mobile applications.
+  - Understand full fine-tuning, parameter-efficient fine-tuning (e.g., LoRA, QLoRA, PEFT), and instruction-tuning.
+  - Learn when to use different fine-tuning approaches based on model size, task complexity, and computational constraints.
+  - Learn how to curate, clean, and preprocess domain-specific datasets for optimal fine-tuning.
+  - Understand dataset formats, tokenization, and annotation techniques for improving model learning.
+  - Implement fine-tuning with popular frameworks such as Hugging Face Transformers and PyTorch.
+  - Learn how to deploy and run the fine-tuned model on a mobile device.
+  - Compile a Large Language Model (LLM) using ExecuTorch.
+  - Describe techniques for running large language models in a mobile environment.
+ +prerequisites: + - Basic Understanding of Machine Learning & Deep Learning (Familiarity with concepts like supervised learning, neural networks, transfer learning and Understanding of model training, validation, & overfitting concepts). + - Familiarity with Deep Learning Frameworks (Experience with PyTorch for building, training neural networks and Knowledge of Hugging Face Transformers for working with pre-trained LLMs. + - An Arm-powered smartphone with the i8mm feature running Android, with 16GB of RAM. + - A USB cable to connect your smartphone to your development machine. + - An AWS Graviton4 r8g.16xlarge instance to test Arm performance optimizations, or any [Arm based instance](/learning-paths/servers-and-cloud-computing/csp/) from a cloud service provider or an on-premise Arm server or Arm based laptop. + - Python 3.10. + +author: Parichay Das + +### Tags +skilllevels: Introductory +subjects: GenAI +armips: + - Neoverse + +tools_software_languages: + - LLM + - GenAI + - Python + - PyTorch + - ExecuTorch +operatingsystems: + - Linux + - Windows + - Android + + +further_reading: + - resource: + title: Hugging Face Documentation + link: https://huggingface.co/docs + type: documentation + - resource: + title: PyTorch Documentation + link: https://pytorch.org/docs/stable/index.html + type: documentation + - resource: + title: Android + link: https://www.android.com/ + type: website + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/_next-steps.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+---
diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/example-picture.png b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/example-picture.png
new file mode 100644
index 0000000000..c69844bed4
Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/example-picture.png differ
diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-1.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-1.md
new file mode 100644
index 0000000000..299522fa67
--- /dev/null
+++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-1.md
@@ -0,0 +1,65 @@
+---
+title: Overview
+weight: 2
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## What is Fine-Tuning?
+Fine-tuning, in the context of large language models (LLMs), refers to the process of further training a pre-trained LLM on domain-specific or task-specific data to enhance its performance for a particular application. LLMs such as GPT, BERT, and LLaMA are initially trained on massive corpora containing billions of tokens, enabling them to develop a broad linguistic understanding. Fine-tuning refines this knowledge by exposing the model to specialized datasets, allowing it to generate more contextually relevant and accurate responses. Rather than training an LLM from scratch, fine-tuning leverages the pre-existing knowledge embedded in the model, optimizing it for specific use cases such as customer support, content generation, legal document analysis, or medical text processing. This approach significantly reduces computational requirements and data needs while improving adaptability and efficiency in real-world applications.
+
+## Advantages of Fine-Tuning
+Fine-tuning is essential for optimizing large language models (LLMs) to meet specific application requirements, enhance performance, and reduce computational costs. While pre-trained LLMs have broad linguistic capabilities, they may not always produce domain-specific, contextually accurate, or application-tailored responses. Key advantages include:
+- Customization for Specific Domains
+- Improved Response Quality and Accuracy
+- Task-Specific Adaptation
+- Reduction in Computational and Data Requirements
+- Enhanced Efficiency in Real-World Applications
+- Alignment with Ethical, Regulatory, and Organizational Guidelines
+
+## Fine-Tuning Methods
+Fine-tuning an LLM uses different techniques depending on the use case, computational constraints, and efficiency requirements. Below are the key fine-tuning methods:
+
+### Full Fine-Tuning (Supervised Learning Approach)
+Full fine-tuning updates all parameters of the LLM using task-specific data. It requires significant computational power and large labeled datasets, but provides the highest level of customization.
+
+### Instruction Fine-Tuning
+Instruction fine-tuning is a supervised learning method in which a pre-trained large language model (LLM) is further trained on instruction-response pairs to improve its ability to follow human instructions accurately. Its key features are the use of labeled instruction-response pairs, stronger alignment with human intent, common use in chatbots and AI assistants, and better preparation for zero-shot and few-shot learning. A minimal example record is sketched below.
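+
+For illustration, a labeled instruction-response pair might look like the following sketch. The field names and response text are illustrative; real datasets, such as the customer-support dataset used later in this Learning Path, define their own schema.
+
+```python
+# Illustrative only: one instruction-tuning record.
+# Many instruction datasets follow this instruction/response layout.
+example_record = {
+    "instruction": "I have a question about cancelling order {{Order Number}}.",
+    "response": "I understand you want to cancel order {{Order Number}}. Here is how to do it...",
+}
+```
+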
+### Parameter-Efficient Fine-Tuning (PEFT)
+PEFT covers optimized approaches that reduce the number of trainable parameters while maintaining high performance:
+
+- ###### LoRA (Low-Rank Adaptation)
+  - Introduces small trainable weight matrices (rank decomposition) while freezing the main model weights.
+  - Significantly reduces GPU memory usage and training time.
+
+- ###### QLoRA (Quantized LoRA)
+  - Uses quantization (e.g., 4-bit or 8-bit precision) to reduce the memory footprint while applying LoRA fine-tuning.
+  - Ideal for fine-tuning large models on limited hardware.
+
+- ###### Adapter Layers
+  - Inserts small trainable layers between existing layers of the model and keeps most parameters frozen, reducing computational overhead.
+
+- ###### Reinforcement Learning from Human Feedback (RLHF)
+  - Fine-tunes models based on human preferences using reinforcement learning.
+
+- ###### Domain-Specific Fine-Tuning
+  - Fine-tunes the LLM with domain-specific datasets and improves accuracy and relevance in specialized applications.
+
+- ###### Multi-Task Learning (MTL) Fine-Tuning
+  - Trains the model on multiple tasks simultaneously, enabling generalization across different applications.
+
+## Fine-Tuning Implementation
+The following steps need to be performed to implement fine-tuning:
+
+![Fine-tuning implementation steps#center](1.png "Figure 1. Fine-Tuning Implementation")
+
+- Base Model Selection: Choose a pre-trained model based on your use case. You can find pre-trained models at [Hugging Face](https://huggingface.co/).
+- Fine-Tuning Method Finalization: Select the most appropriate fine-tuning method (e.g., supervised, instruction-based, PEFT) based on your use case and dataset. You can typically find various datasets on [Hugging Face](https://huggingface.co/datasets) and [Kaggle](https://www.kaggle.com/datasets).
+- Dataset Preparation: Organize your data for use case-specific training, ensuring it aligns with the model's required format.
+- Training: Utilize frameworks such as TensorFlow and PyTorch to fine-tune the model.
+- Evaluation: Evaluate the model, refine it as needed, and retrain to enhance performance.
\ No newline at end of file
diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-2.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-2.md
new file mode 100644
index 0000000000..1634ff0fc5
--- /dev/null
+++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-2.md
@@ -0,0 +1,49 @@
+---
+title: Fine Tuning Large Language Model - Setup Environment
+weight: 3
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## Fine Tuning Large Language Model - Setup Environment
+
+#### Platform Requirements
+- An AWS Graviton4 r8g.16xlarge instance to test Arm performance optimizations, or any [Arm based instance](/learning-paths/servers-and-cloud-computing/csp/) from a cloud service provider or an on-premise Arm server or Arm based laptop.
+- An Arm-powered smartphone with the i8mm feature running Android, with 16GB of RAM.
+- A USB cable to connect your smartphone to your development machine.
+
+#### Set Up Required Libraries
+The following commands install the necessary libraries for the task, including Hugging Face Transformers, Datasets, and fine-tuning methods.
These libraries facilitate model loading, training, and fine-tuning + +###### The transformers library (by Hugging Face) provides pre-trained LLMs +```python +!pip install transformers + +``` +###### This installs transformers along with PyTorch, ensuring that models are trained and fine-tuned using the Torch backend. +```python +!pip install transformers[torch] +``` +###### The datasets library (by Hugging Face) provides access to a vast collection of pre-built datasets + +```python +!pip install datasets +``` +###### The evaluate library provides metrics for model performance assessment + +```python +!pip install evaluate +``` +###### Speed up fine-tuning of Large Language Models (LLMs) +[Unsloth](https://huggingface.co/unsloth) is a library designed to speed up fine-tuning of Large Language Models (LLMs) while reducing computational costs. It optimizes training efficiency, particularly for LoRA (Low-Rank Adaptation) fine-tuning +```python +%%capture +# %%capture is a Jupyter Notebook magic command that suppresses the output of a cell. + +``` +##### Uninstalls the existing Unsloth installation and installs the latest version directly from the GitHub repository + +```python +!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" +``` \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-3.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-3.md new file mode 100644 index 0000000000..cde0b63c38 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-3.md @@ -0,0 +1,67 @@ +--- +title: Fine Tuning Large Language Model - Load Pre-trained Model & Tokenizer +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Fine Tuning Large Language Model - Load Pre-trained Model & Tokenizer + +#### Load Pre-trained Model & Tokenizer +The following commands Load the pre-trained model and tokenizer, ensuring compatibility with the fine-tuning task and optimizing memory usage + +###### Import Required Modules +- FastLanguageModel: A highly optimized loader for LLaMA models in Unsloth, making it faster and memory-efficient. +- torch: Required for handling tensors and computations. +```python +from unsloth import FastLanguageModel +import torch + +``` +###### Define Model Configuration +- max_seq_length = 2048 → Defines the maximum number of tokens the model can process at once. 
+- dtype = None → Auto-selects Float16 for older GPUs (Tesla T4, V100) +- load_in_4bit = True → Enables 4-bit quantization to reduce memory usage +```python +max_seq_length = 2048 +dtype = None +load_in_4bit = True +``` +###### Load the Pre-trained Model +- Loads a 1B parameter fine-tuned LLaMA model +- Loads the optimized LLaMA model with reduced VRAM usage and faster processing +- Loads the corresponding tokenizer for tokenizing inputs properly + +```python +model, tokenizer = FastLanguageModel.from_pretrained( + model_name = "unsloth/Llama-3.2-1B-Instruct", + max_seq_length = max_seq_length, + dtype = dtype, + load_in_4bit = load_in_4bit, +``` +###### Parameter-Efficient Fine-Tuning (PEFT) using LoRA (Low-Rank Adaptation) for the pre-trained model +- LoRA Rank (r): Defines the rank of the low-rank matrices used in LoRA +- Target Modules: Specifies which layers should be fine-tuned with LoRA, Includes attention layers (q_proj, k_proj, v_proj, o_proj) and feedforward layers (gate_proj, up_proj, down_proj) +- LoRA Alpha (lora_alpha):Scaling factor for LoRA weights and A higher value makes the LoRA layers contribute more to the model's output +- LoRA Dropout: Dropout randomly disables connections to prevent overfitting +- Bias (bias): No additional bias parameters are trained (optimized for efficiency) +- Gradient Checkpointing: Optimized memory-saving method +- Random Seed: Ensures reproducibility across training runs +- Rank-Stabilized LoRA: Rank stabilization not used +- LoFTQ Quantization: No LoFTQ (Low-bit Quantization) applied +```python +model = FastLanguageModel.get_peft_model( + model, + r = 16, + target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj",], + lora_alpha = 16, + lora_dropout = 0, + bias = "none", + use_gradient_checkpointing = "unsloth", + random_state = 3407, + use_rslora = False, + loftq_config = None, +) +``` \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-4.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-4.md new file mode 100644 index 0000000000..ea63e43c22 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-4.md @@ -0,0 +1,75 @@ +--- +title: Fine Tuning Large Language Model - Prepare Dataset +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Fine Tuning Large Language Model - Prepare Dataset +This step prepares the dataset for fine-tuning by formatting it to match the LLaMA-3.1 chat template. + +###### Import Chat Template for Tokenizer +This imports the chat template functionality from Unsloth and It allows us to structure the dataset in a format that LLaMA-3.1 expects +```python +from unsloth.chat_templates import get_chat_template +``` + +###### Apply the Chat Template to Tokenizer +- Apply the Chat Template to Tokenizer. +- Ensures prompt formatting is consistent when training the model. +```python +tokenizer = get_chat_template( + tokenizer, + chat_template = "llama-3.1", +) + + +``` +###### Format Dataset Prompts +- Extracts the instruction column from the dataset. +- Applies the chat template formatting to each instruction. +- Returns a new dictionary with the formatted text. 
+```python +def formatting_prompts_func(examples): + convos = examples["instruction"] + texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos] + return { "text" : texts, } +pass +``` +###### Load the Dataset +- Loads a [customer support chatbot training dataset](https://huggingface.co/datasets/bitext/Bitext-customer-support-llm-chatbot-training-dataset) from Hugging Face +- The dataset contains example conversations with instructions for fine-tuning +- Loads the corresponding tokenizer for tokenizing inputs properly + +```python +from datasets import load_dataset +dataset = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset", split = "train") + +``` +![example image alt-text#center](2.png ) + +###### Import Standardization Function +- Imports standardize_sharegpt, a function that helps in structuring dataset inputs in a ShareGPT-like format (a commonly used format for LLM fine-tuning). +- Ensures that data follows a standardized format required for effective instruction tuning. +```python +from unsloth.chat_templates import standardize_sharegpt +``` +###### Define a Function to Format Dataset +- Extracts the instruction (input text) and response (output text) from the dataset. +- Stores them as "instruction_text" and "response_text". +```python +def formatting_prompts_func(examples): + return { "instruction_text": examples["instruction"], "response_text": examples["response"] } + +``` + +###### Apply Formatting to Dataset +- Applies formatting_prompts_func to every record in the dataset. +- Uses batch processing (batched=True) for efficiency. +```python +def formatting_prompts_func(examples): + return { "instruction_text": examples["instruction"], "response_text": examples["response"] } + +``` +![example image alt-text#center](3.png ) \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-5.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-5.md new file mode 100644 index 0000000000..46f8e38baf --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-5.md @@ -0,0 +1,117 @@ +--- +title: Fine Tuning Large Language Model - Configure and Initialize the Fine-Tuning +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Fine Tuning Large Language Model - Configure and Initialize the Fine-Tuning + +###### Configure and Initialize the Fine-Tuning Trainer +Sets up the Supervised Fine-Tuning (SFT) Trainer to train the model using the prepared dataset. The trainer manages training configurations, optimization, and logging +###### Import Necessary Libraries +- SFTTrainer (from trl) → Handles the fine-tuning process for LLMs using supervised fine-tuning (SFT). +- TrainingArguments (from transformers) → Defines training hyperparameters like batch size, learning rate, and logging. +- DataCollatorForSeq2Seq (from transformers) → Prepares batches of text data for training (handles padding, truncation). +- is_bfloat16_supported() (from unsloth) → Checks if the system supports bfloat16 (a mixed-precision format for optimized training). + +```python +from trl import SFTTrainer +from transformers import TrainingArguments, DataCollatorForSeq2Seq +from unsloth import is_bfloat16_supported + +``` + +###### Initialize the SFTTrainer +- Loads the Model & Tokenizer → Uses the pre-trained LLM and tokenizer. 
+- Specifies the Training Dataset → The dataset (dataset) prepared earlier is used for fine-tuning. +- Sets Maximum Sequence Length → Defines max_seq_length, ensuring model input size is within the supported limit. +- Uses Data Collator for Batching → DataCollatorForSeq2Seq dynamically pads and tokenizes text data. +- Enables Multi-Processing (dataset_num_proc = 2) → Uses two parallel processes for faster data loading. +- Packing (packing = False) → Disables sequence packing, which can speed up training for shorter sequences. +```python +trainer = SFTTrainer( + model = model, + tokenizer = tokenizer, + train_dataset = dataset, + dataset_text_field = "instruction", + max_seq_length = max_seq_length, + data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer), + dataset_num_proc = 2, + packing = False, + +``` +###### Define Training Hyperparameters +- Batch Size (per_device_train_batch_size = 2) → Uses a small batch size to fit within GPU memory. +- Gradient Accumulation (gradient_accumulation_steps = 4) → Accumulates gradients over 4 steps before updating model weights. +- Warmup Steps (warmup_steps = 5) → Gradually increases the learning rate in the initial steps to stabilize training. +- Training Steps (max_steps = 60) → Runs for 60 optimization steps (adjustable for full training). +- Learning Rate (learning_rate = 2e-4) → Sets a moderate learning rate for stable fine-tuning. +- Mixed Precision (fp16 or bf16) → Uses bfloat16 if supported; otherwise, falls back to fp16 for efficient computation. +- Logging (logging_steps = 1) → Logs training progress every step. +- Optimizer (optim = "adamw_8bit") → Uses adamw_8bit, which is a memory-efficient optimizer. +- Weight Decay (weight_decay = 0.01) → Adds regularization to prevent overfitting. +- Learning Rate Scheduler (lr_scheduler_type = "linear") → Linearly decays the learning rate over time. +- Random Seed (seed = 3407) → Ensures reproducibility of training results. +- Output Directory (output_dir = "outputs") → Saves the trained model checkpoints in "outputs" folder. + +```python +args = TrainingArguments( + per_device_train_batch_size = 2, + gradient_accumulation_steps = 4, + warmup_steps = 5, + max_steps = 60, + learning_rate = 2e-4, + fp16 = not is_bfloat16_supported(), + bf16 = is_bfloat16_supported(), + logging_steps = 1, + optim = "adamw_8bit", + weight_decay = 0.01, + lr_scheduler_type = "linear", + seed = 3407, + output_dir = "outputs", +) + +``` + + +###### Fine-Tuning on Responses Only +Modifies the training approach so that the model learns to focus only on responses rather than both instructions and responses + +###### Import Function to Modify Training +- Loads the train_on_responses_only function from Unsloth’s chat templates +```python +from unsloth.chat_templates import train_on_responses_only +``` +###### Apply train_on_responses_only to the Trainer +- Modifies Trainer Behavior → Instead of training the model on full conversations, it now only learns from the assistant's responses. +```python +trainer = train_on_responses_only( + trainer, + instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n", + response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n", +) + +``` +###### Inspecting Tokenized Data and Labels +Inspect how the dataset has been tokenized and prepared for fine-tuning. It checks how input sequences (prompts) and labels (expected model outputs) are formatted. 
+
+###### Decode a Sample Training Input
+- Extracts the tokenized input sequence from the dataset (trainer.train_dataset[5]["input_ids"]).
+- Decodes it back into human-readable text using the tokenizer.
+- This helps verify how instructions and responses were tokenized.
+```python
+tokenizer.decode(trainer.train_dataset[5]["input_ids"])
+space = tokenizer(" ", add_special_tokens = False).input_ids[0]
+tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])
+```
+###### Training the Model
+Initiates the training process using the trainer object, which has been configured with the model, dataset, optimizer, and training parameters.
+- Create an account in [Weights & Biases](https://wandb.ai/)
+- Log in to [Weights & Biases](https://wandb.ai/), or run the [W&B server locally](https://wandb.me/wandb-server)
+- You can locate your [API key](https://wandb.ai/authorize) in your browser at this link
+- Paste the API key from your profile when prompted and press Enter
+```python
+trainer_stats = trainer.train()
+```
\ No newline at end of file
diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-6.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-6.md
new file mode 100644
index 0000000000..6b40c6115b
--- /dev/null
+++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-6.md
@@ -0,0 +1,93 @@
+---
+title: Fine Tuning Large Language Model - Running Inference
+weight: 7
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## Fine Tuning Large Language Model - Running Inference
+
+###### Import Chat Template
+- This function provides a predefined chat format suitable for Llama 3.1.
+- Ensures that prompts are structured correctly for inference.
+
+```python
+from unsloth.chat_templates import get_chat_template
+```
+
+###### Apply Chat Template to the Tokenizer
+- Updates the tokenizer with the Llama 3.1 chat template.
+- Ensures the input messages are formatted according to Llama 3.1's expected structure.
+
+```python
+tokenizer = get_chat_template(
+    tokenizer,
+    chat_template="llama-3.1",
+)
+```
+###### Enable Faster Inference
+- Optimizes the model for low-latency inference.
+- Uses Unsloth's performance improvements to speed up text generation.
+
+```python
+FastLanguageModel.for_inference(model)
+```
+
+###### Define Input Messages
+- Defines a conversation in a structured format, where each message is a dictionary with a role and content.
+```python
+messages = [
+    {"role": "user", "content": "I have a question about cancelling order {{Order Number}}"},
+]
+```
+###### Tokenize Input Messages
+- Converts the messages into tokens.
+- The apply_chat_template() function ensures the model receives the correct chat format.
+- tokenize=True: Converts text into numerical token IDs.
+- add_generation_prompt=True: Appends the assistant header so the model knows a response is expected.
+- return_tensors="pt": Converts input into PyTorch tensors.
+- .to("cuda"): Moves data to the GPU for faster processing.
+```python
+inputs = tokenizer.apply_chat_template(
+    messages,
+    tokenize=True,
+    add_generation_prompt=True, # Must add for generation
+    return_tensors="pt",
+).to("cuda")
+```
+###### Generate Model Output
+- Generates text based on the input.
+- max_new_tokens=64: Limits output length to 64 tokens.
+- use_cache=True: Speeds up generation by caching intermediate results.
+- temperature=1.5: Increases randomness in output (higher value = more diverse text).
+- min_p=0.1: Controls the token probability threshold (avoids unlikely tokens).
+```python
+outputs = model.generate(input_ids=inputs, max_new_tokens=64, use_cache=True, temperature=1.5, min_p=0.1)
+```
+###### Decode the Generated Output
+- Converts the tokenized output back into human-readable text.
+
+```python
+tokenizer.batch_decode(outputs)
+```
+
+###### Save the LoRA Model Locally
+- Saves the model (including its LoRA weights) to a local directory.
+
+```python
+model.save_pretrained("finetune_mobilebot")
+```
+###### Save the Tokenizer Locally
+- Saves the tokenizer configuration into the "finetune_mobilebot" directory.
+
+```python
+tokenizer.save_pretrained("finetune_mobilebot")
+```
+After saving the fine-tuned model, you can integrate it into your mobile application for real-time inference using ExecuTorch. Load the model and tokenizer within the mobile environment, deploy them efficiently on-device with ExecuTorch's lightweight execution framework, and serve responses directly based on user inputs. Further details on implementation are explained in the next steps.
\ No newline at end of file
diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-7.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-7.md
new file mode 100644
index 0000000000..bd44002032
--- /dev/null
+++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-7.md
@@ -0,0 +1,109 @@
+---
+title: Mobile Platform for Fine Tuning Large Language Model
+weight: 8
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## Development environment
+You will learn to build the ExecuTorch runtime for fine-tuned models using KleidiAI, create JNI libraries for a mobile application, and integrate these libraries into the application.
+
+The first step is to set up a development environment with the necessary software:
+- Python 3.10 or later
+- Git
+- Java 17 JDK
+- Latest version of Android Studio
+- Android NDK
+
+###### Installation of Android Studio and Android NDK
+- Download and install the latest version of Android Studio.
+- Launch Android Studio and open the Settings dialog.
+- Go to Languages & Frameworks > Android SDK.
+- In the SDK Platforms tab, select Android 14.0 ("UpsideDownCake").
+- Install the required version of Android NDK by first setting up the Android command line tools.
+
+###### Install Java 17 JDK
+- Open the [Java SE 17 Archive Downloads](https://www.oracle.com/java/technologies/javase/jdk17-archive-downloads.html) page in your browser.
+- Choose the appropriate version for your operating system.
+- Downloads are available for macOS and Linux.
+
+###### Install Git and CMake
+
+For macOS use [Homebrew](https://brew.sh/):
+
+``` bash
+brew install git cmake
+```
+
+For Linux, use the package manager for your distribution:
+
+``` bash
+sudo apt install git-all cmake
+```
+
+###### Install Python 3.10
+
+For macOS:
+
+``` bash
+brew install python@3.10
+```
+
+For Linux:
+
+``` bash
+sudo apt update
+sudo apt install software-properties-common -y
+sudo add-apt-repository ppa:deadsnakes/ppa
+sudo apt install python3.10 python3.10-venv
+```
+
+###### Set up the [ExecuTorch](https://pytorch.org/executorch/stable/intro-overview.html) environment
+For mobile device execution, [ExecuTorch](https://pytorch.org/executorch/stable/intro-overview.html) is required.
It enables efficient on-device model deployment and execution + +- Python virtual environment creation + +```bash +python3.10 -m venv executorch +source executorch/bin/activate +``` + +The prompt of your terminal has `executorch` as a prefix to indicate the virtual environment is active. + +- Conda virtual environment creation + +Install Miniconda on your development machine by following the [Installing conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) instructions. + +Once `conda` is installed, create the environment: + +```bash +conda create -yn executorch python=3.10.0 +conda activate executorch +``` + +###### Clone ExecuTorch and install the required dependencies + +From within the conda environment, run the commands below to download the ExecuTorch repository and install the required packages: + +- You need to download Executorch from this [GitHub repository](https://github.com/pytorch/executorch/tree/main) +- Download the executorch.aar file from [executorch.aar](https://ossci-android.s3.us-west-1.amazonaws.com/executorch/release/executorch-241002/executorch.aar ) +- Add a libs folder in this path \executorch-main\executorch-main\examples\demo-apps\android\LlamaDemo\app\libs and add executorch.aar + +``` bash +git submodule sync +git submodule update --init +./install_requirements.sh +./install_requirements.sh --pybind xnnpack +./examples/models/llama/install_requirements.sh +``` + +###### Mobile Device Setup +- Enable the mobile device in [Android Studio](https://support.google.com/android/community-guide/273205728/how-to-enable-developer-options-on-android-pixels-6-secret-android-tips?hl=en) +- On the Android phone, enable Developer Options + - First, navigate to Settings > About Phone. + - At the bottom, locate Build Number and tap it seven times. A message will appear confirming that you are now a developer.(if only it were that easy to become one XD) + - Access Developer Options by navigating to Settings > System > Developer Options. + - You will see a large number of options, I repeat: DO NOT TOUCH ANYTHING YOU DO NOT KNOW. + - Enable USB Debugging to connect your mobile device to Android Studio. diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-8.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-8.md new file mode 100644 index 0000000000..544f158c2a --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-8.md @@ -0,0 +1,53 @@ +--- +title: Fine Tune Large Language Model and Quantization +weight: 9 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +#### Llama Model +Llama is a family of large language models designed for high-performance language processing tasks, trained using publicly available data. When fine-tuned, Llama-based models can be optimized for specific applications, enhancing their ability to generate accurate and context-aware responses. Fine-tuning enables the model to adapt to domain-specific data, improving performance in tasks such as: + +- Language translation – Enhancing fluency and contextual accuracy. +- Question answering – Providing precise and relevant responses. +- Text summarization – Extracting key insights while maintaining coherence. + +Fine-tuned LLaMA models are also highly effective in generating human-like text, making them valuable for: + +- Chatbots – Enabling intelligent and context-aware interactions. 
+- Virtual assistants – Enhancing responsiveness and personalization.
+- Creative writing – Generating compelling and structured narratives.
+
+By fine-tuning Llama-based models, their adaptability and relevance can be significantly improved, allowing seamless integration into specialized AI applications. Please note that the models are subject to the [acceptable use policy](https://github.com/facebookresearch/llama/blob/main/USE_POLICY.md) and [this responsible use guide](https://ai.meta.com/static-resource/responsible-use-guide/).
+
+#### Results
+
+LLaMA 2 and LLaMA 3 models require at least 4-bit quantization to fit within the memory constraints of certain smartphones.
+
+#### Quantization
+
+To optimize models for smartphone memory constraints, 4-bit groupwise per-token dynamic quantization can be applied to all linear layers. In this approach:
+
+- Dynamic quantization is used for activations, where quantization parameters are computed at runtime based on the min/max range.
+- Static quantization is applied to weights, which are per-channel groupwise quantized using 4-bit signed integers.
+
+This method ensures efficient memory usage while maintaining model performance on resource-constrained devices.
+
+For further information, refer to [torchao: PyTorch Architecture Optimization](https://github.com/pytorch-labs/ao/).
+
+The table below evaluates WikiText perplexity using [LM Eval](https://github.com/EleutherAI/lm-evaluation-harness).
+
+The results are for two different group sizes, with max_seq_len 2048 and 1000 samples:
+
+|Model | Baseline (FP32) | Groupwise 4-bit (128) | Groupwise 4-bit (256) |
+|--------|-----------------|-----------------------|-----------------------|
+|Llama 2 7B | 9.2 | 10.2 | 10.7 |
+|Llama 3 8B | 7.9 | 9.4 | 9.7 |
+
+Note that a group size smaller than 128 was not enabled in this example, because the model was still too large. This is because current efforts have focused on enabling FP32, and support for FP16 is under way.
+
+What this implies for model size is:
+
+1. The embedding table is stored in FP32.
+2. The quantized weight scales are stored in FP32.
\ No newline at end of file
diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-9.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-9.md
new file mode 100644
index 0000000000..331dc017d8
--- /dev/null
+++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-mobile-applications/how-to-9.md
@@ -0,0 +1,34 @@
+---
+title: Prepare the Fine-Tuned Large Language Model for ExecuTorch and Mobile Deployment
+weight: 10
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+#### Fine-Tuned Model Preparation
+
+- On [Hugging Face](https://huggingface.co/), apply for repository access to [Meta's Llama 3.2 language models](https://huggingface.co/meta-llama/Llama-3.2-1B).
+- Download params.json and tokenizer.model from the [Llama website](https://www.llama.com/llama-downloads/) or [Hugging Face](https://huggingface.co/meta-llama/Llama-3.2-1B).
+- After fine-tuning the model, export the adapter_model.safetensors file locally, convert it to adapter_model.pth, and then export it to the .pte format (a conversion sketch follows this list).
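+
+The conversion from adapter_model.safetensors to adapter_model.pth is not shown elsewhere in this Learning Path. The snippet below is a minimal sketch of that step; it assumes the adapter was saved to the finetune_mobilebot directory created earlier, so adjust the paths to match your setup:
+
+```python
+# Minimal sketch: convert a safetensors adapter checkpoint to a PyTorch .pth file.
+# The paths are assumptions based on the earlier save_pretrained() call - change them as needed.
+import torch
+from safetensors.torch import load_file
+
+state_dict = load_file("finetune_mobilebot/adapter_model.safetensors")
+torch.save(state_dict, "finetune_mobilebot/adapter_model.pth")
+```
+
+Depending on your export flow, you may also need to merge the adapter weights into the base model before exporting; check the ExecuTorch Llama export documentation for the exact checkpoint format it expects. The export command below then produces the .pte file that is deployed to the device.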
+ +```python + python -m examples.models.llama.export_llama \ + --checkpoint \ + -p \ + -kv \ + --use_sdpa_with_kv_cache \ + -X \ + -qmode 8da4w \ + --group_size 128 \ + -d fp32 \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ + --embedding-quantize 4,32 \ + --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" +``` + +- Build the Llama Runner binary for [Android](https://learn.arm.com/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/5-run-benchmark-on-android/). +- Build and Run [Android](https://learn.arm.com/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/6-build-android-chat-app/). +- Open Android Studio and choose "Open an existing Android Studio project" to navigate to examples/demo-apps/android/LlamaDemo and Press Run (^R) to build and launch the app on your phone. +- Tap the Settings widget to select a model, configure its parameters, and set any prompts. +- After choosing the model, tokenizer, and model type, click "Load Model" to load it into the app and return to the main Chat activity. \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/1.png b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/1.png new file mode 100644 index 0000000000..f8dd45e42e Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/1.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/2.png b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/2.png new file mode 100644 index 0000000000..1827aa3f5e Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/2.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/3.png b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/3.png new file mode 100644 index 0000000000..2fd7f4c641 Binary files /dev/null and b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/3.png differ diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/_index.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/_index.md new file mode 100644 index 0000000000..b101e24449 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/_index.md @@ -0,0 +1,63 @@ +--- +title: LLM Fine-Tuning for Web Applications + +draft: true +cascade: + draft: true + +minutes_to_complete: 60 + +who_is_this_for: This learning path provides an introduction for developers and data scientists new to fine-tuning large language models (LLMs) and looking to develop a fine-tuned LLM for web applications. Fine-tuning involves adapting a pre-trained LLM to specific tasks or domains by training it on domain-specific data and optimizing its responses for accuracy and relevance. For web applications, fine-tuning enables personalized interactions, enhanced query handling, and improved contextual understanding, making AI-driven features more effective. 
This session will cover key concepts, techniques, tools, and best practices, ensuring a structured approach to building a fine-tuned LLM that aligns with real-world web application requirements. + +learning_objectives: + - Learn the basics of large language models (LLMs) and how fine-tuning enhances model performance for specific use cases. + - Understand full fine-tuning, parameter-efficient fine-tuning (e.g., LoRA, QLoRA, PEFT), and instruction-tuning. + - Learn when to use different fine-tuning approaches based on model size, task complexity, and computational constraints. + - Learn how to curate, clean, and preprocess domain-specific datasets for optimal fine-tuning. + - Understand dataset formats, tokenization, and annotation techniques for improving model learning. + - Implementing Fine-Tuning with Popular Frameworks like Hugging Face Transformers and PyTorch for LLM fine-tuning. + +prerequisites: + - An AWS Graviton4 r8g.16xlarge instance to test Arm performance optimizations, or any [Arm based instance](/learning-paths/servers-and-cloud-computing/csp/) from a cloud service provider or an on-premise Arm server or Arm based laptop. + - Basic Understanding of Machine Learning & Deep Learning (Familiarity with concepts like supervised learning, neural networks, transfer learning and Understanding of model training, validation, & overfitting concepts). + - Familiarity with Deep Learning Frameworks (Experience with PyTorch for building, training neural networks and Knowledge of Hugging Face Transformers for working with pre-trained LLMs. + +author: Parichay Das + +### Tags +skilllevels: Introductory +subjects: GenAI +armips: + - Neoverse + +tools_software_languages: + - LLM + - GenAI + - Python + - PyTorch + - ExecuTorch +operatingsystems: + - Linux + - Windows + + + +further_reading: + - resource: + title: Hugging Face Documentation + link: https://huggingface.co/docs + type: documentation + - resource: + title: PyTorch Documentation + link: https://pytorch.org/docs/stable/index.html + type: documentation + + + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/_next-steps.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-1.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-1.md new file mode 100644 index 0000000000..07eb329938 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-1.md @@ -0,0 +1,65 @@ +--- +title: Overview +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## What is Fine-Tuning +Fine-tuning in the context of large language models (LLMs) refers to the process of further training a pre-trained LLM on domain-specific or task-specific data to enhance its performance for a particular application. LLMs, such as GPT, BERT, and LLaMA, are initially trained on massive corpora containing billions of tokens, enabling them to develop a broad linguistic understanding. Fine-tuning refines this knowledge by exposing the model to specialized datasets, allowing it to generate more contextually relevant and accurate responses. Rather than training an LLM from scratch, fine-tuning leverages the pre-existing knowledge embedded in the model, optimizing it for specific use cases such as customer support, content generation, legal document analysis, or medical text processing. This approach significantly reduces computational requirements and data needs while improving adaptability and efficiency in real-world applications. + +## Advantage of Fine-Tuning +Fine-tuning is essential for optimizing large language models (LLMs) to meet specific application requirements, enhance performance, and reduce computational costs. While pre-trained LLMs have broad linguistic capabilities, they may not always produce domain-specific, contextually accurate, or application-tailored responses +- Customization for Specific Domains +- Improved Response Quality and Accuracy +- Task-Specific Adaptation +- Reduction in Computational and Data Requirements +- Enhanced Efficiency in Real-World Applications +- Alignment with Ethical, Regulatory, and Organizational Guidelines + +## Fine-Tuning Methods +Fine-tuning LLM uses different techniques based on the various use cases, computational constraints, and efficiency requirements. Below are the key fine-tuning methods: + +### Full Fine-Tuning (Supervised Learning Approach) +It involves updating all parameters of the LLM using task-specific data, requiring significant computational power and large labeled datasets, which provides the highest level of customization. + +### Instruction Fine-Tuning +Instruction fine-tuning is a supervised learning method. A pre-trained large language model (LLM) is further trained on instruction-response pairs to improve its ability to follow human instructions accurately. Instruction Fine-Tuning has some key features using Labeled Instruction-Response Pairs, Enhances Model Alignment with Human Intent, Commonly Used in Chatbots and AI Assistants, and Prepares Models for Zero-Shot and Few-Shot Learning. + +### Parameter-Efficient Fine-Tuning (PEFT) +It is a optimized approaches that reduce the number of trainable parameters while maintaining high performance: + +- ###### LoRA (Low-Rank Adaptation) + - Introduces small trainable weight matrices (rank decomposition) while freezing the main model weights. + - It will significantly reduce GPU memory usage and training time. 
+ +- ###### QLoRA (Quantized LoRA) + - It will use quantization (e.g., 4-bit or 8-bit precision) to reduce memory footprint while applying LoRA fine-tuning. + - It is Ideal for fine-tuning large models on limited hardware. + +- ###### Adapter Layers + - Inserts small trainable layers between existing layers of the model and Keeps most parameters frozen, reducing computational overhead. + +- ###### Reinforcement Learning from Human Feedback (RLHF) + - Fine-tunes models based on human preferences using reinforcement learning. + +- ###### Domain-Specific Fine-Tuning + - Fine-tunes the LLM with domain-specific datasets and Improves accuracy and relevance in specialized applications. + +- ###### Multi-Task Learning (MTL) Fine-Tuning + - Trains the model on multiple tasks simultaneously, enabling generalization across different applications. + + + +## Fine-Tuning Implementaion +The following steps need to be performed to implement fine-tuning: + + +![example image alt-text#center](1.png "Figure 1. Fine-Tuning Implementaion") + +- Base Model Selection: Choose a pre-trained model based on your use cases. You can find pre-trained models at [Hugging Face](https://huggingface.co/) +- Fine-Tuning Method Finalization: Select the most appropriate fine-tuning method (e.g., supervised, instruction-based, PEFT) based on your use case and dataset. You can typically find various datasets on [Hugging Face](https://huggingface.co/datasets) and [Kaggle](https://www.kaggle.com/datasets). +- Dataset Prepration:Organize your data for your use case-specific training, ensuring it aligns with the model's required format. +- Training:Utilize frameworks such as TensorFlow and PyTorch to fine-tune the model. +- Evaluate: Evaluate the model, refine it as needed, and retrain to enhance performance \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-2.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-2.md new file mode 100644 index 0000000000..615633a0e7 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-2.md @@ -0,0 +1,47 @@ +--- +title: Fine Tuning Large Language Model - Setup Environment +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Fine Tuning Large Language Model - Setup Environment + +#### Plartform Required +An AWS Graviton4 r8g.16xlarge instance to test Arm performance optimizations, or any [Arm based instance](/learning-paths/servers-and-cloud-computing/csp/) from a cloud service provider or an on-premise Arm server or Arm based laptop. + +#### Set Up Required Libraries +The following commands install the necessary libraries for the task, including Hugging Face Transformers, Datasets, and fine-tuning methods. These libraries facilitate model loading, training, and fine-tuning + +###### The transformers library (by Hugging Face) provides pre-trained LLMs +```python +!pip install transformers + +``` +###### This installs transformers along with PyTorch, ensuring that models are trained and fine-tuned using the Torch backend. 
+```python +!pip install transformers[torch] +``` +###### The datasets library (by Hugging Face) provides access to a vast collection of pre-built datasets + +```python +!pip install datasets +``` +###### The evaluate library provides metrics for model performance assessment + +```python +!pip install evaluate +``` +###### Speed up fine-tuning of Large Language Models (LLMs) +[Unsloth](https://huggingface.co/unsloth) is a library designed to speed up fine-tuning of Large Language Models (LLMs) while reducing computational costs. It optimizes training efficiency, particularly for LoRA (Low-Rank Adaptation) fine-tuning +```python +%%capture +# %%capture is a Jupyter Notebook magic command that suppresses the output of a cell. + +``` +##### Uninstalls the existing Unsloth installation and installs the latest version directly from the GitHub repository + +```python +!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" +``` \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-3.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-3.md new file mode 100644 index 0000000000..cde0b63c38 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-3.md @@ -0,0 +1,67 @@ +--- +title: Fine Tuning Large Language Model - Load Pre-trained Model & Tokenizer +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Fine Tuning Large Language Model - Load Pre-trained Model & Tokenizer + +#### Load Pre-trained Model & Tokenizer +The following commands Load the pre-trained model and tokenizer, ensuring compatibility with the fine-tuning task and optimizing memory usage + +###### Import Required Modules +- FastLanguageModel: A highly optimized loader for LLaMA models in Unsloth, making it faster and memory-efficient. +- torch: Required for handling tensors and computations. +```python +from unsloth import FastLanguageModel +import torch + +``` +###### Define Model Configuration +- max_seq_length = 2048 → Defines the maximum number of tokens the model can process at once. 
+- dtype = None → Auto-selects Float16 for older GPUs (Tesla T4, V100) +- load_in_4bit = True → Enables 4-bit quantization to reduce memory usage +```python +max_seq_length = 2048 +dtype = None +load_in_4bit = True +``` +###### Load the Pre-trained Model +- Loads a 1B parameter fine-tuned LLaMA model +- Loads the optimized LLaMA model with reduced VRAM usage and faster processing +- Loads the corresponding tokenizer for tokenizing inputs properly + +```python +model, tokenizer = FastLanguageModel.from_pretrained( + model_name = "unsloth/Llama-3.2-1B-Instruct", + max_seq_length = max_seq_length, + dtype = dtype, + load_in_4bit = load_in_4bit, +``` +###### Parameter-Efficient Fine-Tuning (PEFT) using LoRA (Low-Rank Adaptation) for the pre-trained model +- LoRA Rank (r): Defines the rank of the low-rank matrices used in LoRA +- Target Modules: Specifies which layers should be fine-tuned with LoRA, Includes attention layers (q_proj, k_proj, v_proj, o_proj) and feedforward layers (gate_proj, up_proj, down_proj) +- LoRA Alpha (lora_alpha):Scaling factor for LoRA weights and A higher value makes the LoRA layers contribute more to the model's output +- LoRA Dropout: Dropout randomly disables connections to prevent overfitting +- Bias (bias): No additional bias parameters are trained (optimized for efficiency) +- Gradient Checkpointing: Optimized memory-saving method +- Random Seed: Ensures reproducibility across training runs +- Rank-Stabilized LoRA: Rank stabilization not used +- LoFTQ Quantization: No LoFTQ (Low-bit Quantization) applied +```python +model = FastLanguageModel.get_peft_model( + model, + r = 16, + target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj",], + lora_alpha = 16, + lora_dropout = 0, + bias = "none", + use_gradient_checkpointing = "unsloth", + random_state = 3407, + use_rslora = False, + loftq_config = None, +) +``` \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-4.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-4.md new file mode 100644 index 0000000000..ea63e43c22 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-4.md @@ -0,0 +1,75 @@ +--- +title: Fine Tuning Large Language Model - Prepare Dataset +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Fine Tuning Large Language Model - Prepare Dataset +This step prepares the dataset for fine-tuning by formatting it to match the LLaMA-3.1 chat template. + +###### Import Chat Template for Tokenizer +This imports the chat template functionality from Unsloth and It allows us to structure the dataset in a format that LLaMA-3.1 expects +```python +from unsloth.chat_templates import get_chat_template +``` + +###### Apply the Chat Template to Tokenizer +- Apply the Chat Template to Tokenizer. +- Ensures prompt formatting is consistent when training the model. +```python +tokenizer = get_chat_template( + tokenizer, + chat_template = "llama-3.1", +) + + +``` +###### Format Dataset Prompts +- Extracts the instruction column from the dataset. +- Applies the chat template formatting to each instruction. +- Returns a new dictionary with the formatted text. 
+```python +def formatting_prompts_func(examples): + convos = examples["instruction"] + texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos] + return { "text" : texts, } +pass +``` +###### Load the Dataset +- Loads a [customer support chatbot training dataset](https://huggingface.co/datasets/bitext/Bitext-customer-support-llm-chatbot-training-dataset) from Hugging Face +- The dataset contains example conversations with instructions for fine-tuning +- Loads the corresponding tokenizer for tokenizing inputs properly + +```python +from datasets import load_dataset +dataset = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset", split = "train") + +``` +![example image alt-text#center](2.png ) + +###### Import Standardization Function +- Imports standardize_sharegpt, a function that helps in structuring dataset inputs in a ShareGPT-like format (a commonly used format for LLM fine-tuning). +- Ensures that data follows a standardized format required for effective instruction tuning. +```python +from unsloth.chat_templates import standardize_sharegpt +``` +###### Define a Function to Format Dataset +- Extracts the instruction (input text) and response (output text) from the dataset. +- Stores them as "instruction_text" and "response_text". +```python +def formatting_prompts_func(examples): + return { "instruction_text": examples["instruction"], "response_text": examples["response"] } + +``` + +###### Apply Formatting to Dataset +- Applies formatting_prompts_func to every record in the dataset. +- Uses batch processing (batched=True) for efficiency. +```python +def formatting_prompts_func(examples): + return { "instruction_text": examples["instruction"], "response_text": examples["response"] } + +``` +![example image alt-text#center](3.png ) \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-5.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-5.md new file mode 100644 index 0000000000..46f8e38baf --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-5.md @@ -0,0 +1,117 @@ +--- +title: Fine Tuning Large Language Model - Configure and Initialize the Fine-Tuning +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Fine Tuning Large Language Model - Configure and Initialize the Fine-Tuning + +###### Configure and Initialize the Fine-Tuning Trainer +Sets up the Supervised Fine-Tuning (SFT) Trainer to train the model using the prepared dataset. The trainer manages training configurations, optimization, and logging +###### Import Necessary Libraries +- SFTTrainer (from trl) → Handles the fine-tuning process for LLMs using supervised fine-tuning (SFT). +- TrainingArguments (from transformers) → Defines training hyperparameters like batch size, learning rate, and logging. +- DataCollatorForSeq2Seq (from transformers) → Prepares batches of text data for training (handles padding, truncation). +- is_bfloat16_supported() (from unsloth) → Checks if the system supports bfloat16 (a mixed-precision format for optimized training). + +```python +from trl import SFTTrainer +from transformers import TrainingArguments, DataCollatorForSeq2Seq +from unsloth import is_bfloat16_supported + +``` + +###### Initialize the SFTTrainer +- Loads the Model & Tokenizer → Uses the pre-trained LLM and tokenizer. 
+- Specifies the Training Dataset → The dataset (dataset) prepared earlier is used for fine-tuning.
+- Sets Maximum Sequence Length → Defines max_seq_length, ensuring the model input size stays within the supported limit.
+- Uses Data Collator for Batching → DataCollatorForSeq2Seq dynamically pads and tokenizes text data.
+- Enables Multi-Processing (dataset_num_proc = 2) → Uses two parallel processes for faster data loading.
+- Packing (packing = False) → Disables sequence packing; enabling it can speed up training on short sequences.
+```python
+trainer = SFTTrainer(
+    model = model,
+    tokenizer = tokenizer,
+    train_dataset = dataset,
+    dataset_text_field = "instruction",
+    max_seq_length = max_seq_length,
+    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
+    dataset_num_proc = 2,
+    packing = False,
+)
+```
+###### Define Training Hyperparameters
+The hyperparameters below are collected in a TrainingArguments object; SFTTrainer accepts these settings through its args parameter.
+- Batch Size (per_device_train_batch_size = 2) → Uses a small batch size to fit within GPU memory.
+- Gradient Accumulation (gradient_accumulation_steps = 4) → Accumulates gradients over 4 steps before updating model weights.
+- Warmup Steps (warmup_steps = 5) → Gradually increases the learning rate in the initial steps to stabilize training.
+- Training Steps (max_steps = 60) → Runs for 60 optimization steps (adjustable for full training).
+- Learning Rate (learning_rate = 2e-4) → Sets a moderate learning rate for stable fine-tuning.
+- Mixed Precision (fp16 or bf16) → Uses bfloat16 if supported; otherwise, falls back to fp16 for efficient computation.
+- Logging (logging_steps = 1) → Logs training progress every step.
+- Optimizer (optim = "adamw_8bit") → Uses adamw_8bit, which is a memory-efficient optimizer.
+- Weight Decay (weight_decay = 0.01) → Adds regularization to prevent overfitting.
+- Learning Rate Scheduler (lr_scheduler_type = "linear") → Linearly decays the learning rate over time.
+- Random Seed (seed = 3407) → Ensures reproducibility of training results.
+- Output Directory (output_dir = "outputs") → Saves the trained model checkpoints in the "outputs" folder.
+
+```python
+args = TrainingArguments(
+    per_device_train_batch_size = 2,
+    gradient_accumulation_steps = 4,
+    warmup_steps = 5,
+    max_steps = 60,
+    learning_rate = 2e-4,
+    fp16 = not is_bfloat16_supported(),
+    bf16 = is_bfloat16_supported(),
+    logging_steps = 1,
+    optim = "adamw_8bit",
+    weight_decay = 0.01,
+    lr_scheduler_type = "linear",
+    seed = 3407,
+    output_dir = "outputs",
+)
+```
+
+###### Fine-Tuning on Responses Only
+Modifies the training approach so that the model learns only from responses rather than from both instructions and responses.
+
+###### Import Function to Modify Training
+- Loads the train_on_responses_only function from Unsloth's chat templates.
+```python
+from unsloth.chat_templates import train_on_responses_only
+```
+###### Apply train_on_responses_only to the Trainer
+- Modifies Trainer Behavior → Instead of training the model on full conversations, it now only learns from the assistant's responses.
+```python
+trainer = train_on_responses_only(
+    trainer,
+    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
+    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
+)
+```
+###### Inspecting Tokenized Data and Labels
+Inspect how the dataset has been tokenized and prepared for fine-tuning. This step checks how input sequences (prompts) and labels (expected model outputs) are formatted.
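+
+Before decoding an individual sample, you can optionally list the columns the processed training dataset now contains. This is a small illustrative check (the exact column names depend on the trainer configuration):
+
+```python
+# Inspect one processed sample; you would typically expect input_ids, attention_mask and labels here.
+print(list(trainer.train_dataset[5].keys()))
+```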
+
+###### Decode a Sample Training Input
+- Extracts the tokenized input sequence from the dataset (trainer.train_dataset[5]["input_ids"]).
+- Decodes it back into human-readable text using the tokenizer.
+- This helps verify how instructions and responses were tokenized.
+```python
+tokenizer.decode(trainer.train_dataset[5]["input_ids"])
+space = tokenizer(" ", add_special_tokens = False).input_ids[0]
+tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])
+```
+###### Training the Model
+Initiates the training process using the trainer object, which has been configured with the model, dataset, optimizer, and training parameters.
+- Create an account in [Weights & Biases](https://wandb.ai/)
+- Log in to [Weights & Biases](https://wandb.ai/), or run the [W&B server locally](https://wandb.me/wandb-server)
+- You can locate your [API key](https://wandb.ai/authorize) in your browser at this link
+- Paste the API key from your profile when prompted and press Enter
+```python
+trainer_stats = trainer.train()
+```
\ No newline at end of file
diff --git a/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-6.md b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-6.md
new file mode 100644
index 0000000000..9e2fe08616
--- /dev/null
+++ b/content/learning-paths/embedded-and-microcontrollers/llm-fine-tuning-for-web-applications/how-to-6.md
@@ -0,0 +1,93 @@
+---
+title: Fine Tuning Large Language Model - Running Inference
+weight: 7
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## Fine Tuning Large Language Model - Running Inference
+
+###### Import Chat Template
+- This function provides a predefined chat format suitable for Llama 3.1.
+- Ensures that prompts are structured correctly for inference.
+
+```python
+from unsloth.chat_templates import get_chat_template
+```
+
+###### Apply Chat Template to the Tokenizer
+- Updates the tokenizer with the Llama 3.1 chat template.
+- Ensures the input messages are formatted according to Llama 3.1's expected structure.
+
+```python
+tokenizer = get_chat_template(
+    tokenizer,
+    chat_template="llama-3.1",
+)
+```
+###### Enable Faster Inference
+- Optimizes the model for low-latency inference.
+- Uses Unsloth's performance improvements to speed up text generation.
+
+```python
+FastLanguageModel.for_inference(model)
+```
+
+###### Define Input Messages
+- Defines a conversation in a structured format, where each message is a dictionary with a role and content.
+```python
+messages = [
+    {"role": "user", "content": "I have a question about cancelling order {{Order Number}}"},
+]
+```
+###### Tokenize Input Messages
+- Converts the messages into tokens.
+- The apply_chat_template() function ensures the model receives the correct chat format.
+- tokenize=True: Converts text into numerical token IDs.
+- add_generation_prompt=True: Appends the assistant header so the model knows a response is expected.
+- return_tensors="pt": Converts input into PyTorch tensors.
+- .to("cuda"): Moves data to the GPU for faster processing.
+```python
+inputs = tokenizer.apply_chat_template(
+    messages,
+    tokenize=True,
+    add_generation_prompt=True, # Must add for generation
+    return_tensors="pt",
+).to("cuda")
+```
+###### Generate Model Output
+- Generates text based on the input.
+- max_new_tokens=64: Limits output length to 64 tokens.
+- use_cache=True: Speeds up generation by caching intermediate results.
+- temperature=1.5: Increases randomness in output (higher value = more diverse text).
+- min_p=0.1: Controls token probability threshold (avoids unlikely tokens). +```python +outputs = model.generate(input_ids=inputs, max_new_tokens=64, use_cache=True, temperature=1.5, min_p=0.1) +``` +###### Decode the Generated Output +- Converts tokenized output back into human-readable text + +```python +tokenizer.batch_decode(outputs) +``` + +###### Save the LoRA Model Locally +- Saves the model (including its LoRA weights) to a local directory + +```python +model.save_pretrained("finetune_webbot") +``` +###### Save the Tokenizer Locally +- Saves the tokenizer configuration into the "finetune_webbot" directory + +```python +tokenizer.save_pretrained("finetune_webbot") +``` +After saving the fine-tuned model, you can integrate it into your web application for real-time inference. Load the model and tokenizer, deploy them using an API (e.g., FastAPI, Flask), and serve responses based on user inputs. This enables seamless AI-powered interactions in your application. \ No newline at end of file diff --git a/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/Build_the_MNN_Android_Demo_with_GUI.md b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/Build_the_MNN_Android_Demo_with_GUI.md new file mode 100644 index 0000000000..69ce8890af --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/Build_the_MNN_Android_Demo_with_GUI.md @@ -0,0 +1,101 @@ +--- +title: Build the MNN Android Demo with GUI +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Set up development environment +In this learning path, you will learn how to build and deploy a Vision Transformer(ViT) chat app to an Android device using MNN-LLM. You will learn how to build the MNN-LLM and how to run the Qwen model for the Android application. + +The first step is to prepare a development environment with the required software: + +- Android Studio (latest version recommended) +- Android NDK (tested with version 28.0.12916984) +- CMake (4.0.0-rc1) +- Python3 (Optional) +- Git + +## Clone MNN repo +Open up a Windows PowerShell or Git Bash and checkout the source tree: + +```shell +cd C:\Users\$env:USERNAME +git clone https://github.com/HenryDen/MNN.git +cd MNN +git checkout 83b650fc8888d7ccd38dbc68330a87d048b9fe7a +``` + +{{% notice Note %}} +The app code is currently not merged into the MNN repo. The repo above is a fork from the MNN. +{{% /notice %}} + +## Build the app using Android Studio + +Create a signing.gradle file at android/app with the following template: +```shell +ext{ + signingConfigs = [ + release: [ + storeFile: file('PATH_TO_jks_file'), + storePassword: "****", + keyAlias: "****", + keyPassword: "****" + ] + ] +} +``` + +If you don't need to compile a release version of the app, you can skip the following step of creating a sign file and write anything in the signing.gradle. + +- Navigate to **Build -> Generate Signed App Bundle or APK**. +- Select **APK** and click **next**. +- Press **Create new** and fill in the information.. +- Fill in the information of the newly generated JKS file in the template above. + +Open the MNN/transformers/llm/engine/android directory with Android Studio and wait for the Gradle project sync to finish. 
+
+## Prepare the model
+You can download the model from ModelScope: https://www.modelscope.cn/models/qwen/qwen2-vl-2b-instruct
+
+Or from Hugging Face: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct
+
+If you want to test other vision transformer models, you can download them from https://modelscope.cn/organization/qwen?tab=model and convert them to MNN format.
+
+```shell
+# make sure git lfs is installed
+$ git lfs install
+$ git clone https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct
+# install llm-export
+$ git clone https://github.com/wangzhaode/llm-export && cd llm-export/
+$ pip install .
+# convert the model to MNN format
+$ llmexport --path /path/to/mnn-llm/Qwen2-VL-2B-Instruct/ --export mnn --quant_bit 4 --quant_block 0 --dst_path Qwen2-VL-2B-Instruct-convert-4bit-per_channel --sym
+```
+
+- --quant_bit: the quantization bit width; for example, 4 selects 4-bit (q4) quantization.
+- --quant_block: the quantization block size; 0 selects per-channel quantization, 128 selects per-block quantization with a block size of 128.
+- --sym: enables symmetric quantization.
+
+## Build and run the app
+Before launching the app, you need to push the model onto the device manually:
+
+```shell
+$ adb shell mkdir /data/local/tmp/models/
+# push the model folder you downloaded or converted, for example Qwen2-VL-2B-Instruct-convert-4bit-per_channel
+$ adb push <model-directory> /data/local/tmp/models
+```
+
+When you select Run, the build is executed, and the app is then copied to and installed on the Android device.
+
+After opening the app, you will see:
+
+![Loading screenshot](Loading_page.png)
+
+After the model is loaded, you can chat with the app:
+
+![Chat screenshot](chat2.png)
+
+
+
diff --git a/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/Build_the_MNN_Command-line_ViT_Demo.md b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/Build_the_MNN_Command-line_ViT_Demo.md
new file mode 100644
index 0000000000..1a4cd3491c
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/Build_the_MNN_Command-line_ViT_Demo.md
@@ -0,0 +1,75 @@
+---
+title: Build the MNN Command-line ViT Demo
+weight: 4
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+## Set up development environment
+In this learning path, you will learn how to build and deploy a Vision Transformer (ViT) chat command-line demo to an Android device using MNN-LLM. You will learn how to cross-compile MNN-LLM and how to run the Qwen model from the command line on Android.
+
+The first step is to prepare a development environment with the required software:
+
+- Ubuntu Linux (20.04 or higher)
+- Android NDK (tested with version 28.0.12916984)
+- CMake (4.0.0-rc1)
+- Python3 (Optional)
+- Git
+
+## Build and run command-line demo
+
+Push the model to the device. Obtaining the model is described on the previous page.
+```shell +$ adb shell mkdir /data/local/tmp/models/ +$ adb push /data/local/tmp/models +``` + +```shell +# Download a ndk file from https://developer.android.com/ndk/downloads/ +$ upzip android-ndk-r27d-linux.zip +$ export ANDROID_NDK=./android-ndk-r27d-linux/ + +$ git clone https://github.com/alibaba/MNN.git +% cd MNN/project/android +$ mkdir build_64 && cd build_64 +$ ../build_64.sh "-DMNN_LOW_MEMORY=true -DLLM_SUPPORT_VISION=true -DMNN_KLEIDIAI=true -DMNN_CPU_WEIGHT_DEQUANT_GEMM=true -DMNN_BUILD_LLM=true -DMNN_SUPPORT_TRANSFORMER_FUSE=true -DMNN_ARM82=true -DMNN_OPENCL=true -DMNN_USE_LOGCAT=true -DMNN_IMGCODECS=true -DMNN_BUILD_OPENCV=true" +$ adb push *so llm_demo tools/cv/*so /data/local/tmp/ +$ adb shell +``` + +Here switch to android adb shell environment. + +```shell +$ cd /data/local/tmp/ +$ chmod +x llm_demo +$ export LD_LIBRARY_PATH=./ +# ./example.png get your image here +$ echo " ./example.pngDescribe the content of the image." >prompt +$ ./llm_demo models/Qwen-VL-2B-convert-4bit-per_channel/config.json prompt +``` + +Here is an example image: + +![example image](example.png) + +If the launch is success, you can see the output + +```shell +config path is models/Qwen-VL-2B-convert-4bit-per_channel/config.json +tokenizer_type = 3 +prompt file is prompt +The image features a tiger standing in a grassy field, with its front paws raised and its eyes fixed on something or someone behind it. The tiger's stripes are clearly visible against the golden-brown background of the grass. The tiger appears to be alert and ready for action, possibly indicating a moment of tension or anticipation in the scene. + +################################# +prompt tokens num = 243 +decode tokens num = 70 + vision time = 5.96 s + audio time = 0.00 s +prefill time = 1.80 s + decode time = 2.09 s +prefill speed = 135.29 tok/s + decode speed = 33.53 tok/s +################################## +``` + diff --git a/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/Loading_page.png b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/Loading_page.png new file mode 100644 index 0000000000..db0a530ff4 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/Loading_page.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/_index.md b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/_index.md new file mode 100644 index 0000000000..8ff232fa5f --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/_index.md @@ -0,0 +1,57 @@ +--- +title: Vision LLM inference on Android with KleidiAI and MNN + +draft: true +cascade: + draft: true + +minutes_to_complete: 30 + +who_is_this_for: This is an advanced topic for Android developers who want to efficiently run Vision-Transformer(ViT) on android device. + +learning_objectives: + - Run Vision-Transformer inference on an Android device with the Qwen Vision 2B model using the MNN inference framework. + - Download and Convert a Qwen Vision model from Hugging Face. + +prerequisites: + - A x86_64 development machine with Android Studio installed. + - A 64-bit Arm powered smartphone running Android with i8mm/dotprod supported. 
+ +author: Shuheng Deng,Arm + +### Tags +skilllevels: Introductory +subjects: ML +armips: + - Cortex-A + - Cortex-X +tools_software_languages: + - Android Studio + - KleidiAI +operatingsystems: + - Android + + + +further_reading: + - resource: + title: "MNN : A UNIVERSAL AND EFFICIENT INFERENCE ENGINE" + link: https://arxiv.org/pdf/2002.12418 + type: documentation + - resource: + title: MNN-Doc + link: https://mnn-docs.readthedocs.io/en/latest/ + type: blog + - resource: + title: Vision transformer + link: https://en.wikipedia.org/wiki/Vision_transformer + type: website + + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/_next-steps.md b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/background.md b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/background.md new file mode 100644 index 0000000000..a6560947a5 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/background.md @@ -0,0 +1,28 @@ +--- +title: Background +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## MNN Introduction +MNN is a highly efficient and lightweight deep learning framework. It supports inference and training of deep learning models and has industry-leading performance for inference and training on-device. At present, MNN has been integrated into more than 30 apps of Alibaba Inc, such as Taobao, Tmall, Youku, DingTalk, Xianyu, etc., covering more than 70 usage scenarios such as live broadcast, short video capture, search recommendation, product searching by image, interactive marketing, equity distribution, security risk control. In addition, MNN is also used on embedded devices, such as IoT. + +MNN-LLM is a large language model runtime solution developed based on the MNN engine. The mission of this project is to deploy LLM models locally on everyone's platforms(Mobile Phone/PC/IOT). It supports popular large language models such as Qianwen, Baichuan, Zhipu, LLAMA, and others. + +KleidiAI is currently integrated into the MNN framework, enhancing the inference performance of large language models (LLMs) within MNN. 
The Android app on this page demonstrates Vision Transformer inference using the MNN framework, accelerated by KleidiAI. + +## Vision Transformer(ViT) +The Vision Transformer (ViT) is a deep learning model designed for image recognition tasks. Unlike traditional convolutional neural networks (CNNs), which process images using convolutional layers, ViT leverages the transformer architecture originally developed for natural language processing (NLP). +The Vit workflow contains: + +- **Image Patching** : The input image is divided into fixed-size patches, similar to how text is tokenized in NLP tasks. +- **Linear Embedding** : Each image patch is flattened and linearly embedded into a vector. +- **Position Encoding** : Positional information is added to the patch embeddings to retain spatial information. +- **Transformer Encoder** : The embedded patches are fed into a standard transformer encoder, which uses self-attention mechanisms to process the patches and capture relationships between them. +- **Classification** : The output of the transformer encoder is used for image classification or other vision tasks. + +ViT has shown competitive performance on various image classification benchmarks and has been widely adopted in computer vision research + + diff --git a/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/chat2.png b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/chat2.png new file mode 100644 index 0000000000..661c0e09c2 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/chat2.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/chat_page.png b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/chat_page.png new file mode 100644 index 0000000000..a4bdb2d947 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/chat_page.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/example-picture.png b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/example-picture.png new file mode 100644 index 0000000000..c69844bed4 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/example-picture.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/example.png b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/example.png new file mode 100644 index 0000000000..72be3d02de Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/Vision-LLM-inference-on-Android-with-KleidiAI-and-MNN/example.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/ams/_index.md b/content/learning-paths/mobile-graphics-and-gaming/ams/_index.md index b8dcffd8bd..2dedfb83f8 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/ams/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/ams/_index.md @@ -71,6 +71,10 @@ further_reading: title: Integrate Arm Performance Studio into a CI workflow link: 
https://developer.arm.com/documentation/102543 type: documentation + - resource: + title: RenderDoc Reference Guide + link: https://renderdoc.org/docs/index.html + type: documentation ### FIXED, DO NOT MODIFY diff --git a/content/learning-paths/mobile-graphics-and-gaming/ams/ga.md b/content/learning-paths/mobile-graphics-and-gaming/ams/ga.md index d66809b886..7e6e399411 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/ams/ga.md +++ b/content/learning-paths/mobile-graphics-and-gaming/ams/ga.md @@ -11,6 +11,8 @@ Graphics Analyzer is a tool to help `OpenGL ES` and `Vulkan` developers get the The tool allows you to observe API call arguments and return values, and interact with a running target application to investigate the effect of individual API calls. It highlights attempted misuse of the API, and gives recommendations for improvements.
+**Note:** Graphics Analyzer is no longer in active development. You can still get Graphics Analyzer as part of [Arm Performance Studio 2024.2](https://artifacts.tools.arm.com/arm-performance-studio/2024.2/), but it is no longer available in later versions of the suite. For a more lightweight tool, try [Frame Advisor](https://developer.arm.com/Tools%20and%20Software/Frame%20Advisor), which enables you to capture and analyze rendering and geometry data for a single frame. For graphics debugging, we recommend RenderDoc for Arm GPUs. Both tools are available for free as part of [Arm Performance Studio](https://developer.arm.com/Tools%20and%20Software/Arm%20Performance%20Studio).
+ ## Prerequisites Build your application, and setup your Android device as described in [Setup tasks](/learning-paths/mobile-graphics-and-gaming/ams/setup_tasks/). diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/01-what-is-arm-asr.md b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/01-what-is-arm-asr.md new file mode 100644 index 0000000000..9b91645fe4 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/01-what-is-arm-asr.md @@ -0,0 +1,50 @@ +--- +title: What is Arm Accuracy Super Resolution (Arm ASR)? +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- +
+## Introduction +
+[Arm® Accuracy Super Resolution™ (Arm ASR)](https://www.arm.com/developer-hub/mobile-graphics-and-gaming/accuracy-super-resolution) is a mobile-optimized temporal upscaling technique derived from [AMD's Fidelity Super Resolution 2 v2.2.2](https://github.com/GPUOpen-LibrariesAndSDKs/FidelityFX-SDK/blob/main/docs/techniques/super-resolution-temporal.md). Arm ASR extends this technology with multiple optimizations to make the technique suited for the more resource-constrained environment of mobile gaming.
+
+Arm ASR is available as an easy plug-in for Unreal Engine 5.3, 5.4, and 5.5, with a Unity plugin coming soon, or as a generic library that you can integrate into other engines. With it, you can easily improve frames per second, enhance visual quality, and prevent thermal throttling for smoother, longer gameplay.
+
+## What is Super Resolution? +
+Super-resolution techniques render frames at a lower resolution and use shader upscaling to reconstruct how the frames should look at native resolution. This offers significant performance and battery life improvements for mobile devices.
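As a rough illustration of the resolution trade-off (a minimal sketch, not Arm ASR API code; the resolutions and factor below are example values only), a 2x upscaling ratio means each frame is rendered at half the display resolution on each axis and then reconstructed:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t displayWidth = 2800, displayHeight = 1260; // example target resolution
    const float upscaleFactor = 2.0f;                         // render at half size per axis

    // The upscaler reconstructs a display-resolution frame from this smaller render.
    const uint32_t renderWidth  = static_cast<uint32_t>(displayWidth / upscaleFactor);
    const uint32_t renderHeight = static_cast<uint32_t>(displayHeight / upscaleFactor);

    std::printf("Render at %ux%u, upscale to %ux%u\n",
                renderWidth, renderHeight, displayWidth, displayHeight);
    return 0;
}
```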
+ +Arm ASR outperforms spatial upscalers when reconstructing fine details, such as: + +- Thin features +- Grid-like structures +- Fast-moving objects + +You can control a range of different settings for Arm ASR: + +- The upscaling ratio. For example, a value of 50.0 will mean that the plugin upscales frames by a factor of 2. +- Use Arm ASR’s own auto-exposure or use the game engine’s auto-exposure value. +- Use a Robust Contrast Adaptive Sharpening (RCAS) filter to sharpen the output image. +- The shader quality preset: 1 - Quality, 2 - Balanced, 3 - Performance. + +## Overview of Arm ASR + +The [Arm ASR experience kit](https://github.com/arm/accuracy-super-resolution) is a combination of materials that provide access to the technology, to help you evaluate it and make the best use of it. It includes: + +- The Arm ASR source code so developers can access it fully and even evolve the technology for their needs. +- Tutorials and sample materials to help developers with the integration of the technology and how to use it. +- Plugin for Unreal Engine. + +## Unreal Engine Plugin + +The Unreal Engine 5 plugin can be integrated into your project in a matter of minutes. Once installed, simply enable temporal upscaling on your project and the plugin will automatically handle the upscaling of all frames. + +[Using Arm ASR in Unreal Engine](../03-ue) + +## Custom engine usage + +If you are using your own custom engine, you can still integrate Arm ASR using our generic library. + +[Using Arm ASR in a custom engine](../04-generic_library) diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/02-ue.md b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/02-ue.md new file mode 100644 index 0000000000..c9be6e618f --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/02-ue.md @@ -0,0 +1,115 @@ +--- +title: Using Arm ASR in Unreal Engine +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + + +## Introduction + +This guide describes how to get started with Arm® Accuracy Super Resolution™ (Arm ASR) through an example project in Unreal Engine. It is for Unreal Engine developers who want to apply upscaling techniques to their projects. You will walk through the processes of installing Arm ASR and some of the common tasks that you might encounter when setting up Arm ASR for the first time. + +## Before you begin + +We recommend using Unreal Engine versions 5.3-5.5 through this tutorial. Please get in contact with us at arm-asr-support@arm.com if you have any questions. + +## Installing the Arm ASR plugin + +The following steps describe how to install the Arm ASR plugin in Unreal Engine: + +1. Open the Unreal Engine project you intend to use with Arm ASR. The Third Person pack is available as an example. + + ![Third person pack](images/third_person_pack.png "Third person pack") + +2. Download the plugin by cloning the repository. + + ``` + git clone https://github.com/arm/accuracy-super-resolution-for-unreal + ``` + +3. Navigate to the `UE` directory in the cloned repository. This contains directories containing the plugin for each supported version of Unreal Engine. + +4. From the directory for your version of Unreal Engine, copy the Arm ASR plugin into the `Plugins` folder in the game directory. + + ![Plugin folder](images/plugin_folder.png "Plugin folder") + + ![Copied Arm ASR plugin](images/copied_arm_asr_plugin.png "Copied Arm ASR plugin") + +5. Navigate back to your Unreal Engine project. 
+ +6. When the pop-up window opens asking to build Arm ASR, select **Yes**. + + ![Arm ASR pop up window](images/asr_popup.png "Arm ASR pop up window") + +## Enable and configure Arm ASR + +After reopening the Unreal Engine project, the Arm ASR plugin should be enabled. + +1. Go to **Edit > Plugins**, and search for Arm ASR. Make sure Arm ASR is checked. + + ![Verify the plugin](images/verify_plugin.png "Verify the plugin") + + ![Plugin screen](images/plugin_screen.png "Plugin screen") + +1. To enable Arm ASR upscaling, open **Project Settings** and change the Anti-Aliasing Method to **Temporal Anti-Aliasing**. + + {{% notice %}} + **Mobile Anti-Aliasing Method** is used for the mobile renderer, however this tutorial describes the desktop renderer. You can learn about other available renderers by studying the `README.md` of the `accuracy-super-resolution-for-unreal` repository. + {{% /notice %}} + + ![Change anti-aliasing method](images/change_anti_aliasing_method.png "Change anti-aliasing method") + +1. To check that Arm ASR is enabled and working, use the `ShowFlag.VisualizeTemporalUpscaler 1` console command. If Arm ASR is working, you will see Arm ASR listed as the **ThirdParty TemporalUpscaler** in the command window. + + {{% notice %}} + When you use the `ShowFlag.VisualizeTemporalUpscaler 1` console command, the debug views are not generated from Arm ASR but from Unreal Engines TAA. + {{% /notice %}} + + ![Visualize console command](images/visualise_console_command.png "Visualize console command") + +1. If Arm ASR is not shown here, use the `r.ArmASR.Enable 1` console command to enable upscaling. + + ![Arm ASR.Enable 1 command](images/arm_asr_enable_command.png "ArmASR.Enable 1 command") + +## Further configuration {.section} + +There are two ways to configure further settings. + +Option 1 is using the UI. Go to **Edit > Project Settings > Plugins > Arm ASR**. + +![Configure settings using UI](images/ui_settings.png "Configure settings using UI") + +Option 2 is through the console. By typing `r.ArmASR.` into the console window, you can see the configuration options. + +![Configuration options](images/configuration_options.png "Configuration options") + +For example, you can change the upscaling ratio by modifying the **ScreenPercentage**, via the command option `r.ScreenPercentage`. A value of `50.0` will mean that the plugin upscales frames by a factor of 2. + +Arm ASR's behavior can be configured via the following plugin-specific console variables: + +|Console Variable | Default Value | Value Range | Details | +|:------------------------------------------------- | :------------ | :---------- | :----------------------------------------------------------------------------------------------------- | +|`r.ArmASR.Enable` | 1 | 0, 1 | Enable / disable Arm ASR. | +|`r.ArmASR.AutoExposure` | 0 | 0, 1 | Set to 1 to use Arm ASR’s own auto-exposure, otherwise the engine’s auto-exposure value is used. | +|`r.ArmASR.Sharpness` | 0 | 0-1 | If greater than 0 this enables Robust Contrast Adaptive Sharpening Filter to sharpen the output image. | +|`r.ArmASR.ShaderQuality` | 1 | 1, 2, 3 | Select shader quality preset: 1 - Quality, 2 - Balanced, 3 - Performance. | +|`r.ArmASR.CreateReactiveMask` | 1 | 0-1 | Create the reactive mask. | +|`r.ArmASR.ReactiveMaskReflectionScale` | 0.4 | 0-1 | Range from 0.0 to 1.0, scales the Unreal Engine reflection contribution to the reactive mask, which can be used to control the amount of aliasing on reflective surfaces. 
| +|`r.ArmASR.ReactiveMaskRoughnessScale` | 0.15 | 0-1 | Range from 0.0 to 1.0, scales the GBuffer roughness to provide a fallback value for the reactive mask when screenspace & planar reflections are disabled or don't affect a pixel. | +|`r.ArmASR.ReactiveMaskRoughnessBias` | 0.25 | 0-1 | Range from 0.0 to 1.0, biases the reactive mask value when screenspace/planar reflections are weak with the GBuffer roughness to account for reflection environment captures. | +|`r.ArmASR.ReactiveMaskRoughnessMaxDistance` | 6000 | - | Maximum distance in world units for using material roughness to contribute to the reactive mask, the maximum of this value and View.FurthestReflectionCaptureDistance will be used. | +|`r.ArmASR.ReactiveMaskRoughnessForceMaxDistance` | 0 | - | Enable to force the maximum distance in world units for using material roughness to contribute to the reactive mask rather than using View.FurthestReflectionCaptureDistance. | +|`r.ArmASR.ReactiveMaskReflectionLumaBias` | 0 | 0-1 | Range from 0.0 to 1.0, biases the reactive mask by the luminance of the reflection. Use to balance aliasing against ghosting on brightly lit reflective surfaces. | +|`r.ArmASR.ReactiveHistoryTranslucencyBias` | 0.5 | 0-1 | Range from 0.0 to 1.0, scales how much translucency suppresses history via the reactive mask. Higher values will make translucent materials more reactive which can reduce smearing. | +|`r.ArmASR.ReactiveHistoryTranslucencyLumaBias` | 0 | 0-1 | Range from 0.0 to 1.0, biases how much the translucency suppresses history via the reactive mask by the luminance of the transparency. Higher values will make bright translucent materials more reactive which can reduce smearing. | +|`r.ArmASR.ReactiveMaskTranslucencyBias` | 1 | 0-1 | Range from 0.0 to 1.0, scales how much contribution translucency makes to the reactive mask. Higher values will make translucent materials more reactive which can reduce smearing. | +|`r.ArmASR.ReactiveMaskTranslucencyLumaBias` | 0 | 0-1 | Range from 0.0 to 1.0, biases the translucency contribution to the reactive mask by the luminance of the transparency. Higher values will make bright translucent materials more reactive which can reduce smearing. | +|`r.ArmASR.ReactiveMaskTranslucencyMaxDistance` | 500000 | - | Maximum distance in world units for using translucency to contribute to the reactive mask. This is a way to remove sky-boxes and other back-planes from the reactive mask, at the expense of nearer translucency not being reactive. | +|`r.ArmASR.ReactiveMaskForceReactiveMaterialValue` | 0 | 0, 1 | Force the reactive mask value for Reactive Shading Model materials, when > 0 this value can be used to override the value supplied in the Material Graph. | +|`r.ArmASR.ReactiveMaskReactiveShadingModelID` | MSM_NUM | - | Treat the specified shading model as reactive, taking the CustomData0.x value as the reactive value to write into the mask. | + +## Next steps + +You are now ready to use Arm ASR in your Unreal Engine projects. You can use [Arm Performance Studio](https://developer.arm.com/Tools%20and%20Software/Arm%20Performance%20Studio) tools to measure the performance of your game as it runs on a mobile device, allowing you to monitor the effect of Arm ASR. 
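Before you move on, if you want to experiment with a combination of these settings quickly, you can enter the console variables from the table above directly in the Unreal Engine console. The values below are only an illustrative configuration, not a recommendation:

```
r.ArmASR.Enable 1
r.ArmASR.ShaderQuality 2
r.ArmASR.Sharpness 0.5
r.ScreenPercentage 50
```

Console variables can also be persisted in your project's configuration, for example under `[SystemSettings]` in `DefaultEngine.ini`, although the exact setup depends on your project.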
diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/04-generic_library.md b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/04-generic_library.md new file mode 100644 index 0000000000..3351e154fb --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/04-generic_library.md @@ -0,0 +1,381 @@ +--- +title: Using Arm ASR in a custom engine using the Generic Library +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- +
+## Introduction +
+Use the following steps to implement **Arm Accuracy Super Resolution (Arm ASR)** in your own custom engine. Arm ASR is an optimized version of [Fidelity Super Resolution 2](https://github.com/GPUOpen-LibrariesAndSDKs/FidelityFX-SDK/blob/main/docs/techniques/super-resolution-temporal.md) (FSR2) that has been heavily modified with mobile-oriented optimizations to make the technique suitable for mobile GPUs.
+
+There are two ways you can integrate Arm ASR into your custom engine: +
+- [Quick integration](#quick-integration) - using the built-in standalone backend. +- [Tight integration](#tight-integration) - using your own backend/renderer. +
+Then refer to the following sections to learn how to configure Arm ASR: +
+- [Quality presets](#quality-presets) +- [Performance](#performance) +- [Shader variants and Extensions](#shader-variants-and-extensions) +- [Input resources](#input-resources) +- [Providing motion vectors](#providing-motion-vectors) +- [Reactive mask](#reactive-mask) +- [Automatically generating reactivity](#automatically-generating-reactivity) +- [Modular backend](#modular-backend) +- [Camera jitter](#camera-jitter) +- [Camera jump cuts](#camera-jump-cuts) +- [Mipmap biasing](#mipmap-biasing) +- [Frame Time Delta Input](#frame-time-delta-input) +- [HDR support](#hdr-support) +- [API Debug Checker](#debug-checker) +- [Extended ffx_shader_compiler](#extended-ffx_shader_compiler) +- [Generate prebuilt shaders](#generate-prebuilt-shaders) +
+## Get the Arm ASR package +
+1. Get the Arm ASR package from GitHub: + + ``` + git clone https://github.com/arm/accuracy-super-resolution-generic-library + ``` +
+2. For the purposes of this tutorial, we will set a variable to identify the location of the Arm ASR package. Change into the cloned repository and set the variable; this path will be used to refer to files in the repository throughout this learning path. + + ``` + cd accuracy-super-resolution-generic-library + export ARMASR_DIR=$(pwd) + ``` +
+## Quick integration +
+To quickly integrate Arm ASR, which means the built-in standalone backend is used, follow the steps below: +
+1. Copy the **Arm_ASR** directory into your project, and add **Arm_ASR/src/backends/shared/blob_accessors/prebuilt_shaders** in the include path if you want to use prebuilt shaders. +
+2. Include the following header files in your codebase where you wish to interact with the technique: + + - `$ARMASR_DIR/include/host/ffxm_fsr2.h` + - `$ARMASR_DIR/include/host/backends/vk/ffxm_vk.h` +
+3. Create a Vulkan backend. + - Allocate a Vulkan scratch buffer of the size returned by `ffxmGetScratchMemorySizeVK` in `$ARMASR_DIR/include/host/backends/vk/ffxm_vk.h`. + - Create `FfxmDevice` via `ffxmGetDeviceVK` in `$ARMASR_DIR/include/host/backends/vk/ffxm_vk.h`. + - Create `FfxmInterface` by calling `ffxmGetInterfaceVK` in `$ARMASR_DIR/include/host/backends/vk/ffxm_vk.h`. +
+4. Create a context by calling `ffxmFsr2ContextCreate` in `$ARMASR_DIR/include/host/ffxm_fsr2.h`. The parameters structure should be filled out matching the configuration of your application. +
+5. 
Each frame, call `ffxmFsr2ContextDispatch` (declared in `$ARMASR_DIR/include/host/ffxm_fsr2.h`) to record and execute the technique's workloads. The parameters structure should be filled out matching the configuration of your application. +
+6. When your application is terminating (or you wish to destroy the context for another reason), call `ffxmFsr2ContextDestroy`, accessed via `$ARMASR_DIR/include/host/ffxm_fsr2.h`. The GPU should be idle before calling this function. +
+7. Sub-pixel jittering should be applied to your application's projection matrix. This should be done when performing the main rendering of your application. You should use the `ffxmFsr2GetJitterOffset` function accessed via `$ARMASR_DIR/include/host/ffxm_fsr2.h` to compute the precise jitter offsets. +
+8. A global mip bias should be applied when texturing. Applying a negative mipmap bias will typically generate an upscaled image with better texture detail. We recommend applying the following formula to your mipmap bias: + + ``` cpp + mipBias = log2(renderResolution/displayResolution) - 1.0; + ``` +
+9. For the best upscaling quality it is strongly advised that you populate the reactive mask according to our guidelines. You can also use `ffxmFsr2ContextGenerateReactiveMask`, accessed via `$ARMASR_DIR/include/host/ffxm_fsr2.h`, as a starting point. +
+10. Finally, link the two built libraries (**Arm_ASR_api** and **Arm_ASR_backend**). +
+## Tight integration +
+If you wish to use your own backend/renderer, a tight integration with your engine is required. For this, a similar process to the [quick integration](#quick-integration) described above is required, but with the added requirement to fill the `FfxmInterface`, accessed via `$ARMASR_DIR/include/host/ffxm_interface.h`, with functions implemented on your end. +
+In this approach the shaders are expected to be built by the engine. Arm ASR's shaders have been micro-optimized to use explicit 16-bit floating-point types. It is therefore advisable that the shaders are built using such types. For example, `min16float` is used in High-Level Shading Language (HLSL) and `float16_t` in OpenGL Shading Language (GLSL). If you are using HLSL, define the following symbol with a value of `1`: +
+```cpp +#define FFXM_HLSL_6_2 1 +``` +
+The `FFXM_HALF` symbol is enabled by default in the provided shader sources. +
+1. Include the following header in your codebase: + + - `$ARMASR_DIR/include/host/ffxm_interface.h` +
+2. Implement your own functions (assume the names are `xxxGetInterfacexxx`, `xxxGetScratchMemorySizexxx`) and callbacks in `FfxmInterface` in `$ARMASR_DIR/include/host/ffxm_interface.h` to link Arm ASR with the engine's renderer. +
+3. Create your own backend by calling `xxxGetInterfacexxx`. A scratch buffer should be allocated of the size returned by calling `xxxGetScratchMemorySizexxx` and the pointer to that buffer passed to `xxxGetInterfacexxx`. +
+4. Now, you can follow the same steps from the quick integration instructions above, starting from step 4, creating an Arm ASR context. In the final step it is only necessary to link the **Arm_ASR_api** library. +
+## Integration Guidelines +
+The following sections list additional details for integrating Arm ASR. +
+{{% notice %}} +The `FfxmFsr2ContextDescription` from `$ARMASR_DIR/include/host/ffxm_fsr2.h` is referenced multiple times throughout the Integration Guidelines. When the guidelines refer to modifying those bits, configure the `flags` field of this structure by setting values from `FfxmFsr2InitializationFlagBits`. 
+{{% /notice %}} + +### HLSL-based workflows + +In an HLSL-based workflow using DirectX Shader Compiler to cross-compile to SPIR-V do the following: + +- Use the following flags when building: + + ``` + -fspv-target-env=vulkan1.1spirv1.4 -enable-16bit-types + ``` + +- The extension **VK_KHR_shader_float16_int8** should be used at runtime. + +## Quality presets + +The Arm ASR API provides a set of shader quality presets, to select a version of the technique that balances quality and performance: + +| Preset | Description | +|------------|-------------| +| **Quality** | An optimized version of FSR2 that maintains the same image quality as the original technique. | +| **Balanced** | Provides significant bandwidth savings and performance uplift while maintaining image quality close to the **Quality** preset. | +| **Performance** | A more aggressive preset that offers the highest performance with some quality sacrifices. | + +When creating a context, a `FfxmFsr2ShaderQualityMode` accessed via `$ARMASR_DIR/include/host/ffxm_fsr2.h` needs to be provided as part of the input settings in `FfxmFsr2ContextDescription`. + +## Upscaling ratios + +To enhance flexibility when using the technique, developers can specify both a shader quality preset and an upscaling ratio. They can select any combination of **FfxmFsr2ShaderQualityMode** and **FfxmFsr2UpscalingRatio** according to their requirements to adjust the balance between quality and performance of the application. + + +A couple of utilities are available to determine the source resolution the frame should use for rendering before upscaling. This calculation is based on the desired upscaling ratio, defined by `FfxmFsr2UpscalingRatio`. You can find this definition in `$ARMASR_DIR/include/host/ffxm_fsr2.h`. + +``` cpp +float ffxmFsr2GetUpscaleRatioFactor(FfxmFsr2UpscalingRatio upscalingRatio) +FfxErrorCode ffxmFsr2GetRenderResolutionFromUpscalingRatio( + uint32_t* renderWidth, + uint32_t* renderHeight, + uint32_t displayWidth, + uint32_t displayHeight, + FfxmFsr2UpscalingRatio upscalingRatio) +``` + +## Performance +Depending on your target hardware and operating configuration, Arm ASR will operate at different performance levels. The table below compares the rendering performance of two Arm GPUs (Immortalis-G715 and Immortalis-G720) when using different upscaling settings at two target resolutions. + + +| Target resolution | Quality | Upscaling Ratio | Immortalis-G715 | Immortalis-G720 | +|-------------------|--------------------|-----------|-----------------|-----------------| +| 2800x1260 | Quality | 1.5x | 6.5 ms | 4.1 ms | +| | | 1.7x | 6.3 ms | 3.8 ms | +| | | 2x | 6.1 ms | 3.3 ms | +| | Balanced | 1.5x | 5.8 ms | 3.3 ms | +| | | 1.7x | 5.4 ms | 3.1 ms | +| | | 2x | 4.7 ms | 2.8 ms | +| | Performance | 1.5x | 5.4 ms | 3.2 ms | +| | | 1.7x | 5.3 ms | 2.9 ms | +| | | 2x | 4.6 ms | 2.5 ms | +| 2400x1080 | Quality | 1.5x | 5.3 ms | 2.9 ms | +| | | 1.7x | 4.8 ms | 2.7 ms | +| | | 2x | 4.3 ms | 2.5 ms | +| | Balanced | 1.5x | 4.2 ms | 2.5 ms | +| | | 1.7x | 4.0 ms | 2.3 ms | +| | | 2x | 3.6 ms | 2.2 ms | +| | Performance | 1.5x | 4.1 ms | 2.4 ms | +| | | 1.7x | 3.7 ms | 2.1 ms | +| | | 2x | 3.6 ms | 2 ms | + +## Shader variants and Extensions + +**Unless you are using the prebuilt shaders with the standalone VK backend**, be aware of the following definitions when integrating Arm ASR shaders: + +- **FFXM_GPU**. Needs to be defined globally when including the shader headers. +- **FFXM_HLSL**. 
If defined, the logic will fallback to use the **HLSL** specific syntax (i.e types, resource declaration ...). +- **FFXM_GLSL**. If defined, the logic will fallback to use the **GLSL** specific syntax (i.e types, resource declaration ...). + +The following table shows the list of the different shader mutators that can be used. All of them must be defined with a value of 0 or 1. Which shader variant to use is guided internally by **getPipelinePermutationFlags(...)** based on things like the user's flags and shader quality. + +| Define | Description | +| -------- | ------- | +| FFXM_FSR2_OPTION_HDR_COLOR_INPUT | If **1**, will assume that the input color is in linear RGB. | +| FFXM_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS | If **1**, will assume the input motion vectors texture is in low resolution | +| FFXM_FSR2_OPTION_JITTERED_MOTION_VECTORS | If **1**, will assume jittered motion vectors using the same jitter offsets as the input color and depth. | +| FFXM_FSR2_OPTION_INVERTED_DEPTH | If **1**, it will assume the input depth containing reversed depth values (far == 0.0f) | +| FFXM_FSR2_OPTION_APPLY_SHARPENING | If **1**, informs the shaders that RCAS (sharpening) pass will be used. | +| FFXM_FSR2_OPTION_SHADER_OPT_BALANCED | If **1**, enables a batch of optimizations when the **Balanced** quality preset is selected. | +| FFXM_FSR2_OPTION_SHADER_OPT_PERFORMANCE | If **1**, enables a batch of optimizations when the **Performance** quality preset is selected. When this is enabled then **FFXM_FSR2_OPTION_SHADER_OPT_BALANCED** will be enabled too. | + +Lastly, when using an HLSL-based workflow, we also have the **FFXM_HLSL_6_2** global define. If defined with a value of **1**, this will enable the use of explicit 16-bit types instead of relying on **half** (RelaxedPrecision). The **VK_KHR_shader_float16_int8** extension is required on Vulkan. + +## Input resources + +Arm ASR is a temporal algorithm, and therefore requires access to data from both the current and previous frame. The following table enumerates all external inputs required by it, with most function names available in `$ARMASR_DIR/include/host/ffxm_fsr2.h`. + +The resolution column indicates if the data should be at 'rendered' resolution or 'presentation' resolution. 'Rendered' resolution indicates that the resource should match the resolution at which the application is performing its rendering. Conversely, 'presentation' indicates that the resolution of the target should match that which is to be presented to the user. All resources are from the current rendered frame, for Vulkan applications all input resources should be transitioned to [`VK_ACCESS_SHADER_READ_BIT`](https://www.khronos.org/registry/vulkan/specs/1.3-extensions/man/html/VkAccessFlagBits.html) respectively before calling `ffxmFsr2ContextDispatch`. + +| Name | Resolution | Format | Type | Notes | +| ----------------|------------------------------|------------------------------------|-----------|------------------------------------------------| +| Color buffer | Render | `APPLICATION SPECIFIED` | Texture | The current frame’s color data for the current frame. If HDR, enable `FFXM_FSR2_ENABLE_HIGH_DYNAMIC_RANGE` in `FfxmFsr2ContextDescription`. | +| Depth buffer | Render | `APPLICATION SPECIFIED (1x FLOAT)` | Texture | The depth buffer for the current frame. The data should be provided as a single floating point value, the precision of which is under the application's control. 
Configure the depth through the `FfxmFsr2ContextDescription` when creating the `FfxmFsr2Context`. If the buffer is inverted, set the `FFXM_FSR2_ENABLE_DEPTH_INVERTED` flag ([1..0] range). If the buffer has an infinite far plane, set the `FFXM_FSR2_ENABLE_DEPTH_INFINITE` flag. If the application provides the depth buffer in `D32S8` format, then it will ignore the stencil component of the buffer, and create an `R32_FLOAT` resource to address the depth buffer. |
+| Motion vectors | Render or presentation | `APPLICATION SPECIFIED (2x FLOAT)` | Texture | The 2D motion vectors for the current frame, in the `[<-width, -height> ... <width, height>]` range. If your application renders motion vectors with a different range, you may use the `motionVectorScale` field of the `FfxmFsr2DispatchDescription` structure to adjust them to match the expected range for Arm ASR. Internally, Arm ASR uses 16-bit quantities to represent motion vectors in many cases, which means that while motion vectors with greater precision can be provided, Arm ASR will not benefit from the increased precision. The resolution of the motion vector buffer should be equal to the render resolution, unless the `FFXM_FSR2_ENABLE_DISPLAY_RESOLUTION_MOTION_VECTORS` flag is set when creating the `FfxmFsr2Context`, in which case it should be equal to the presentation resolution. |
+| Reactive mask | Render | `R8_UNORM` | Texture | As some areas of a rendered image do not leave a footprint in the depth buffer or include motion vectors, Arm ASR provides support for a reactive mask texture. This can be used to indicate to the technique where such areas are. Good examples of these are particles, or alpha-blended objects which do not write depth or motion vectors. If this resource is not set, then Arm ASR's shading change detection logic will handle these cases as best it can, but for optimal results, this resource should be set. For more information on the reactive mask please refer to the [Reactive mask](#reactive-mask) section. |
+| Exposure | 1x1 | `R32_FLOAT / R16_FLOAT` | Texture | The exposure value computed for the current frame. This resource may be omitted if the `FFXM_FSR2_ENABLE_AUTO_EXPOSURE` flag is set in the `FfxmFsr2ContextDescription` structure when creating the `FfxmFsr2Context`. |
+
+All inputs that are provided at render resolution, except for motion vectors, should be rendered with jitter. By default, motion vectors are expected to be unjittered unless the `FFXM_FSR2_ENABLE_MOTION_VECTORS_JITTER_CANCELLATION` flag is present.
+
+## Providing motion vectors +
+### Space +
+A key part of a temporal algorithm (be it antialiasing or upscaling) is the provision of motion vectors. Arm ASR accepts motion vectors in 2D which encode the motion from a pixel in the current frame to the position of that same pixel in the previous frame. It expects that motion vectors are provided by the application in the `[<-width, -height> .. <width, height>]` range; this matches screen space. For example, a motion vector for a pixel in the upper-left corner of the screen with a value of `<width, height>` would represent a motion that traversed the full width and height of the input surfaces, originating from the bottom-right corner.
+
+If your application computes motion vectors in another space - for example normalized device coordinate space - then you may use the `motionVectorScale` (`$ARMASR_DIR/include/host/ffxm_fsr2.h`) field of the `FfxmFsr2DispatchDescription` structure to instruct the technique to adjust them to match the expected range. 
The example HLSL and C++ code below illustrates how NDC-space motion vectors can be scaled to screen space using the Arm ASR host API. +
+GPU: Example of application NDC motion vector computation +```hlsl +float2 motionVector = (previousPosition.xy / previousPosition.w) \ + - (currentPosition.xy / currentPosition.w); +``` +
+CPU: Matching Arm ASR motionVectorScale configuration +```cpp +dispatchParameters.motionVectorScale.x = (float)renderWidth; +dispatchParameters.motionVectorScale.y = (float)renderHeight; +``` +
+### Precision & resolution +
+Internally, Arm ASR uses 16-bit quantities to represent motion vectors in many cases, which means that while motion vectors with greater precision can be provided, it will not currently benefit from the increased precision. The resolution of the motion vector buffer should be equal to the render resolution. If the `FFXM_FSR2_ENABLE_DISPLAY_RESOLUTION_MOTION_VECTORS` flag is set in `FfxmFsr2ContextDescription` when creating the `FfxmFsr2Context`, it should be equal to the presentation resolution. +
+### Coverage +
+Arm ASR will perform better quality upscaling when more objects provide their motion vectors. It is therefore advised that all opaque, alpha-tested and alpha-blended objects should write their motion vectors for all covered pixels. If vertex shader effects are applied, such as scrolling UVs, these calculations should also be factored into the calculation of motion for the best results. For alpha-blended objects it is also strongly advised that the alpha value of each covered pixel is stored to the corresponding pixel in the [reactive mask](#reactive-mask). This will allow the technique to perform better handling of alpha-blended objects during upscaling. The reactive mask is especially important for alpha-blended objects where writing motion vectors might be prohibitive, such as particles. +
+## Reactive mask +
+In the context of Arm ASR, the term "reactivity" means how much influence the samples rendered for the current frame have over the production of the final upscaled image. Typically, samples rendered for the current frame contribute a relatively modest amount to the result computed by the algorithm; however, there are exceptions. As there is no good way to determine from either color, depth or motion vectors which pixels have been rendered using alpha blending, Arm ASR performs best when applications explicitly mark such areas. +
+Therefore, it is strongly encouraged that applications provide a reactive mask as an input. The reactive mask guides Arm ASR on where it should reduce its reliance on historical information when compositing the current pixel, and instead allow the current frame's samples to contribute more to the final result. The reactive mask allows the application to provide a value from `[0.0..1.0]`, where `0.0` indicates that the pixel is not at all reactive (and should use the default composition strategy), and a value of `1.0` indicates the pixel should be fully reactive. This is a floating point range and can be tailored to different situations. +
+While there are other uses for the reactive mask, its primary application is producing better upscaling results for images which include alpha-blended objects. A good proxy for reactiveness is the alpha value used when compositing an alpha-blended object into the scene. Therefore, applications should write `alpha` to the reactive mask. 
Note that a reactive value close to `1` is unlikely to ever produce good results. Therefore, you should clamp the maximum reactive value to around `0.9`. +
+Provide a reactive mask by setting the `reactive` field of `FfxmFsr2DispatchDescription` to your mask resource; leave the field as `NULL` if you do not provide one. +
+If a reactive mask is not provided, then an internally generated `1x1` texture with a cleared reactive value will be used. +
+## Automatically generating reactivity +
+To help applications generate the reactive mask, we provide an optional utility pass. Under the hood, the API launches a fragment shader which computes these values for each pixel using a luminance-based heuristic. +
+To do this, applications can call the `ffxmFsr2ContextGenerateReactiveMask` (`$ARMASR_DIR/include/host/ffxm_fsr2.h`) function and should pass two versions of the color buffer: one containing opaque-only geometry, and the other containing both opaque and alpha-blended objects. +
+## Exposure +
+Arm ASR provides two values which control the exposure used when performing upscaling: +
+1. **Pre-exposure**: a value by which we divide the input signal to get back to the original signal produced by the game before any packing into lower precision render targets. +
+2. **Exposure**: a value which is multiplied against the result of the pre-exposed color value. +
+The exposure value should match the one the application uses during any subsequent tonemapping passes. This means Arm ASR will operate consistently with what is likely to be visible in the final tonemapped image. +
+{{% notice %}} +In various stages of the algorithm, the technique will compute its own exposure value for internal use. It is worth noting that all outputs will have this internal tonemapping reversed before the final output is written, meaning that Arm ASR returns results in the same domain as the original input signal. +{{% /notice %}} +
+Poorly selected exposure values can have a drastic impact on the final quality of Arm ASR's upscaling. Therefore, it is recommended that `FFXM_FSR2_ENABLE_AUTO_EXPOSURE` is used by the application, unless there is a particular reason not to. When `FFXM_FSR2_ENABLE_AUTO_EXPOSURE` is set in the `FfxmFsr2ContextDescription` structure, the exposure calculation in `ComputeAutoExposureFromLavg` (`$ARMASR_DIR/include/gpu/fsr2/ffxm_fsr2_common.h`) is used to compute the exposure value, which matches the exposure response of ISO 100 film stock. +
+## Modular backend +
+The design of the Arm ASR API means that the core implementation of the algorithm is unaware of which rendering API it sits upon. Instead, it calls functions provided to it through an interface, allowing different backends to be used with the technique. Applications which have their own rendering abstractions can implement their own backend, taking control of all aspects of Arm ASR's underlying function. This includes memory management, resource creation, shader compilation, shader resource bindings, and the submission of the workloads to the graphics device. +
+Out of the box, the API will compile into multiple libraries following the separation already outlined between the core API and the backends. This means that if you wish to use the backends provided, you should link both the core API library **Arm_ASR_api** and the backend **Arm_ASR_backend** matching your requirements. +
+Arm ASR provides only a built-in Vulkan backend, as it targets Vulkan mobile apps. 
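As an illustration of the linking step (a sketch only, assuming a CMake-based build; the target name `my_renderer` and the use of `ARMASR_DIR` are placeholders to adapt to your project):

```cmake
# Hypothetical CMake snippet: link the Arm ASR core API and the built-in Vulkan backend.
target_include_directories(my_renderer PRIVATE ${ARMASR_DIR}/include)
target_link_libraries(my_renderer PRIVATE
    Arm_ASR_api      # core API library, always required
    Arm_ASR_backend  # built-in Vulkan backend; omit if you implement your own FfxmInterface backend
)
```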
+
+## Camera jitter +
+Arm ASR relies on the application to apply sub-pixel jittering while rendering - this is typically included in the projection matrix of the camera. To make the application of camera jitter simple, the API provides a small set of utility functions which compute the sub-pixel jitter offset for a particular frame within a sequence of separate jitter offsets. +
+``` cpp +int32_t ffxmFsr2GetJitterPhaseCount(int32_t renderWidth, int32_t displayWidth); +FfxErrorCode ffxmFsr2GetJitterOffset(float* outX, float* outY, int32_t jitterPhase, int32_t sequenceLength); +``` +
+Internally, these functions implement a **Halton[2,3]** sequence. The goal of the Halton sequence is to provide spatially separated points, which cover the available space. +
+It is important to understand that the values returned from `ffxmFsr2GetJitterOffset` (`$ARMASR_DIR/include/host/ffxm_fsr2.h`) are in unit pixel space, and in order to composite this correctly into a projection matrix you have to convert them into projection offsets. The code below shows how to correctly composite the sub-pixel jitter offset value into a projection matrix. +
+``` cpp +const int32_t jitterPhaseCount = ffxmFsr2GetJitterPhaseCount(renderWidth, displayWidth); +
+float jitterX = 0; +float jitterY = 0; +ffxmFsr2GetJitterOffset(&jitterX, &jitterY, index, jitterPhaseCount); +
+// Convert the unit-pixel jitter into projection offsets, then calculate the jittered projection matrix. +const float jitterOffsetX = 2.0f * jitterX / (float)renderWidth; +const float jitterOffsetY = -2.0f * jitterY / (float)renderHeight; +const Matrix4 jitterTranslationMatrix = translateMatrix(Matrix3::identity, Vector3(jitterOffsetX, jitterOffsetY, 0)); +const Matrix4 jitteredProjectionMatrix = jitterTranslationMatrix * projectionMatrix; +``` +
+Jitter should be applied to *all* rendering. This includes opaque, alpha transparent, and raytraced objects. For rasterized objects, the sub-pixel jittering values calculated by the `ffxmFsr2GetJitterOffset` (`$ARMASR_DIR/include/host/ffxm_fsr2.h`) function can be applied to the camera projection matrix which is ultimately used to perform transformations during vertex shading. For raytraced rendering, the sub-pixel jitter should be applied to the ray's origin - often the camera's position. +
+Whether you use the recommended `ffxmFsr2GetJitterOffset` function or your own sequence generator, you must set the `jitterOffset` field of the `FfxmFsr2DispatchDescription` structure to inform the algorithm of the jitter offset that has been applied in order to render each frame. Moreover, if you are not using the recommended `ffxmFsr2GetJitterOffset` function, care should be taken that your jitter sequence never generates a null vector; that is, a value of 0 in both the X and Y dimensions. +
+## Camera jump cuts +
+Most applications with real-time rendering have a large degree of temporal consistency between any two consecutive frames. However, there are cases where a change to a camera's transformation might cause an abrupt change in what is rendered. In such cases, Arm ASR is unlikely to be able to reuse any data it has accumulated from previous frames, and should clear this data to exclude it from consideration in the compositing process. In order to indicate that a jump cut has occurred with the camera, you should set the `reset` field of the `FfxmFsr2DispatchDescription` structure to `true` for the first frame of the discontinuous camera transformation. 
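For example, this is how the flag might be set when filling out the per-frame dispatch parameters (a sketch; `dispatchParameters` and `cameraCutThisFrame` are illustrative names from your own frame loop, and only the `reset` field name comes from this guide):

```cpp
// Illustrative per-frame setup: signal a camera jump cut to Arm ASR.
dispatchParameters.reset = cameraCutThisFrame; // true only for the first frame after the cut
```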
+
+Rendering performance may be slightly lower than during typical frame-to-frame operation when using the reset flag, as Arm ASR will clear some additional internal resources. +
+## Mipmap biasing +
+Applying a negative mipmap bias will typically generate an upscaled image with better texture detail. We recommend applying the following formula to your mipmap bias: +
+``` cpp +mipBias = log2(renderResolution/displayResolution) - 1.0; +``` +
+## Frame Time Delta Input +
+The API requires `frameTimeDelta` to be provided by the application through the `FfxmFsr2DispatchDescription` structure. This value is in milliseconds. If running at 60fps, the value passed should be around `16.6f`. +
+The value is used within the temporal component of the auto-exposure feature. This allows for tuning of the history accumulation for quality purposes. +
+## HDR support +
+High dynamic range images are supported. To enable this, you should set the `FFXM_FSR2_ENABLE_HIGH_DYNAMIC_RANGE` flag in the `FfxmFsr2ContextDescription` structure. Provide the input color image in linear RGB color space. +
+## Debug Checker +
+The context description structure can be provided with a callback function for passing textual warnings from the runtime to the underlying application. The `fpMessage` member of the description is of type `FfxmFsr2Message`, which is a function pointer for passing string messages of various types. Assigning this variable to a suitable function, and passing the `FFXM_FSR2_ENABLE_DEBUG_CHECKING` flag within `FfxmFsr2ContextDescription`, will enable the feature. It is recommended this is enabled only in debug development builds. +
+## Extended ffx_shader_compiler +
+Most of the workloads in the upscalers have been converted to fragment shaders. Since the workflow using the standalone VK backend relies on reflection data generated with [`AMD's Shader Compiler`](https://github.com/GPUOpen-LibrariesAndSDKs/FidelityFX-SDK/blob/main/docs/tools/ffx-sc.md), it became necessary to make an ad-hoc extension of the tool to provide reflection data for the RenderTargets, so that resources can be resolved automatically in the backend. Users might want to evolve the algorithm, potentially changing the RenderTargets in the process. Thus, a diff file is provided with the changes that were applied locally to `ffx_shader_compiler` (`$ARMASR_DIR/tools/ffx_shader_compiler.diff`) for the latest version of the technique. +
+## Generate prebuilt shaders +
+We provide a helper script to generate the prebuilt shaders which are used by the standalone backend. Make sure Python is installed, and run the script with the following command: +
+```bash +python $ARMASR_DIR/tools/generate_prebuilt_shaders.py +``` +
+You will find the output from the script in `$ARMASR_DIR/src/backends/shared/blob_accessors/prebuilt_shaders`. +
+## Targeting GLES 3.2 +
+Running Arm ASR on GLES is possible when using the [tight integration](#tight-integration) approach. In this scenario, you will have to apply two changes: +
+1. When creating the context, specify the flag `FFXM_FSR2_OPENGL_ES_3_2` in the `FfxmFsr2ContextDescription`. This will trigger changes internally so that Arm ASR adapts to a GLES-friendly approach. +
+1. The `permutationOptions` (`$ARMASR_DIR/include/host/ffxm_interface.h`) provided when creating the pipelines will now include the new permutation option `FSR2_SHADER_PERMUTATION_PLATFORM_GLES_3_2` (`$ARMASR_DIR/src/components/fsr2/ffxm_fsr2_private.h`). 
This is a hint to the user that they will need to use the shader variants for the technique with the following symbol defined: + + ``` + #define FFXM_SHADER_PLATFORM_GLES_3_2 1 + ``` + +## Next steps + +You are now ready to use Arm ASR in your game engine projects. Go to the next section to explore more resources on Arm ASR. \ No newline at end of file diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/05-get-help.md b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/05-get-help.md new file mode 100644 index 0000000000..083695d379 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/05-get-help.md @@ -0,0 +1,21 @@ +--- +title: Getting help and finding resources +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Get support + +You can reach out to us on the following email address: . + +## Additional resources + +The following links provide additional information about Arm ASR. + +* [Arm ASR on Arm Developer Hub](https://www.arm.com/developer-hub/mobile-graphics-and-gaming/accuracy-super-resolution) +* [Arm ASR Manga Comic](https://developer.arm.com/Mobile%20Graphics%20and%20Gaming/FeaturedContent/Mali%20Manga/FeaturedContent-MaliManga-Volume4) +* [Arm Community Blog](https://community.arm.com/arm-community-blogs/b/graphics-gaming-and-vr-blog/posts/introducing-arm-accuracy-super-resolution) +* [Arm Accuracy Super Resolution for Unreal Engine Tutorial](https://developer.arm.com/documentation/109993/latest/) +* [Arm Accuracy Super Resolution for Universal SDK Tutorial](https://developer.arm.com/documentation/110404/latest/) \ No newline at end of file diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/_index.md b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/_index.md new file mode 100644 index 0000000000..1f49e92fe0 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/_index.md @@ -0,0 +1,65 @@ +--- +title: Get started with Arm Accuracy Super Resolution (Arm ASR) + +minutes_to_complete: 20 + +draft: true +cascade: + draft: true + +who_is_this_for: Mobile gaming and graphics developers who want to install and configure Arm Accuracy Super Resolution (Arm ASR) in their projects, to improve performance on highly complex game content without compromising on image quality. + +learning_objectives: + - Understand Arm Accuracy Super Resolution + - Integrate Arm ASR into your game project + - Control how Arm ASR upscales your content + +prerequisites: + - A game project that uses high-quality rendering features (such as hardware ray tracing) that stretch the performance capabilities of everyday smartphones. + - A development machine with Git set up. 
+ +author: Julie Gaskin + +### Tags +skilllevels: Advanced +subjects: Graphics +armips: + - Mali + - Immortalis +tools_software_languages: + - Unreal Engine +operatingsystems: + - Android + + + +further_reading: + - resource: + title: Arm ASR on Arm Developer Hub + link: https://www.arm.com/developer-hub/mobile-graphics-and-gaming/accuracy-super-resolution + type: website + - resource: + title: Arm ASR Manga Comic + link: https://developer.arm.com/Mobile%20Graphics%20and%20Gaming/FeaturedContent/Mali%20Manga/FeaturedContent-MaliManga-Volume4 + type: website + - resource: + title: Arm Community Blog + link: https://community.arm.com/arm-community-blogs/b/graphics-gaming-and-vr-blog/posts/introducing-arm-accuracy-super-resolution + type: blog + - resource: + title: Arm Accuracy Super Resolution for Unreal Engine Tutorial + link: https://developer.arm.com/documentation/109993/latest/ + type: documentation + - resource: + title: Arm Accuracy Super Resolution for the Generic Library Tutorial + link: https://developer.arm.com/documentation/110404/latest/ + type: documentation + + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/_next-steps.md b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/add_asr_feature.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/add_asr_feature.png new file mode 100644 index 0000000000..979e73abf5 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/add_asr_feature.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/arm_asr_debugger.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/arm_asr_debugger.png new file mode 100644 index 0000000000..7ed7c26e2f Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/arm_asr_debugger.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/arm_asr_enable_command.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/arm_asr_enable_command.png new file mode 100644 index 0000000000..4f4ca1f52a Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/arm_asr_enable_command.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/arm_asr_settings.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/arm_asr_settings.png new file mode 100644 index 0000000000..7f79d38e62 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/arm_asr_settings.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/arm_asr_view.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/arm_asr_view.png new file mode 100644 index 0000000000..ffe4238a09 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/arm_asr_view.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/asr_popup.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/asr_popup.png new file mode 100644 index 0000000000..6611bc28ef Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/asr_popup.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/camera_settings.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/camera_settings.png new file mode 100644 index 0000000000..7da6fb34ef Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/camera_settings.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/change_anti_aliasing_method.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/change_anti_aliasing_method.png new file mode 100644 index 0000000000..1ae3a4874f Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/change_anti_aliasing_method.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/configuration_options.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/configuration_options.png new file mode 100644 index 
0000000000..12351caf68 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/configuration_options.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/copied_arm_asr_plugin.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/copied_arm_asr_plugin.png new file mode 100644 index 0000000000..cfd378642d Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/copied_arm_asr_plugin.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/create_renderer.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/create_renderer.png new file mode 100644 index 0000000000..e09399b81c Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/create_renderer.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/disable_opaque_downsampling.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/disable_opaque_downsampling.png new file mode 100644 index 0000000000..0a3b1187c9 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/disable_opaque_downsampling.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/plugin_folder.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/plugin_folder.png new file mode 100644 index 0000000000..83cb0f7683 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/plugin_folder.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/plugin_screen.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/plugin_screen.png new file mode 100644 index 0000000000..f5bdec80c2 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/plugin_screen.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/project_settings.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/project_settings.png new file mode 100644 index 0000000000..968df94bed Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/project_settings.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/select_arm_asr.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/select_arm_asr.png new file mode 100644 index 0000000000..3ee3202255 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/select_arm_asr.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/third_person_pack.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/third_person_pack.png new file mode 100644 index 0000000000..4f18194388 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/third_person_pack.png differ diff --git 
a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/third_person_pack_opening_screen.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/third_person_pack_opening_screen.png new file mode 100644 index 0000000000..ab4564614d Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/third_person_pack_opening_screen.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/ui_settings.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/ui_settings.png new file mode 100644 index 0000000000..34fac73324 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/ui_settings.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/verify_plugin.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/verify_plugin.png new file mode 100644 index 0000000000..abcca0cab6 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/verify_plugin.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/visualise_console_command.png b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/visualise_console_command.png new file mode 100644 index 0000000000..7b3fdc4c47 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/get-started-with-arm-asr/images/visualise_console_command.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/_index.md b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/_index.md new file mode 100644 index 0000000000..e89be4d2e9 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/_index.md @@ -0,0 +1,55 @@ +--- +title: Run an AI Agent Application with llama.cpp and llama-cpp-agent using KleidiAI on Arm servers. + +draft: true +cascade: + draft: true + +minutes_to_complete: 45 + +who_is_this_for: This is an introductory topic for software developers and ML engineers looking to run an AI Agent Application. + +learning_objectives: + - Set up llama-cpp-python optimised for Arm servers. + - Learn how to run optimized LLM models. + - Learn how to create custom functions for LLMs. + - Learn how to use AI Agents for applications. + +prerequisites: + - An [Arm-based instance](/learning-paths/servers-and-cloud-computing/csp/) from a cloud service provider or an on-premise Arm server. + - Basic understanding of Python and Prompt Engineering + - Understanding of LLM fundamentals. 
+ +author: Andrew Choi + +### Tags +skilllevels: Introductory +subjects: ML +armips: + - Neoverse +tools_software_languages: + - Python + - AWS Graviton +operatingsystems: + - Linux + + + +further_reading: + - resource: + title: llama.cpp + link: https://github.com/ggml-org/llama.cpp + type: documentation + - resource: + title: llama-cpp-agent + link: https://llama-cpp-agent.readthedocs.io/en/latest/ + type: documentation + + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/agent-output.md b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/agent-output.md new file mode 100644 index 0000000000..5b056a7429 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/agent-output.md @@ -0,0 +1,205 @@ +--- +title: Understand and test the AI Agent +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## AI Agent Function Calls + +An AI agent, powered by a Large Language Model (LLM), decides which function to use by analyzing the prompt or input it receives, identifying the relevant intent or task, and then matching that intent to the most appropriate function from a pre-defined set of available functions based on its understanding of the language and context. + +Lets look at how this is implemented in the python script `agent.py`. + +- This code section of `agent.py` shown below creates an instance of the quantized `llama3.1 8B` model for more efficient inference on Arm-based systems. +```output +llama_model = Llama( + model_path="./models/dolphin-2.9.4-llama3.1-8b-Q4_0.gguf", + n_batch=2048, + n_ctx=10000, + n_threads=64, + n_threads_batch=64, +) +``` + +- Next, you define a provider that leverages the `llama.cpp` Python bindings. +```output +provider = LlamaCppPythonProvider(llama_model) +``` + +- The LLM has access to certain tools or functions and can take a general user input and decide which functions to call. The function’s docstring guides the LLM on when and how to invoke it. 
In `agent.py`, three such tools or functions are defined: `open_webpage`, `get_current_time`, and `calculator`. + +```python +def open_webpage(): + """ + Open Learning Path Website when user asks the agent regarding Arm Learning Path + """ + import webbrowser + + url = "https://learn.arm.com/" + webbrowser.open(url, new=0, autoraise=True) + + +def get_current_time(): + """ + Returns the current time in H:MM AM/PM format. + """ + import datetime # Import datetime module to get current time + + now = datetime.datetime.now() # Get current time + return now.strftime("%I:%M %p") # Format time in H:MM AM/PM format + + +class MathOperation(Enum): + ADD = "add" + SUBTRACT = "subtract" + MULTIPLY = "multiply" + DIVIDE = "divide" + + +def calculator( + number_one: Union[int, float], + number_two: Union[int, float], + operation: MathOperation, +) -> Union[int, float]: + """ + Perform a math operation on two numbers. + + Args: + number_one: First number + number_two: Second number + operation: Math operation to perform + + Returns: + Result of the mathematical operation + + Raises: + ValueError: If the operation is not recognized + """ + if operation == MathOperation.ADD: + return number_one + number_two + elif operation == MathOperation.SUBTRACT: + return number_one - number_two + elif operation == MathOperation.MULTIPLY: + return number_one * number_two + elif operation == MathOperation.DIVIDE: + return number_one / number_two + else: + raise ValueError("Unknown operation.") +``` + +- `from_functions` creates an instance of `LlmStructuredOutputSettings` by passing in a list of callable Python functions. The LLM can then decide if and when to use these functions based on user queries. + +```python +output_settings = LlmStructuredOutputSettings.from_functions( + [get_current_time, open_webpage, calculator], allow_parallel_function_calling=True +) + +``` +- The user's prompt is then collected and processed through `LlamaCppAgent`. The agent decides whether to call any defined functions based on the request. +```python +user = input("Please write your prompt here: ") + +llama_cpp_agent = LlamaCppAgent( + provider, + debug_output=True, + system_prompt="You're a helpful assistant to answer User query.", + predefined_messages_formatter_type=MessagesFormatterType.LLAMA_3, +) + +result = llama_cpp_agent.get_chat_response( + user, structured_output_settings=output_settings, llm_sampling_settings=settings +) +``` + +## Run the AI Agent + +You are now ready to test and execute the AI Agent Python script.
Start the application: + +```bash +python3 agent.py +``` + +You will see lots of interesting statistics being printed from `llama.cpp` about the model and the system, followed by the prompt as shown: + +```output +llama_kv_cache_init: CPU KV buffer size = 1252.00 MiB +llama_init_from_model: KV self size = 1252.00 MiB, K (f16): 626.00 MiB, V (f16): 626.00 MiB +llama_init_from_model: CPU output buffer size = 0.49 MiB +llama_init_from_model: CPU compute buffer size = 677.57 MiB +llama_init_from_model: graph nodes = 1030 +llama_init_from_model: graph splits = 1 +CPU : NEON = 1 | ARM_FMA = 1 | FP16_VA = 1 | MATMUL_INT8 = 1 | SVE = 1 | DOTPROD = 1 | MATMUL_INT8 = 1 | SVE_CNT = 32 | OPENMP = 1 | AARCH64_REPACK = 1 | +Model metadata: {'tokenizer.chat_template': "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", 'tokenizer.ggml.eos_token_id': '128256', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'gpt2', 'llama.vocab_size': '128258', 'general.file_type': '2', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.freq_base': '500000.000000', 'tokenizer.ggml.bos_token_id': '128000', 'llama.attention.head_count': '32', 'llama.feed_forward_length': '14336', 'general.architecture': 'llama', 'llama.attention.head_count_kv': '8', 'llama.block_count': '32', 'tokenizer.ggml.padding_token_id': '128004', 'general.basename': 'Meta-Llama-3.1', 'llama.embedding_length': '4096', 'general.base_model.0.organization': 'Meta Llama', 'tokenizer.ggml.pre': 'llama-bpe', 'llama.context_length': '131072', 'general.name': 'Meta Llama 3.1 8B', 'llama.rope.dimension_count': '128', 'general.base_model.0.name': 'Meta Llama 3.1 8B', 'general.organization': 'Meta Llama', 'general.type': 'model', 'general.size_label': '8B', 'general.base_model.0.repo_url': 'https://huggingface.co/meta-llama/Meta-Llama-3.1-8B', 'general.license': 'llama3.1', 'general.base_model.count': '1'} +Available chat formats from metadata: chat_template.default +Using gguf chat template: {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + ' +' + message['content'] + '<|im_end|>' + ' +'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant +' }}{% endif %} +Using chat eos_token: <|im_end|> +Using chat bos_token: <|begin_of_text|> +Please write your prompt here: +``` + +## Test the AI agent + +When you are presented with "Please write your prompt here:" test it with an input prompt. Enter "What is the current time?" + +- As part of the prompt, a list of executable functions is sent to the LLM, allowing the agent to select the appropriate function: + +```output +Read and follow the instructions below: + + +You're a helpful assistant to answer User query. + + + +You can call functions to help you with your tasks and user queries. The available functions are: + + +Function: get_current_time + Description: Returns the current time in H:MM AM/PM format. + Parameters: + none + +Function: open_webpage + Description: Open Learning Path Website when user asks the agent regarding Arm Learning Path + Parameters: + none + +Function: calculator + Description: Perform a math operation on two numbers. 
+ Parameters: + number_one (int or float): First number + number_two (int or float): Second number + operation (enum): Math operation to perform Can be one of the following values: 'add' or 'subtract' or 'multiply' or 'divide' + + +To call a function, respond with a JSON object (to call one function) or a list of JSON objects (to call multiple functions), with each object containing these fields: + +- "function": Put the name of the function to call here. +- "arguments": Put the arguments to pass to the function here. +``` + +The AI Agent then decides to invoke the appropriate function and return the result as shown: + +```output +[ + { + "function": + "get_current_time", + "arguments": {} + } +] +---------------------------------------------------------------- +Response from AI Agent: +[{'function': 'get_current_time', 'arguments': {}, 'return_value': '07:58 PM'}] +---------------------------------------------------------------- +``` + +You have now tested when you enter, "What is the current time?", the AI Agent will choose to call the `get_current_time()` function, and return a result in **H:MM AM/PM** format. + +You have successfully run an AI agent. You can ask different questions to trigger and execute other functions. You can extend your AI agent by defining custom functions so it can handle specific tasks. + + + diff --git a/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/ai-agent-backend.md b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/ai-agent-backend.md new file mode 100644 index 0000000000..0dc91bc38b --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/ai-agent-backend.md @@ -0,0 +1,160 @@ +--- +title: AI Agent Application +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Python Script for executing an AI Agent application +With `llama.cpp` built and the Llama3.1 8B model downloaded, you are now ready to create a Python script to execute an AI Agent Application: + +Create a Python file named `agent.py` with the content shown below: +```bash +from enum import Enum +from typing import Union +from pydantic import BaseModel, Field +from llama_cpp_agent import MessagesFormatterType +from llama_cpp_agent.chat_history.messages import Roles +from llama_cpp_agent.llm_output_settings import LlmStructuredOutputSettings +from llama_cpp_agent import LlamaCppFunctionTool +from llama_cpp_agent import FunctionCallingAgent +from llama_cpp_agent import MessagesFormatterType +from llama_cpp_agent import LlamaCppAgent +from llama_cpp_agent.providers import LlamaCppPythonProvider +from llama_cpp import Llama +# import os +# from dotenv import load_dotenv +# from langchain_community.tools import TavilySearchResults # Uncomment this to enable search function + + +# load_dotenv() + +# os.environ.get("TAVILY_API_KEY") + +llama_model = Llama( + model_path="./models/dolphin-2.9.4-llama3.1-8b-Q4_0.gguf", # make sure you use the correct path for the quantized model + n_batch=2048, + n_ctx=10000, + n_threads=64, + n_threads_batch=64, +) + +provider = LlamaCppPythonProvider(llama_model) + + +def open_webpage(): + """ + Open Learning Path Website when user asks the agent regarding Arm Learning Path + """ + import webbrowser + + url = "https://learn.arm.com/" + webbrowser.open(url, new=0, autoraise=True) + + +def get_current_time(): + """ + Returns the current time in H:MM AM/PM format. 
+ """ + import datetime # Import datetime module to get current time + + now = datetime.datetime.now() # Get current time + return now.strftime("%I:%M %p") # Format time in H:MM AM/PM format + + +class MathOperation(Enum): + ADD = "add" + SUBTRACT = "subtract" + MULTIPLY = "multiply" + DIVIDE = "divide" + + +def calculator( + number_one: Union[int, float], + number_two: Union[int, float], + operation: MathOperation, +) -> Union[int, float]: + """ + Perform a math operation on two numbers. + + Args: + number_one: First number + number_two: Second number + operation: Math operation to perform + + Returns: + Result of the mathematical operation + + Raises: + ValueError: If the operation is not recognized + """ + if operation == MathOperation.ADD: + return number_one + number_two + elif operation == MathOperation.SUBTRACT: + return number_one - number_two + elif operation == MathOperation.MULTIPLY: + return number_one * number_two + elif operation == MathOperation.DIVIDE: + return number_one / number_two + else: + raise ValueError("Unknown operation.") + +# Uncomment the following function to enable web search functionality (You will need to install langchain-community) +# def search_from_the_web(content: str): +# """ +# Search useful information from the web to answer User's question + +# Args: +# content: Useful question to retrieve data from the web to answer user's question +# """ +# tool = TavilySearchResults( +# max_results=1, +# search_depth="basic" +# ) +# result = tool.invoke({"query":content}) +# return result + +settings = provider.get_provider_default_settings() + +settings.temperature = 0.65 +# settings.top_p = 0.85 +# settings.top_k = 60 +# settings.tfs_z = 0.95 +settings.max_tokens = 4096 + +output_settings = LlmStructuredOutputSettings.from_functions( + [get_current_time, open_webpage, calculator], allow_parallel_function_calling=True +) + + +def send_message_to_user_callback(message: str): + print(message) + + +def run_web_search_agent(): + user = input("Please write your prompt here: ") + if user == "exit": + return + + llama_cpp_agent = LlamaCppAgent( + provider, + debug_output=True, + system_prompt="You're a helpful assistant to answer User query.", + predefined_messages_formatter_type=MessagesFormatterType.LLAMA_3, + ) + + result = llama_cpp_agent.get_chat_response( + user, structured_output_settings=output_settings, llm_sampling_settings=settings + ) + + print("----------------------------------------------------------------") + print("Response from AI Agent:") + print(result) + print("----------------------------------------------------------------") + +if __name__ == '__main__': + run_web_search_agent() +``` +In the next section, you will inspect this script to understand how the LLM is configured and used to execute Agent tasks using this script. You will then proceed to executing and testing the AI Agent. + diff --git a/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/ai-agent.md b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/ai-agent.md new file mode 100644 index 0000000000..2e8f7ee943 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/ai-agent.md @@ -0,0 +1,46 @@ +--- +title: Introduction to AI Agents and Agent Use Cases +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview of AI Agents + +An AI Agent is best understood as an integrated system that goes beyond standard text generation by equipping Large Language Models (LLMs) with tools and domain knowledge. 
Here’s a closer look at the underlying elements: + +- **System**: Each AI Agent functions as an interconnected ecosystem of components. + - **Environment**: The domain in which the AI Agent operates. For instance, in a system that books travel itineraries, the relevant environment might include airline reservation systems and hotel booking tools. + - **Sensors**: Methods the AI Agent uses to observe its surroundings. For a travel agent, these could be APIs that inform the agent about seat availability on flights or room occupancy in hotels. + - **Actuators**: Ways the AI Agent exerts influence within that environment. In the example of a travel agent, placing a booking or modifying an existing reservation serves as the agent’s “actuators.” + +- **Large Language Models**: While the notion of agents is not new, LLMs bring powerful language comprehension and data-processing capabilities to agent setups. +- **Performing Actions**: Rather than just produce text, LLMs within an agent context interpret user instructions and interact with tools to achieve specific objectives. +- **Tools**: The agent’s available toolkit depends on the software environment and developer-defined boundaries. In the travel agent example, these tools might be limited to flight and hotel reservation APIs. +- **Knowledge**: Beyond immediate data sources, the agent can fetch additional details—perhaps from databases or web services—to enhance decision making. + + + +## Types of AI Agents + +AI Agents come in multiple forms. The table below provides an overview of some agent types and examples illustrating their roles in a travel booking system: + +| **Agent Category** | **Key Characteristics** | **Example usage in a Travel system** | +|--------------------------|--------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------| +| **Simple Reflex Agents** | Act directly based on set rules or conditions. | Filters incoming messages and forwards travel-related emails to a service center. | +| **Model-Based Agents** | Maintain an internal representation of the world and update it based on new inputs. | Monitors flight prices and flags dramatic fluctuations, guided by historical data. | +| **Goal-Based Agents** | Execute actions with the aim of meeting designated objectives. | Figures out the necessary route (flights, transfers) to get from your current location to your target destination. | +| **Utility-Based Agents** | Use scoring or numerical metrics to compare and select actions that fulfill a goal. | Balances cost versus convenience when determining which flights or hotels to book. | +| **Learning Agents** | Adapt over time by integrating lessons from previous feedback or experiences. | Adjusts future booking suggestions based on traveler satisfaction surveys. | +| **Hierarchical Agents** | Split tasks into sub-tasks and delegate smaller pieces of work to subordinate agents.| Cancels a trip by breaking down the process into individual steps, such as canceling a flight, a hotel, and a car rental. | +| **Multi-Agent Systems** | Involve multiple agents that may cooperate or compete to complete tasks. | Cooperative: Different agents each manage flights, accommodations, and excursions. Competitive: Several agents vie for limited rooms. 
| + + +## Ideal Applications for AI Agents + +While the travel scenario illustrates different categories of AI Agents, there are broader circumstances where agents truly excel: + +- **Open-Ended Challenges**: Complex tasks with no predetermined procedure, requiring the agent to determine the necessary steps. +- **Procedural or Multi-Step Tasks**: Endeavors requiring numerous phases or tool integrations, allowing the agent to switch between resources. +- **Continual Improvement**: Contexts where feedback loops enable the agent to refine its behaviors for better outcomes in the future. diff --git a/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/set-up.md b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/set-up.md new file mode 100644 index 0000000000..ba4bdb9bbb --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/set-up.md @@ -0,0 +1,133 @@ +--- +title: Set up the Environment to Run an AI Application Locally +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Before you begin + +This Learning Path demonstrates how to build an AI Agent Application using open-source Large Language Models (LLMs) optimized for Arm architecture. The AI Agent can use Large Language Models (LLMs) to perform actions by accessing tools and knowledge. The instructions in this Learning Path have been designed for Arm servers running Ubuntu 22.04 LTS. You need an Arm server instance with at least 4 cores and 16GB of memory to run this example. Configure disk storage up to at least 32 GB. The instructions have been tested on an AWS EC2 Graviton3 `m7g.xlarge instance`. + +## Overview + +In this Learning Path, you learn how to build an AI Agent application using `llama-cpp-python` and `llama-cpp-agent`. `llama-cpp-python` is a Python binding for `llama.cpp` that enables efficient LLM inference on Arm CPUs and `llama-cpp-agent` provides an interface for processing text using agentic chains with tools. + +## Install dependencies + +Install the following packages on your Arm based server instance: + +```bash +sudo apt-get update +sudo apt-get upgrade +sudo apt install python3-pip python3-venv cmake -y +``` + +Create and activate a Python virtual environment: +```bash +python3 -m venv ai-agent +source ai-agent/bin/activate +``` + +Install the `llama-cpp-python` package using pip: + +```bash +pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu +``` + +Install the `llama-cpp-agent` and `pydantic` packages using pip: + +```bash +pip install llama-cpp-agent pydantic +``` + + +## Download the pre-quantized Llama-3.1-8B LLM model from Hugging Face + +You are now ready to download the LLM. + +Create and navigate to the models directory: + +```bash +mkdir models +cd models +``` +Install the `huggingface_hub` python library using `pip`: + +```bash +pip install huggingface_hub +``` +You can now download the [pre-quantized Llama3.1 8B model](https://huggingface.co/cognitivecomputations/dolphin-2.9.4-llama3.1-8b-gguf) using the huggingface cli: + +```bash +huggingface-cli download cognitivecomputations/dolphin-2.9.4-llama3.1-8b-gguf dolphin-2.9.4-llama3.1-8b-Q4_0.gguf --local-dir . --local-dir-use-symlinks False +``` + +`Q4_0` in the model name refers to the quantization method the model uses. 
The goal of quantization is to reduce the size of the model (to reduce the memory space required) and faster (to reduce memory bandwidth bottlenecks transferring large amounts of data from memory to a processor). The primary trade-off to keep in mind when reducing a model’s size is maintaining quality of performance. Ideally, a model is quantized to meet size and speed requirements while not having a negative impact on performance. + +The main thing to note in the quantization format is the number of bits per parameter, which is denoted by ‘Q4’ in this case or 4-bit integer. + +## Build llama.cpp + +As of [llama.cpp commit 0f1a39f3](https://github.com/ggerganov/llama.cpp/commit/0f1a39f3), Arm has contributed code for performance optimization with KleidiAI kernels. You can take advantage of these kernels by running your models using the `llama.cpp` framework. + +Navigate to your home directory: + +```bash +cd ~ +``` + +Clone the `llama.cpp` repository: + +```bash +git clone https://github.com/ggerganov/llama.cpp +``` + +Build llama.cpp: + +```bash +cd llama.cpp +mkdir build +cd build +cmake .. -DCMAKE_CXX_FLAGS="-mcpu=native" -DCMAKE_C_FLAGS="-mcpu=native" +cmake --build . -v --config Release -j `nproc` +``` +`llama.cpp` is now built in the `bin` directory. +Check that `llama.cpp` has built correctly by running the help command: + +```bash +cd bin +./llama-cli -h +``` + +If `llama.cpp` has built correctly on your machine, you will see the help options being displayed. A snippet of the output is shown below: + +```output +usage: ./llama-cli [options] + +general: + + -h, --help, --usage print usage and exit + --version show version and build info + -v, --verbose print verbose information + --verbosity N set specific verbosity level (default: 0) + --verbose-prompt print a verbose prompt before generation (default: false) + --no-display-prompt don't print prompt at generation (default: false) + -co, --color colorise output to distinguish prompt and user input from generations (default: false) + -s, --seed SEED RNG seed (default: -1, use random seed for < 0) + -t, --threads N number of threads to use during generation (default: 4) + -tb, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads) + -td, --threads-draft N number of threads to use during generation (default: same as --threads) + -tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default: same as --threads-draft) + --draft N number of tokens to draft for speculative decoding (default: 5) + -ps, --p-split N speculative decoding split probability (default: 0.1) + -lcs, --lookup-cache-static FNAME + path to static lookup cache to use for lookup decoding (not updated by generation) + -lcd, --lookup-cache-dynamic FNAME + path to dynamic lookup cache to use for lookup decoding (updated by generation) + -c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model) + -n, --predict N number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled) + -b, --batch-size N logical maximum batch size (default: 2048) +``` +In the next section you will create a python script to execute an AI Agent powered by the model you downloaded. 
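+
+Optionally, before you do that, you can check that the quantized model you downloaded loads correctly with the `llama-cpp-python` binding you installed earlier. This is a quick smoke test rather than a required step, and the model path, context size, and thread count in the sketch below are assumptions that you might need to adjust for your setup. Run it from the directory that contains your `models` folder, with the `ai-agent` virtual environment active:
+
+```python
+# Quick smoke test: load the Q4_0 GGUF with llama-cpp-python and run a short completion.
+# The path and parameters below are assumptions; adjust them to match your environment.
+from llama_cpp import Llama
+
+llm = Llama(
+    model_path="./models/dolphin-2.9.4-llama3.1-8b-Q4_0.gguf",  # path used earlier in this Learning Path
+    n_ctx=512,       # a small context window is enough for a smoke test
+    n_threads=4,     # adjust to the number of cores on your instance
+    verbose=False,
+)
+
+output = llm("Briefly, what is an AI agent?", max_tokens=32)
+print(output["choices"][0]["text"].strip())
+```
+
+If the model loads and prints a short completion, your environment is ready for the agent script you will write next.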
diff --git a/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/test_functions.png b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/test_functions.png new file mode 100644 index 0000000000..7960beff9e Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/test_functions.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/test_output.png b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/test_output.png new file mode 100644 index 0000000000..1b4018939e Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/test_output.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/test_prompt.png b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/test_prompt.png new file mode 100644 index 0000000000..f4aa3ede82 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/test_prompt.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/arm-cpp-memory-model/1.md b/content/learning-paths/servers-and-cloud-computing/arm-cpp-memory-model/1.md index 053b508448..3362c46937 100644 --- a/content/learning-paths/servers-and-cloud-computing/arm-cpp-memory-model/1.md +++ b/content/learning-paths/servers-and-cloud-computing/arm-cpp-memory-model/1.md @@ -30,7 +30,7 @@ MULT R3, #R1, #5 // B - **Hardware Perceived Order**: This is the perspective observed by other devices in the system, which can differ if the hardware buffers writes or merges memory operations. Crucially, the hardware-perceived order can vary between CPU architectures, for example between x86 and Arm, and this should be considered when porting applications. An abstract diagram from the academic paper is shown below [Maranget et. al, 2012]. A write operation in one of the 5 threads in the pentagon below may propagate to the other threads in any order. -![abstract_model](./Abstract_model.png) +![abstract_model](./multi-copy-atomic.png) ## High-level differences between the Arm memory model and the x86 memory model diff --git a/content/learning-paths/servers-and-cloud-computing/arm-cpp-memory-model/2.md b/content/learning-paths/servers-and-cloud-computing/arm-cpp-memory-model/2.md index d2e91c15bf..4bc458a7c8 100644 --- a/content/learning-paths/servers-and-cloud-computing/arm-cpp-memory-model/2.md +++ b/content/learning-paths/servers-and-cloud-computing/arm-cpp-memory-model/2.md @@ -48,7 +48,7 @@ In the pseudo code snippet above, it's possible for operation B to precede opera - `memory_order_acquire` and `memory_order_release` -Acquire and release are used to synchronise atomic variables. In the example below, thread A writes to memory (allocating the string and setting data) and then uses a release-store to publish these updates. Thread B repeatedly performs an acquire-load until it sees the updated pointer. The acquire ensures that once Thread B sees a non-null pointer, all writes made by Thread A (including the update to data) become visible, synchronizing the two threads. +Acquire and release are used to synchronize atomic variables. In the example below, thread A writes to memory (allocating the string and setting data) and then uses a release-store to publish these updates. Thread B repeatedly performs an acquire-load until it sees the updated pointer. 
The acquire ensures that once Thread B sees a non-null pointer, all writes made by Thread A (including the update to data) become visible, synchronizing the two threads. ```cpp // Thread A diff --git a/content/learning-paths/servers-and-cloud-computing/arm-cpp-memory-model/4.md b/content/learning-paths/servers-and-cloud-computing/arm-cpp-memory-model/4.md index 555886b6de..a1d4fa28af 100644 --- a/content/learning-paths/servers-and-cloud-computing/arm-cpp-memory-model/4.md +++ b/content/learning-paths/servers-and-cloud-computing/arm-cpp-memory-model/4.md @@ -8,7 +8,7 @@ layout: learningpathall ## How can I detect infrequent race conditions? -ThreadSanitizer, commonly referred to as `TSan`, is a concurrency bug detection tool that identifies data races in multi-threaded programs. By instrumenting code at compile time, TSan dynamically tracks memory operations, monitoring lock usage and detecting inconsistencies in thread synchronization. When it finds a potential data race, it reports detailed information to aid debugging. TSan’s overhead can be significant, but it provides valuable insights into concurrency issues often missed by static analysis. +ThreadSanitizer, commonly referred to as `TSan`, is a concurrency bug detection tool that identifies data races in multi-threaded programs. By instrumenting code at compile time, TSan dynamically tracks memory operations, monitoring lock usage and detecting inconsistencies in thread synchronization. When it finds a potential data race, it reports detailed information to aid debugging. TSan's overhead can be significant, but it provides valuable insights into concurrency issues often missed by static analysis. TSan is available through both recent `clang` and `gcc` compilers. diff --git a/content/learning-paths/servers-and-cloud-computing/arm-cpp-memory-model/_index.md b/content/learning-paths/servers-and-cloud-computing/arm-cpp-memory-model/_index.md index 14ea6585d4..a3327a3a88 100644 --- a/content/learning-paths/servers-and-cloud-computing/arm-cpp-memory-model/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/arm-cpp-memory-model/_index.md @@ -27,7 +27,7 @@ armips: - Neoverse tools_software_languages: - C++ - - ThreadSantizer (TSan) + - ThreadSanitizer (TSan) operatingsystems: - Linux - Runbook @@ -38,7 +38,7 @@ further_reading: link: https://en.cppreference.com/w/cpp/atomic/memory_order type: documentation - resource: - title: Thread Santiser Manual + title: Thread Sanitizer Manual link: Phttps://github.com/google/sanitizers/wiki/threadsanitizercppmanual type: documentation diff --git a/content/learning-paths/servers-and-cloud-computing/copilot-extension-deployment/2-cdk-services.md b/content/learning-paths/servers-and-cloud-computing/copilot-extension-deployment/2-cdk-services.md index 49993206ba..231762c060 100644 --- a/content/learning-paths/servers-and-cloud-computing/copilot-extension-deployment/2-cdk-services.md +++ b/content/learning-paths/servers-and-cloud-computing/copilot-extension-deployment/2-cdk-services.md @@ -7,7 +7,7 @@ layout: learningpathall --- ## Which AWS Services do I need? -In the first GitHub Copilot Extension Learning Path, [Build a GitHub Copilot Extension in Python](learning-paths/servers-and-cloud-computing/gh-copilot-simple), you ran a GitHub Copilot Extension on a single Linux computer, with the public URL provided by an ngrok tunnel to your localhost. 
+In the first GitHub Copilot Extension Learning Path, [Build a GitHub Copilot Extension in Python](/learning-paths/servers-and-cloud-computing/gh-copilot-simple), you ran a GitHub Copilot Extension on a single Linux computer, with the public URL provided by an ngrok tunnel to your localhost. For a production environment, you require: diff --git a/content/learning-paths/servers-and-cloud-computing/copilot-extension/1-rag.md b/content/learning-paths/servers-and-cloud-computing/copilot-extension/1-rag.md index 1a0723bcd3..3b0e5c9fbf 100644 --- a/content/learning-paths/servers-and-cloud-computing/copilot-extension/1-rag.md +++ b/content/learning-paths/servers-and-cloud-computing/copilot-extension/1-rag.md @@ -18,7 +18,7 @@ The basic flow of a RAG system includes: The benefits of a RAG system center around improved factual accuracy of responses and the ability to integrate up-to-date information, as you can update the knowledge base without retraining the model. -Most importantly, RAG lets you provide reference links to the user, showing the user where the information originates. This not only build trust with users but also serves as a pathway for further exploration of the source material. +Most importantly, RAG lets you provide reference links to the user, showing the user where the information originates. This not only builds trust with users but also serves as a pathway for further exploration of the source material. ## What are the challenges of building a RAG system? @@ -28,6 +28,6 @@ While RAG systems improve AI-generated content, they also introduce several chal * Context Length Limitations: all models, including the GitHub Copilot API, have limitations on the amount of information they can process at once, requiring careful selection and ranking of retrieved data. * Handling Conflicting Information: if your knowledge base has contradictory information, the system may struggle to reconcile them and generate a coherent response. * Scalability and Latency: querying large knowledge bases and integrating retrieval with generation can increase response time. This is another place where the choice of similarity search algorithm has an impact. -* Data Freshness and Maintenance: The knowledge base must be regularly updated to ensure the system remains accurate and relevant. +* Data Freshness and Maintenance: the knowledge base must be regularly updated to ensure the system remains accurate and relevant. For an example of a production RAG GitHub Copilot Extension, you can check out [Arm for GitHub Copilot](https://github.com/marketplace/arm-for-github-copilot) in the GitHub Marketplace. 
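+
+To make the retrieval step more concrete, the sketch below shows a deliberately simplified version of the retrieve-then-augment flow. It is not the implementation used by this extension: the embeddings are small hard-coded vectors and the knowledge base is an in-memory list, but it illustrates where similarity search, ranking, and prompt assembly fit:
+
+```python
+import math
+
+# Toy knowledge base of (text, embedding) pairs. A real RAG system would use an
+# embedding model and a vector store; these 3-dimensional vectors are placeholders.
+KNOWLEDGE_BASE = [
+    ("Arm Neoverse is designed for server and cloud workloads.", [0.9, 0.1, 0.2]),
+    ("Hugo is a static site generator written in Go.", [0.1, 0.8, 0.3]),
+    ("RAG retrieves documents to ground LLM answers in source material.", [0.2, 0.3, 0.9]),
+]
+
+def cosine(a, b):
+    # Cosine similarity between two vectors of equal length.
+    dot = sum(x * y for x, y in zip(a, b))
+    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
+    return dot / norm
+
+def retrieve(query_embedding, k=1):
+    # Rank documents by similarity to the query and keep the top k.
+    ranked = sorted(KNOWLEDGE_BASE, key=lambda doc: cosine(query_embedding, doc[1]), reverse=True)
+    return [text for text, _ in ranked[:k]]
+
+def build_prompt(question, query_embedding):
+    # Augment the user question with the retrieved context before calling the LLM.
+    context = "\n".join(retrieve(query_embedding))
+    return f"Answer using this context:\n{context}\n\nQuestion: {question}"
+
+print(build_prompt("What is RAG?", [0.25, 0.2, 0.95]))
+```
+
+In a production extension the same three steps remain, only the pieces scale up: the embeddings come from a model, the knowledge base lives in a vector store, and the assembled prompt is sent to the LLM together with reference links for the retrieved sources.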
diff --git a/content/learning-paths/servers-and-cloud-computing/copilot-extension/_index.md b/content/learning-paths/servers-and-cloud-computing/copilot-extension/_index.md index 8c50b29155..c53938f1c6 100644 --- a/content/learning-paths/servers-and-cloud-computing/copilot-extension/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/copilot-extension/_index.md @@ -37,6 +37,20 @@ operatingsystems: - Linux - macOS +further_reading: + - resource: + title: GitHub Marketplace for Copilot extensions + link: https://github.com/marketplace?type=apps&copilot_app=true/ + type: website + - resource: + title: About building Copilot Extensions + link: https://docs.github.com/en/copilot/building-copilot-extensions/about-building-copilot-extensions/ + type: documentation + - resource: + title: Copilot Extensions repository + link: https://github.com/copilot-extensions/ + type: documentation + ### FIXED, DO NOT MODIFY # ================================================================================ diff --git a/content/learning-paths/servers-and-cloud-computing/copilot-extension/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/copilot-extension/_next-steps.md index b708fce39b..921f569dd7 100644 --- a/content/learning-paths/servers-and-cloud-computing/copilot-extension/_next-steps.md +++ b/content/learning-paths/servers-and-cloud-computing/copilot-extension/_next-steps.md @@ -1,19 +1,4 @@ --- - -further_reading: - - resource: - title: GitHub Marketplace for Copilot extensions - link: https://github.com/marketplace?type=apps&copilot_app=true/ - type: website - - resource: - title: About building Copilot Extensions - link: https://docs.github.com/en/copilot/building-copilot-extensions/about-building-copilot-extensions/ - type: documentation - - resource: - title: Copilot Extensions repository - link: https://github.com/copilot-extensions/ - type: documentation - # ================================================================================ # FIXED, DO NOT MODIFY # ================================================================================ diff --git a/content/learning-paths/servers-and-cloud-computing/cplusplus_compilers_flags/4.md b/content/learning-paths/servers-and-cloud-computing/cplusplus_compilers_flags/4.md index 593c938fb7..7b1f2388b4 100644 --- a/content/learning-paths/servers-and-cloud-computing/cplusplus_compilers_flags/4.md +++ b/content/learning-paths/servers-and-cloud-computing/cplusplus_compilers_flags/4.md @@ -128,44 +128,44 @@ Average elapsed time: 0.0420332 seconds Average elapsed time: 0.0155661 seconds ``` -Here we can observe a notable performance speed up from using higher levels of optimisations. +Here we can observe a notable performance speed up from using higher levels of optimizations. -Please Note: To understand which lower level optimisation are used by `-O1`, `-O2` and `-O3` we can use the `g++ -Q --help=optimizers` command. +Please Note: To understand which lower level optimization are used by `-O1`, `-O2` and `-O3` we can use the `g++ -Q --help=optimizers` command. -### Understanding what was optimised +### Understanding what was optimized Naturally, the next question is to understand which part of your source code was optimized between the outputs above. Full optimization reports generated by compilers like GCC provide a detailed tree of reports through various stages of the optimization process. 
For beginners, these reports can be overwhelming due to the sheer volume of information they contain, covering every aspect of the code's transformation and optimization. For a more manageable overview, you can enable basic optimization information (`opt-info`) reports using specific arguments such as `-fopt-info-vec`, which focuses on vectorization optimizations. The `-fopt-info` flag can be customized by changing the info bit to target different types of optimizations, making it easier to pinpoint specific areas of interest. -First, to see what part of our source code was optimised between levels 1 and 2 we can run the following commands to see if our vectorisable loop was indeed vectorised. +First, to see what part of our source code was optimized between levels 1 and 2, we can run the following commands to see if our vectorizable loop was indeed vectorized. ```bash g++ -O1 vectorizable_loop.cpp -o level_1 -fopt-info-vec ``` -Running the `-O1` flag led showed no terminal output indicating no vectorisation was performed. Next, run the command below with the `-O2` flag. +Running the `-O1` flag showed no terminal output, indicating that no vectorization was performed. Next, run the command below with the `-O2` flag. ```bash g++ -O2 vectorizable_loop.cpp -o level_2 -fopt-info-vec ``` -This time the `-O2` flag enables our loop to be vectorised as can be seen from the output below. +This time the `-O2` flag enables our loop to be vectorized, as can be seen from the output below. ```output vectorizable_loop.cpp:13:30: optimized: loop vectorized using 16 byte vectors /usr/include/c++/13/bits/stl_algobase.h:930:22: optimized: loop vectorized using 16 byte vectors ``` -To see what optimisations were performed and missed between level 2 and level 3, we could direct the terminal output from all optimisations (`-fopt-info`) to a text file with the commands below. +To see what optimizations were performed and missed between level 2 and level 3, we could direct the terminal output from all optimizations (`-fopt-info`) to a text file with the commands below. ```bash g++ -O2 vectorizable_loop.cpp -o level_2 -fopt-info 2>&1 | tee level2.txt g++ -O3 vectorizable_loop.cpp -o level_3 -fopt-info 2>&1 | tee level3.txt ``` -Comparing the outputs between different levels can highlight where in your source code opportunities to optimise code where missed, for example with the `diff` command. This can help you write source code that is more likely to be optimised. However, source code modifications are out of scope for this learning path and we will leave it to the reader to dive into the differences if they wish to learn more. +Comparing the outputs between different levels, for example with the `diff` command, can highlight where opportunities to optimize your source code were missed. This can help you write source code that is more likely to be optimized. However, source code modifications are out of scope for this learning path, and we will leave it to the reader to dive into the differences if they wish to learn more.
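+
+If you prefer to review only the differences between the two reports programmatically, a small helper can do the comparison for you. This is an optional sketch, assuming `level2.txt` and `level3.txt` were produced by the commands above; it is roughly equivalent to running `diff -u` on the two files:
+
+```python
+# Print a unified diff of the -O2 and -O3 optimization reports.
+# Assumes level2.txt and level3.txt exist in the current directory.
+import difflib
+
+with open("level2.txt") as f2, open("level3.txt") as f3:
+    report_o2 = f2.readlines()
+    report_o3 = f3.readlines()
+
+for line in difflib.unified_diff(report_o2, report_o3, fromfile="level2.txt", tofile="level3.txt"):
+    print(line, end="")
+```
+
+Lines prefixed with `-` appear only in the `-O2` report and lines prefixed with `+` appear only in the `-O3` report, which makes it easier to spot loops that were only optimized at the higher level.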
## Target balanced performance diff --git a/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/_index.md b/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/_index.md index d474f8e6fd..bd90c7923a 100644 --- a/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/_index.md @@ -1,5 +1,5 @@ --- -title: Glibc + Linux + FVP +title: Build and test a custom Linux image on an FVP draft: true cascade: @@ -8,32 +8,24 @@ cascade: minutes_to_complete: 60 who_is_this_for: > - Developers who wish to run a Linux system (optionally using a custom kernel and - a C library) on an Arm Fixed Virtual Platform (FVP) model. For example, this - guide might be useful if you want to test patches for the Linux kernel or Glibc. + This is an advanced topic for developers who wish to run a Linux system (optionally using a custom kernel and + a C library) on an Arm Fixed Virtual Platform (FVP) model. This learning path might be useful to follow if you want to test patches for the Linux kernel or Glibc prior to having hardware available. -learning_objectives: +learning_objectives: - Build the Linux kernel. - Install the Shrinkwrap tool, build firmware for the FVP and run it. - Configure and boot a Linux system on the FVP. - - Configure guest OS to make running Glibc tests easier. + - Configure guest OS and run Glibc tests. - Build Glibc and run tests on the system running on the FVP. prerequisites: - - An AArch64 or x86 host running a Linux system. - - GCC cross toolchain for the `aarch64-none-linux-gnu` target. - - Docker. - - Git to checkout sources. - - Make to build the tools. - - Bash for your shell. - - Python 3.x and `pip` to create a Python virtual environment. - - Common tools like `wget`, `unxz`, `truncate`. + - An AArch64 or x86_64 Linux machine. The instructions in this Learning Path have been tested on AArch64 Linux machine running Ubuntu 24.04. author: Yury Khrustalev ### Tags skilllevels: Advanced -subjects: Architecture Enablement +subjects: Performance and Architecture armips: - AArch64 tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/conventions.md b/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/conventions.md index 3d93c460c3..8392333f7a 100644 --- a/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/conventions.md +++ b/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/conventions.md @@ -1,44 +1,49 @@ --- -title: Conventions +title: Development environment weight: 2 ### FIXED, DO NOT MODIFY layout: learningpathall --- -For a few things you may need root access on your host system to do minimal setup and -install packages that are described in the following sections. +In this section, you will install packages and review the directory structure used throughout the learning path. -## Naming conventions +## Install dependencies -In the following sections we use host system to checkout sources and build various -tools and we also make configuration changes to the guest system that will run on -the Arm [Fixed Virtual Platform (FVP)][1] model. +Run the following commands to make sure the necessary dependencies are installed on your host machine: -Before we begin, it's important to describe the specifics of our setup making it easier -to write commands and code examples. 
Wherever possible we use generic commands and code -examples but in certain places we have to use hardcoded values and absolute paths. +```bash +sudo apt update && sudo apt install -y \ + git make bash \ + flex bison build-essential libssl-dev bc libelf-dev libncurses-dev \ + python3 python3-pip python-is-python3 python3-venv wget xz-utils coreutils +``` -Table 1. Directory layout +Install GCC to cross compile Linux applications: +```bash +sudo apt install gcc-aarch64-linux-gnu -y +``` + +{{% notice Note%}} +The GCC cross toolchain installation directory contains everything a cross toolchain would need, for example, the path to the `gcc` tool would be `/usr/bin/aarch64-linux-gnu-gcc`. +{{% /notice %}} -| Path | Description | -|--------------------------------------|--------------------------------------------| -| `/path/to/cross/gcc` | GCC cross toolchain installation directory | -| `/home/user` | Home directory of your host non-root user | -| `/home/user/workspace` | Workspace directory | -| `/home/user/workspace/linux` | Folder with the Linux kernel sources | -| `/home/user/workspace/linux-headers` | Directory for installing kernel headers | -| `/home/user/workspace/linux-build` | Folder for the Linux kernel build output | -| `/home/user/workspace/glibc` | Foldr for the Glibc sources | -| `/home/user/workspace/glibc-build` | Directory foir the Glibc build output | +Install Docker, refer to the [Docker install guide](/install-guides/docker/). +## Directory Structure -We presume that the GCC cross toolchain installation directory contains everything a -cross toolchain would need, for example, the path to the `gcc` tool would be -`/path/to/cross/gcc/bin/aarch64-none-linux-gnu-gcc`. +In the following sections you will checkout sources and build various tools on your Linux host machine. Before you begin, lets look at the directory structure of where you will build the different parts of software stack needed to run this learning path. + +Table 1. Directory layout -In the next steps we create a Python virtual environment. It doesn't matter where -it is located, but to avoid ambiguity let's presume it is in `~/workspace/venv`. +| Path | Description | +|--------------------------------------|---------------------------------------| +| `$HOME` | Home directory of your host non-root user | +| `$HOME/workspace` | Workspace directory | +| `$HOME/workspace/linux` | Folder with the Linux kernel sources | +| `$HOME/workspace/linux-headers` | Directory for installing kernel headers | +| `$HOME/workspace/linux-build` | Folder for the Linux kernel build output | +| `$HOME/workspace/glibc` | Folder for the Glibc sources | +| `$HOME/workspace/glibc-build` | Directory for the Glibc build output | -[1]: https://developer.arm.com/downloads/-/arm-ecosystem-fvps diff --git a/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/fvp.md b/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/fvp.md index 0b137ec265..0cfd00e2ac 100644 --- a/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/fvp.md +++ b/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/fvp.md @@ -1,5 +1,5 @@ --- -title: Boot Linux on FVP +title: Boot Linux on the FVP weight: 5 ### FIXED, DO NOT MODIFY @@ -8,30 +8,29 @@ layout: learningpathall ## Introduction -During this step, we set up everything for the FVP to run and then boot the Linux system -using the kernel and the root file system that we prepared earlier. 
+During this step, you will set up everything for the FVP to run and then boot the Linux system +using the kernel and the root file system that you prepared earlier in the earlier sections. -Arm [Fixed Virtual Platform (FVP)][1] is a model that allows you to access functionality -of Armv8.x or v9.x hardware. We use the Armv-A Base Rev C Architecture Envelope Model (AEM). +Arm [Fixed Virtual Platform (FVP)](https://developer.arm.com/downloads/-/arm-ecosystem-fvps) is a model that allows you to access functionality +of Armv8.x or v9.x hardware. You will use the Armv-A Base Rev C Architecture Envelope Model (AEM). In addition to the model itself, you also need the device tree and the firmware. To simplify -building these components, we use a tool called [Shrinkwrap][2]. +building these components, you will use a tool called [Shrinkwrap](https://gitlab.arm.com/tooling/shrinkwrap). -This tool comes with a detailed [user guide][3] that covers all of its features and -configuration options. Here, we provide a short quick-start guide. +This tool comes with a detailed [user guide](https://shrinkwrap.docs.arm.com/en/latest/) that covers all of its features and +configuration options. You will also leverage this tool to run and boot the images you prepared on the FVP. -We also rely on a Docker container to facilitate the building of the firmware and +Shrinkwrap can be used in a few different ways. One of the ways is to use a Docker container to facilitate the building of the firmware and running the FVP. This helps avoid installing all the dependencies on your host system. -Shrinkwrap can be used without Docker but it requires extra steps to ensure that all -dependencies are of the right version and are installed correctly. ## Install Shrinkwrap -First, we install prerequisites in a Python virtual environment using Python 3.x: +First, install prerequisites in a Python virtual environment using Python 3.x: ```bash -python -m venv ~/workspace/venv -source ~/workspace/venv/bin/activate +cd $HOME +python -m venv $HOME/workspace/venv +source $HOME/workspace/venv/bin/activate pip install -U pip setuptools wheel pip install pyyaml termcolor tuxmake ``` @@ -41,7 +40,7 @@ repository. Run this command in the workspace directory: ```bash git clone https://git.gitlab.arm.com/tooling/shrinkwrap.git -export PATH=${PATH}:$(pwd)/shrinkwrap/shrinkwrap +export PATH=${PATH}:${HOME}/workspace/shrinkwrap/shrinkwrap ``` Putting Shrinkwrap's main executable on your `PATH` is all you need to install the tool. @@ -52,12 +51,15 @@ is activated, then run this command: shrinkwrap --version ``` -## Build firmware for FVP +You should see a version printed. -Before proceeding, ensure that Docker is installed and usable. Follow the installation -instructions for your distro [here][4]. +```output +shrinkwrap v1.0.0 +``` + +## Build firmware for the FVP -Now, we use the Shrinkwrap tool to build the firmware, the third essential ingredient in our +Now, you can use the Shrinkwrap tool to build the firmware, the third essential ingredient in our setup. The following step needs to be done once although you will need to repeat it if you want to rebuild the firmware: @@ -78,17 +80,17 @@ the components including the device tree for the FVP. At this point, we have everything required to boot our system. Shrinkwrap uses so called overlay configuration files. The following file instructs Shrinkwrap to connect all the pieces together and locate the kernel image, and rootfs. It can also be used to tweak any of the FVP -parameters. 
Save this file as `~/workspace/aarch64.yaml`: +parameters. Create a file in $HOME/workspace directory called `aarch64.yaml` using a text editor of your choice. Copy the contents shown below into the file: ```yaml run: rtvars: ROOTFS: - value: /home/user/workspace/rootfs.img + value: ~/workspace/rootfs.img CMDLINE: value: ip=dhcp kpti=off root=/dev/vda2 console=ttyAMA0 KERNEL: - value: /home/user/workspace/linux-build/arch/arm64/boot/Image + value: ~/workspace/linux-build/arch/arm64/boot/Image params: -C bp.hostbridge.userNetworking: 1 -C bp.hostbridge.userNetPorts: 8022=22,8123=8123 @@ -108,7 +110,7 @@ The most important parts in this configuration file are: You can add more ports as needed. The FVP has many parameters that can be tweaked in this config by adding a `-C param: value` -line to the `params` section. Refer to the [Fast Models Fixed Virtual Platforms Reference Guide][5] +line to the `params` section. Refer to the [Fast Models Fixed Virtual Platforms Reference Guide](https://developer.arm.com/documentation/100966/latest/Getting-Started-with-Fixed-Virtual-Platforms/Configuring-the-model). for more details. ## Run FVP with Shrinkwrap @@ -116,7 +118,7 @@ for more details. To run the FVP using Docker, execute the following command: ```bash -shrinkwrap run ns-edk2.yaml --overlay ~/workspace/aarch64.yaml +shrinkwrap run ns-edk2.yaml --overlay $HOME/workspace/aarch64.yaml ``` At first, Shrinkwrap starts a Docker container and runs the FVP in it. At the beginning @@ -156,9 +158,6 @@ uname -a cat /proc/cpuinfo ``` -We will do more setup during the next step. This additional setup is optional but it -helps prepare our guest system for running Glibc tests and doing other complex tasks. - ## Powering down You can always press `Ctrl+]` to stop Shrinkwrap in the terminal where Shrinkwrap is @@ -171,8 +170,6 @@ from the root console of your guest system, for example: ssh root@172.17.0.2 -p 8022 poweroff ``` -[1]: https://developer.arm.com/downloads/-/arm-ecosystem-fvps -[2]: https://gitlab.arm.com/tooling/shrinkwrap -[3]: https://shrinkwrap.docs.arm.com/en/latest/ -[4]: https://docs.docker.com/engine/install/ -[5]: https://developer.arm.com/documentation/100966/latest/Getting-Started-with-Fixed-Virtual-Platforms/Configuring-the-model +Continue to the next section for additional setup. The remaining steps are optional but it +helps prepare our guest system for running Glibc tests and doing other complex tasks. + diff --git a/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/glibc.md b/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/glibc.md index c4e25c146f..5491039adf 100644 --- a/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/glibc.md +++ b/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/glibc.md @@ -1,5 +1,5 @@ --- -title: Glibc tests on FVP +title: Glibc tests on the FVP weight: 7 ### FIXED, DO NOT MODIFY @@ -8,8 +8,8 @@ layout: learningpathall ## Prepare kernel headers -For this step you need the GCC cross-toolchain for the `aarch64-none-linux-gnu` target. -We can use the same toolchain as we used for building the kernel. +For this step you need the GCC cross-toolchain for the `aarch64-linux-gnu` target. +You can use the same toolchain as we used for building the kernel. Since we are going to use our Glibc on a system running a specific version of the kernel, we should build the Glibc using the kernel headers of the same version. The Glibc build @@ -18,31 +18,28 @@ headers from a specific directory. 
To ensure that we use correct headers for the we need to install them from source. We presume that the kernel source is in the `linux` subfolder, and we will install the -headers in the `linux-headers` subfolder. To do this, run the following commands in the -workspace directory: +headers in the `linux-headers` subfolder. First, you have to set a few environment variables. +To do this, run the following commands: ```bash -# Make sure that cross GCC is on the PATH -export PATH=/path/to/cross/gcc/bin:${PATH} - -# Specify target architecture +cd $HOME/workspace export ARCH=arm64 - -# Specify cross compiler -export CROSS_COMPILE=aarch64-none-linux-gnu- - -# Install headers make -C linux headers_install INSTALL_HDR_PATH=$(pwd)/linux-headers ``` -You should see kernel headers in the `/home/user/workspace/linux-headers/include` folder now. +{{% notice %}} +If you are running an x86_64 Linux host, the `CROSS_COMPILE` flag needs to be set. Example: export CROSS_COMPILE=aarch64-none-linux-gnu- +{{% /notice %}} + +You should see kernel headers in the `$HOME/workspace/linux-headers/include` folder now. We will use this path during the next step. ## Get Glibc sources and build for AArch64 target -In the workspace directory, clone the Glibc Git repository: +Clone the Glibc Git repository: ```bash +cd $HOME/workspace git clone git://sourceware.org/git/glibc.git ``` @@ -50,17 +47,15 @@ To make the following command simpler, let's introduce the `CROSS` variable (not at the end): ```bash -CROSS=/path/to/cross/gcc/bin/aarch64-none-linux-gnu- +export CROSS=/usr/bin/aarch64-linux-gnu- ``` -Now configure the cross-build for the `aarch64-none-linux-gnu` target: +Now configure the cross-build for the `aarch64-linux-gnu` target: ```bash -# Create build folder: mkdir glibc-build cd glibc-build -# Configure Glibc: LC_ALL=C BUILD_CC=gcc \ CC=${CROSS}gcc CXX=${CROSS}g++ \ NM=${CROSS}nm READELF=${CROSS}readelf \ @@ -68,9 +63,9 @@ AR=${CROSS}ar GPROF=${CROSS}gprof \ OBJDUMP=${CROSS}objdump OBJCOPY=${CROSS}objcopy \ RANLIB=${CROSS}ranlib \ ../glibc/configure --prefix=/usr \ - --host=aarch64-none-linux-gnu \ + --host=aarch64-linux-gnu \ --enable-hardcoded-path-in-tests \ - --with-headers=/home/user/workspace/linux-headers/include + --with-headers=$HOME/workspace/linux-headers/include ``` Notice the path to the kernel headers in the last parameter in the `configure` command and @@ -84,7 +79,7 @@ make -j$(nproc) ## Run tests on FVP -If you are using an AArch64 host, you can run Glibc tests both on your host and on the FVP +As you are using an AArch64 host, you can run Glibc tests both on your host and on the FVP from the same build tree. Before we run some tests, we need to make sure that we have two important prerequisites in place. @@ -139,14 +134,17 @@ the easiest way to proceed is as follows: dbclient -p 8022 "$@" ``` - * Make it executable using the `chmod u+x` command - * Save this script in one of the directories in your `PATH`. + Make it executable using the `chmod u+x` command, and save this script in one of the directories in your `PATH`, for example `/usr/bin`: + +```bash +chmod u+x ussh && sudo mv ussh /usr/bin/ussh +``` To run a single test, use this command: ```bash make test t=misc/tst-aarch64-pkey \ - test-wrapper="/home/user/workspace/glibc/scripts/cross-test-ssh.sh --ssh ussh fvp" + test-wrapper="/home/user/glibc/scripts/cross-test-ssh.sh --ssh ussh fvp" ``` Let's see what we have here. 
 The `test` target will build (or rebuild) one test and all
@@ -182,9 +180,9 @@ original exit status 0
 To run a group of tests, use the following command with the `check` target:
 
 ```bash
-make check -C /home/user/workspace/glibc/argp \
+make check -C $HOME/workspace/glibc/argp \
   objdir=`pwd` \
-  test-wrapper="/home/user/workspace/glibc/scripts/cross-test-ssh.sh --ssh ussh fvp"
+  test-wrapper="$HOME/workspace/glibc/scripts/cross-test-ssh.sh --ssh ussh fvp"
 ```
 
 In this instance, we are building and running tests from the `argp` folder. We use the `-C`
@@ -196,10 +194,12 @@ To run all the tests, simply do:
 
 ```bash
 make check \
-  test-wrapper="/home/user/workspace/glibc/scripts/cross-test-ssh.sh --ssh ussh fvp"
+  test-wrapper="$HOME/workspace/glibc/scripts/cross-test-ssh.sh --ssh ussh fvp"
 ```
 
 Note that this will take a considerable amount of time. Also, notice that we are not using a parallel `make` command for running tests on the FVP. The reason is that the Fast Models simulation code runs primarily on a single thread, and running tests in parallel would not speed up the execution.
+
+You have now reached the end of this Learning Path: you have set up a Linux system on the FVP and used it to run the Glibc tests.
diff --git a/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/kernel.md b/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/kernel.md
index d39120d358..9f2f57c253 100644
--- a/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/kernel.md
+++ b/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/kernel.md
@@ -8,14 +8,13 @@ layout: learningpathall
 
 ## Obtain kernel source
 
-The Linux kernel image is the first essential components that we need. We are going
-to build it from source.
+The Linux kernel image is the first essential component that you need. You are going to build it from source.
 
-There are various ways to obtain the sources for a particular version of the
-Linux kernel that you want to use. Here, as an example, we obtain a stable
+There are various ways to obtain the sources for a particular version of the Linux kernel that you want to use. As an example, let's check out a stable
 version from the mainline repository:
 
 ```bash
+cd $HOME/workspace
 git clone https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
 pushd linux
 git checkout v6.13 -b release/6.13
@@ -27,29 +26,26 @@ available.
 Using a stable kernel version is a good starting point. When everything is up and
 running, you can switch to the version of the kernel that you are actually
-interested in.
+interested in running.
 
 ## Configure and build kernel
 
-The following commands will configure and build the Linux kernel image. All the
-build output, including the binary that we intend to use later, will be put in
+Before moving on, export some environment variables that point to key directories and set build options.
+All the build output, including the binary that you intend to use later, will be put in
 the `linux-build` subfolder. Run the following commands in the workspace directory:
 
 ```bash
-# Make sure that cross GCC is on the PATH
-export PATH=/path/to/cross/gcc/bin:${PATH}
-
-# Use out-of-tree build for kernel
 export KBUILD_OUTPUT="$(pwd)/linux-build"
-
-# Specify target architecture
 export ARCH=arm64
+```
 
-# Specify cross compiler
-export CROSS_COMPILE=aarch64-none-linux-gnu-
+{{% notice %}}
+If you are running an x86_64 host, you will need to set the `CROSS_COMPILE` environment variable to point to your GCC cross-compile toolchain.
For example: export CROSS_COMPILE=aarch64-none-linux-gnu- +{{% /notice %}} -# Build kernel image +Next, build the kernel image: +```bash make -C linux mrproper make -C linux defconfig make -C linux Image -j $(nproc) @@ -69,6 +65,3 @@ When the build completes, check that the kernel image binary is present: ls linux-build/arch/arm64/boot/Image ``` -If any of the described steps result in an error message, most likely some of the -build dependencies are not installed. You should be able to obtain them from -your distro's package manager. diff --git a/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/rootfs.md b/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/rootfs.md index fcdead2a3d..cf13a1a021 100644 --- a/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/rootfs.md +++ b/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/rootfs.md @@ -1,37 +1,38 @@ --- -title: Root file system +title: Build the Root file system weight: 4 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Download and unpack rootfs +## Download and unpack root file system The root file system (or rootfs for short) is the second essential component -that we need. The root file system is a collection of files that are essential +that you will need. The root file system is a collection of files that are essential for a Linux system to function. Usually, it also includes various tools that make using your system more convenient. -Since we provide our own kernel, there isn't much that is required from a +Since you are providing your own kernel, there isn't much that is required from a rootfs. All you need is for it to be built for the AArch64 target and contain the tools that you require. -To speed things up for this learning path, we use a readily available rootfs -for the [Void Linux][1] distro. There are other options for obtaining a working +To speed things up for this learning path, you will use a readily available rootfs +for the [Void Linux](https://voidlinux.org/) distro. There are other options for obtaining a working root file system, but the rest of this learning path assumes that you are using the Void Linux distribution. Download the image: ```bash +cd $HOME/workspace wget https://repo-default.voidlinux.org/live/20250202/void-rpi-aarch64-20250202.img.xz ``` Note that at the time when you are reading this, there might be a newer version available. Let's unpack and resize the image. The added size determines how much free disk space -we will have in our guest system: +you will have in your guest system: ```bash unxz --keep void-rpi-aarch64-20250202.img.xz @@ -39,7 +40,7 @@ mv void-rpi-aarch64-20250202.img rootfs.img truncate -s +2G rootfs.img ``` -Here we add 2 GiB of free space. Of course, the file system in this image is not actually +Here you added 2 GiB of free space. Of course, the file system in this image is not actually resized at this point. Void Linux will be able to do it automatically during the first boot. @@ -47,4 +48,3 @@ Note that when we run our system, the rootfs image file will be modified. If som goes wrong, the image might be corrupted, and you might not be able to boot from it again. That's why it's recommended to create a backup copy after the initial setup. 
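+
+One way to take that backup is a plain file copy on the host once the guest has been powered down. This is only a sketch; the `.bak` name is an arbitrary choice:
+
+```bash
+# Run on the host while the FVP is not using the image.
+# Keep a pristine copy so a corrupted rootfs can be rolled back later.
+cp rootfs.img rootfs.img.bak
+
+# To recover after a bad boot, restore the image from the backup:
+# cp rootfs.img.bak rootfs.img
+```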
-[1]: https://voidlinux.org/ diff --git a/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/setup.md b/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/setup.md index 019c2e61e9..9fa2ca3c3d 100644 --- a/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/setup.md +++ b/content/learning-paths/servers-and-cloud-computing/glibc-linux-fvp/setup.md @@ -6,9 +6,11 @@ weight: 6 layout: learningpathall --- -## A few tips about Void Linux +In this section, you will configure the guest system to make it easier running more complex tasks. -For a detailed guide on Void Linux, refer to the [documentation][1]. +## Void Linux basics + +For a detailed guide on Void Linux, refer to the [documentation](https://docs.voidlinux.org/). Commands in this section are executed on the guest system. @@ -19,20 +21,16 @@ the default shell for a user, run this command: chsh -s /bin/bash root ``` -If you need to install some package, use the following commands: +You install packages using `xbps-install`. The `-S` option updates the repository cache. You need to run this once before installing additional packages. For example, install `vim` with the following commands: ```bash -# Update repository cache (you need to run this once) xbps-install -y -S - -# Install vim package (for example): xbps-install -y vim ``` -We can add a bit of automation to speed up configuring your system from scratch: +You can add a bit of automation to speed up configuring your system from scratch: ```bash -# Install packages: required and optional required=(nfs-utils sv-netmount rpcbind) optional=(vim binutils make strace python3) for p in "${required[@]}" "${optional[@]}"; do @@ -51,7 +49,6 @@ to free up some space. Since we use our own kernel and firmware, we can safely delete the following packages: ```bash -# Remove unused packages: unused=(rpi-firmware rpi-kernel) for p in "${unused[@]}"; do test -f "/etc/xbps.d/disable-${p}.conf" || { @@ -61,7 +58,7 @@ for p in "${unused[@]}"; do done ``` -Here, we also mask these packages to prevent them from being installed automatically +Here, you also mask these packages to prevent them from being installed automatically during system updates, saving time in the process. The last two code snippets are written so that you can re-run them multiple times. @@ -80,17 +77,15 @@ Commands in this section are executed on the guest system. Our main interaction with the guest system will be via SSH. Running software on an FVP is slower than on real hardware, so we want to reduce the overhead. One way to do this is by replacing the preinstalled OpenSSH server with a more lightweight -alternative, such as [Dropbear][2]. +alternative, such as [Dropbear](https://matt.ucc.asn.au/dropbear/dropbear.html). 
First, install Dropbear and enable corresponding service: ```bash -# Install Dropbear server xbps-query -l | grep -w dropbear > /dev/null || { xbps-install -y dropbear } -# Enable Dropbear SSH server service test -h /var/service/dropbear || { ln -s /etc/sv/dropbear /var/service/ } @@ -99,7 +94,6 @@ test -h /var/service/dropbear || { Now, disable the OpenSSH server: ```bash -# Disable OpenSSH server service test -h /var/service/sshd && { sv stop sshd rm -vf /var/service/sshd @@ -110,7 +104,6 @@ Finally, create a simple service that prints a message to the system log when th system is ready for incoming SSH connections: ```bash -# Create service to indicate SSH readiness test -h /var/service/hello || { mkdir -p /etc/sv/hello cat < /etc/sv/hello/run @@ -146,7 +139,7 @@ to use the Dropbear client and configure SSH keys for it. First, install the Dropbear client: ```bash -sudo apt install dropbear-bin +sudo apt -y install dropbear-bin ``` To avoid typing the guest system's IP address every time, add it to `/etc/hosts`: @@ -171,29 +164,30 @@ entering a password: dbclient -l root -p 8022 fvp ``` -## Non-root user +## Set up a non-root user Commands in this section are executed on the guest system. Creating a non-root user in the guest system can be practical. Additionally, we will copy the same SSH key used for the root user to avoid setting up different key pair and having to alternate between them. For a non-root user in the guest system we -will use the same username `user` as on your host system. +will use the username `user`. + +SSH as root into the guest system running on FVP and execute these commands: ```bash -# Create user -id -u user 2> /dev/null || { - useradd -m -s /bin/bash user - test -d /home/user/.ssh || { - mkdir -p /home/user/.ssh - chown user:user /home/user/.ssh - chmod 0700 /home/user/.ssh - } - test -f /root/.ssh/authorized_keys && { - cp /root/.ssh/authorized_keys /home/user/.ssh/authorized_keys - chown user:user /home/user/.ssh/authorized_keys - chmod 0600 /home/user/.ssh/authorized_keys - } +id -u user 2> /dev/null || useradd -m -s /bin/bash user + +test -d /home/user/.ssh || { + mkdir -p /home/user/.ssh + chown user:user /home/user/.ssh + chmod 0700 /home/user/.ssh +} + +test -f /root/.ssh/authorized_keys && { + cp /root/.ssh/authorized_keys /home/user/.ssh/authorized_keys + chown user:user /home/user/.ssh/authorized_keys + chmod 0600 /home/user/.ssh/authorized_keys } ``` @@ -204,7 +198,7 @@ without having to enter a password: dbclient -p 8022 fvp ``` -## Shared workspace +## Configure a shared workspace Commands in this section are executed first on the host system and then on the guest. @@ -219,7 +213,7 @@ for example, by the Glibc tests, so it is not suitable for our use case. On your host, install the NFS server (this example is for Debian or Ubuntu): ```bash -sudo apt install nfs-kernel-server +sudo apt -y install nfs-kernel-server sudo systemctl disable nfs-kernel-server ``` @@ -275,18 +269,16 @@ and the host systems. 
 ## More configuration
 
-The following changes to the guest system configuration help reduce host memory usage by the
-FVP:
+Disabling Address Space Layout Randomization (ASLR) in the guest system helps reduce host memory usage by the FVP:
 
 ```bash
-# Disable ASLR (helps reduce host memory usage by the FVP process):
 mkdir -p /etc/sysctl.d
 test -f /etc/sysctl.d/01-disable-aslr.conf || {
   echo "kernel.randomize_va_space = 0" > /etc/sysctl.d/01-disable-aslr.conf
 }
 ```
 
-These changes also help prevent the OOM (Out of Memory) killer from unnecessarily terminating
+The following change also helps prevent the OOM (Out of Memory) killer from unnecessarily terminating
 processes on the guest system when they consume too much memory:
 
 ```bash
@@ -296,9 +288,5 @@ test -f /etc/sysctl.d/02-disable-vm-overcommit.conf || {
 }
 ```
 
-We are now ready to do something substantial with our Linux system running on the FVP. Let's
-build the Glibc from source and run its tests on the FVP.
-
+You are now ready to put your Void Linux system to the test. Let's build Glibc from source and run its tests on the FVP.
 
-[1]: https://docs.voidlinux.org/
-[2]: https://matt.ucc.asn.au/dropbear/dropbear.html
diff --git a/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/1.md b/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/1.md
index 1de2d2050c..58c9ebe668 100644
--- a/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/1.md
+++ b/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/1.md
@@ -1,57 +1,82 @@
 ---
-title: Introduction to performance libraries
+title: "Introduction to Libraries"
 
 weight: 2
 
 ### FIXED, DO NOT MODIFY
 layout: learningpathall
 ---
 
-## Introduction to performance libraries
+## Types of Library
 
-The C++ Standard Library provides a collection of classes and functions that are essential for everyday programming tasks, such as data structures, algorithms, and input/output operations. It is designed to be versatile and easy to use, ensuring compatibility and portability across different platforms. However as a result of this portability, standard libraries introduce some limitations. Performance sensitive applications may wish to take maximum advantage of the hardware's capabilities - this is where performance libraries come in.
+C++ libraries generally fall into two major categories, each serving different needs. This section walks you through both the standard library and performance libraries, and outlines the purpose and characteristics of each.
 
-Performance libraries are specialized for high-performance computing tasks and are often tailored to the microarchitecture of a specific processor. These libraries are optimized for speed and efficiency, often leveraging hardware-specific features such as vector units to achieve maximum performance. Performance libraries are crafted through extensive benchmarking and optimization, and can be domain-specific, such as genomics libraries, or produced by Arm for general-purpose computing. For example, OpenRNG focuses on generating random numbers quickly and efficiently, which is crucial for simulations and scientific computations, whereas the C++ Standard Library offers a more general-purpose approach with functions like `std::mt19937` for random number generation.
+### Standard C++ Library -Performance libraries for Arm CPUs, such as the Arm Performance Libraries (APL), provide highly optimized mathematical functions for scientific computing. An analogous library for accelerating routines on a GPU is cuBLAS for NVIDIA GPUs. These libraries can be linked dynamically at runtime or statically during compilation, offering flexibility in deployment. They are designed to support multiple versions of the Arm architecture, including those with NEON and SVE. Generally, minimal source code changes are required to use these libraries, making them suitable for porting and optimizing applications. +The C++ Standard Library provides a collection of classes, functions, and templates that are defined by the C++ standard and are essential for everyday programming, such as: -### How can I choose the right version of a performance library? +* Data structures. +* Algorithms. +* Input/output operations. +* Utility functions. -Performance libraries are often distributed with multiple formats to support various use cases. +### Trade-offs between versatility and performance -- **ILP64** uses 64 bits for representing integers, which are often used for indexing large arrays in scientific computing. In C++ source code we use the `long long` type to specify 64-bit integers. +The C++ Standard Library is designed to be versatile and easy to use, ensuring compatibility and portability across different platforms. This portability comes at a cost, however, and standard libraries have some limitations. Designers of performance-sensitive applications might wish to take advantage of the hardware's full capabilities, and where they might be unable to do so through standard libraries, they can instead implement performance libraries that can bring these performance optimizations into effect. -- **LP64** uses 32 bits to present integers which are more common in general purpose applications. +### Benefits of Performance libraries -- **Open Multi-process** (OpenMP) is a programming interface for paralleling workloads across many CPU cores across multiple platforms (i.e. x86, AArch64 etc.). Programmers interact primarily through compiler directives, such as `#pragma omp parallel` indicating which section of source code can be run in parallel and which sections require synchronization. +Performance libraries are specialized for high-performance computing tasks and are often tailored to the microarchitecture of a specific processor. These libraries are optimized for speed and efficiency, often leveraging hardware-specific features such as vector units to achieve maximum performance. -Arm performance libraries like the x86 equivalent, Open Math Kernel Library (MKL) provide optimized functions for both ILP64 and LP64 as well as OpenMP or single threaded implementations. Further, the interface libraries are available as shared libraries for dynamic linking (i.e. `*.so`) or static linking (i.e. `*.a`). +Crafted through extensive benchmarking and optimization, performance libraries can be domain-specific - such as genomics libraries - or for general-purpose computing. For example, OpenRNG focuses on generating random numbers quickly and efficiently, which is crucial for simulations and scientific computations, whereas the C++ Standard Library offers a more general-purpose approach with functions such as `std::mt19937` for random number generation. -### Why do multiple performance Libraries exist? 
+Performance libraries for Arm CPUs - such as the Arm Performance Libraries (APL) - provide highly optimized mathematical functions for scientific computing. An analogous library for accelerating routines on a GPU is cuBLAS, which is available for NVIDIA GPUs. -A natural source of confusion stems from the plethora of similar seeming performance libraries. For example, OpenBLAS and NVIDIA Performance Libraries (NVPL) both have their own implementations for basic linear algebra subprograms (BLAS). This begs the question which one should a developer use? +These libraries can be linked dynamically at runtime or statically during compilation, offering flexibility in deployment. They are designed to support multiple versions of the Arm architecture, including those with NEON and SVE. Generally, only minimal source code changes are required to use these libraries, making them ideal for porting and optimizing applications. -Multiple performance libraries coexist to cater to the diverse needs of different hardware architectures and applications. For instance, Arm performance libraries are optimized for Arm CPUs, leveraging the unique instruction sets and power efficiency. On the other hand, NVIDIA performance libraries for Grace CPUs are tailored to maximize the performance of NVIDIA's hardware. +### How do I choose the right version of a performance library? -- **Hardware Specialization** Some libraries are designed to be cross-platform, supporting multiple hardware architectures to provide flexibility and broader usability. For example, the OpenBLAS library supports both Arm and x86 architectures, allowing developers to use the same library across different systems. +Performance libraries are often distributed in multiple formats to support various use cases: -- **Domain-Specific Libraries**: Libraries are often created to handle specific domains or types of computations more efficiently. For instance, libraries like cuDNN are optimized for deep learning tasks, providing specialized functions that significantly speed up neural network training and inference. +- **ILP64** uses 64 bits for representing integers, which are often used for indexing large arrays in scientific computing. In C++ source code, one uses the `long long` type to specify 64-bit integers. -- **Commercial Libraries**: Alternatively, some highly performant libraries require a license to use. This is more common in domain specific libraries such as computational chemistry or fluid dynamics. +- **LP64** uses 32 bits to represent integers which are more common in general-purpose applications. + +- **Open Multi-Processing** (OpenMP) is a cross-platform programming interface for parallelizing workloads across many CPU cores, such as x86 and AArch64. Programmers interact primarily through compiler directives, such as `#pragma omp parallel` indicating which section of source code can be run in parallel and which sections require synchronization. + +Arm Performance Libraries, in common with their x86 equivalent, Open Math Kernel Library (MKL), provide optimized functions for both ILP64 and LP64, as well as for OpenMP or single-threaded implementations. + +Additionally, interface libraries are available as shared libraries for dynamic linking, such as those with a `.so` file extension, or as static linking, such as those with a `.a` file extension. + +### Which performance library should I choose? + +A natural source of confusion stems from the plethora of similar performance libraries. 
For example, OpenBLAS and NVIDIA Performance Libraries (NVPL) each offer their own implementation of basic linear algebra subprograms (BLAS). This raises the question: which one should a developer choose? + +Multiple performance libraries exist to meet the diverse needs of different hardware architectures and applications. For instance, Arm performance libraries are optimized for Arm CPUs, leveraging unique instruction sets and power efficiency. Meanwhile, NVIDIA performance libraries for Grace CPUs are tailored to maximize the performance of NVIDIA's hardware. + +Here are some of the different types of performance libraries available: + +- Hardware-specialized - some libraries are designed to be cross-platform, supporting multiple hardware architectures to provide flexibility and broader usability. For example, the OpenBLAS library supports both Arm and x86 architectures, allowing developers to use the same library across different systems. + +- Domain-specific - libraries are often created to handle specific domains or types of computations more efficiently. For instance, libraries like cuDNN are optimized for deep learning tasks, providing specialized functions that significantly speed up neural network training and inference. + +- Commercial - some highly-performant libraries require a license to use. This is more common in domain-specific libraries such as computational chemistry or fluid dynamics. These factors contribute to the existence of multiple performance libraries, each tailored to meet the specific demands of various hardware and applications. Invariably, there will be performance differences between each library and the best way to observe them is to use the library within your own application. -For more information on performance benchmarking you can read [Arm Performance Libraries 24.10](https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/arm-performance-libraries-24-10). +For more information on performance benchmarking, see [Arm Performance Libraries 24.10](https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/arm-performance-libraries-24-10). ### What performance libraries are available on Arm? -For a directory of community-produced libraries we recommend looking at the the Software Ecosystem Dashboard for Arm. Each library may not be available as a binary and may need to be compiled from source. The table below gives examples of libraries that are available on Arm. +For a directory of community-produced libraries, see the [Software Ecosystem Dashboard for Arm](https://www.arm.com/developer-hub/ecosystem-dashboard). + +Each library might not be available as a binary and you might need to compile it from source. The table below gives examples of libraries that are available on Arm. | Package / Library | Domain | | -------- | ------- | | Minimap2 | Long-read sequence alignment in genomics | | HMMER |Bioinformatics library for homologous sequences | -| FFTW | Open-source fast fourier transform library | +| FFTW | Open-source Fast Fourier Transform Library | -See the [Software Ecosystem Dashboard for Arm](https://www.arm.com/developer-hub/ecosystem-dashboard) for the most comprehensive and up-to-date list. \ No newline at end of file +See the [Software Ecosystem Dashboard for Arm](https://www.arm.com/developer-hub/ecosystem-dashboard) for the most comprehensive and up-to-date list. 
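+
+When you are comparing candidate libraries inside your own application, a quick first check is to confirm which math libraries the binary actually resolves at run time. The sketch below uses only standard Linux tools; `./my_app` is a placeholder for your own executable, and the grep pattern is just a starting point:
+
+```bash
+# List the shared objects the dynamic linker resolves for the binary, then
+# filter for common math and BLAS providers (exact names vary by distro and vendor).
+ldd ./my_app | grep -iE 'blas|lapack|armpl|amath|fftw' || echo "No matching math libraries found"
+```
+
+If two libraries expose the same interface, switching between them is then largely a matter of changing the link line and re-running your own benchmark.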
diff --git a/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/2.md b/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/2.md index 52cb2e9579..d80d0c459d 100644 --- a/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/2.md +++ b/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/2.md @@ -6,18 +6,20 @@ weight: 3 layout: learningpathall --- +## Get started + You can install Arm Performance Libraries on an Arm-based AWS instance, such as `t4g.2xlarge`, running Ubuntu 22.04 LTS. -For instructions to create and connect to an AWS instance, please refer to [Get started with Servers and Cloud Computing](/learning-paths/servers-and-cloud-computing/intro/). +For instructions to create and connect to an AWS instance, see [Get started with Servers and Cloud Computing](/learning-paths/servers-and-cloud-computing/intro/). -Once connected via `ssh`, install the required packages with the following commands. +Once connected via `ssh`, install the required packages with the following commands: ```bash sudo apt update sudo apt install gcc g++ make -y ``` -Next, install Arm Performance Libraries with the commands below. For more information, refer to the [Arm Performance Libraries install guide](/install-guides/armpl/). +Next, install Arm Performance Libraries with the commands below. For more information, see the [Arm Performance Libraries install guide](/install-guides/armpl/). ```bash wget https://developer.arm.com/-/cdn-downloads/permalink/Arm-Performance-Libraries/Version_24.10/arm-performance-libraries_24.10_deb_gcc.tar @@ -41,13 +43,13 @@ You should see the `armpl/24.10.0_gcc` available. armpl/24.10.0_gcc ``` -Load the module with the following command. +Load the module with the following command: ```bash module load armpl/24.10.0_gcc ``` -Navigate to the `lp64` C source code examples and compile. +Navigate to the `lp64` C source code examples and compile: ```bash cd $ARMPL_DIR/examples_lp64 @@ -62,6 +64,6 @@ Your terminal output shows the examples being compiled and the output ends with: Test passed OK ``` -For more information on all the available function, refer to the [Arm Performance Libraries Reference Guide](https://developer.arm.com/documentation/101004/latest/). +For more information, see the [Arm Performance Libraries Reference Guide](https://developer.arm.com/documentation/101004/latest/). diff --git a/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/3.md b/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/3.md index 9b3bbbbe7f..747a377c3a 100644 --- a/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/3.md +++ b/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/3.md @@ -5,12 +5,13 @@ weight: 4 ### FIXED, DO NOT MODIFY layout: learningpathall --- +## Libamath -The `libamath` library from Arm is an optimized subset of the standard library math functions for Arm-based CPUs, providing both scalar and vector functions at different levels of precision. It includes vectorized versions (Neon and SVE) of common math functions found in the standard library, such as those in the `` header. +The `libamath` library from Arm is an optimized subset of the standard library math functions for Arm-based CPUs, providing both scalar and vector functions at different levels of precision. 
It includes vectorized versions (NEON and SVE) of common math functions found in the standard library, such as those in the `` header. The trivial snippet below uses the `` standard cmath header to calculate the base exponential of a scalar value. -Use a text editor to copy and paste the example code below into a file named `basic_math.cpp`. +Use a text editor to copy and paste the example code below into a file named `basic_math.cpp`: ```cpp #include @@ -26,7 +27,7 @@ int main() { } ``` -Compile the code using the `g++` command. You can use the `ldd` command to print the shared objects for dynamic linking. +Compile the code using the `g++` command. You can use the `ldd` command to print the shared objects for dynamic linking: ```bash @@ -34,7 +35,7 @@ g++ basic_math.cpp -o basic_math ldd basic_math ``` -Observe the superset `libm.so` is linked. The output is: +Notice that the superset `libm.so` is linked. The output is: ```output linux-vdso.so.1 (0x0000f55218587000) @@ -45,13 +46,13 @@ Observe the superset `libm.so` is linked. The output is: libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000f55218460000) ``` -## Update to use an optimized library +## Update to use Libamath -To use the optimized math library `libamath` requires minimal source code changes for the scalar example. Modify the include statements to point to the correct header file and additional compiler flags. +Using the optimized math library `libamath` requires only minimal source code changes for the scalar example. Simply modify the include statements to reference the correct header file and add the necessary compiler flags. -Libamath routines have maximum errors inferior to 4 ULPs, where ULP stands for Unit in the Last Place, which is the smallest difference between two consecutive floating-point numbers at a specific precision. These routines only support the default rounding mode (round-to-nearest, ties to even). Therefore, switching from `libm` to `libamath` results in a small accuracy loss on a range of routines, similar to other vectorized implementations of these functions. +Libamath routines have maximum errors below 4 ULPs, where ULP stands for Unit in the Last Place, which is the smallest difference between two consecutive floating-point numbers at any given precision. These routines support only the default rounding mode, which is Round-to-Nearest, Ties to Even. As a result, switching from `libm` to `libamath` might cause a slight loss in accuracy on a range of routines, similar to other vectorized implementations. -Use a text editor to copy and paste the following C++ code into a file named `optimized_math.cpp`. +Use a text editor to copy and paste the following C++ code into a file named `optimized_math.cpp`: ```cpp #include @@ -74,7 +75,7 @@ g++ optimized_math.cpp -o optimized_math -lamath -lm ldd optimized_math ``` -Now you observe the `libamath.so` shared object is linked: +Now you can see that the `libamath.so` shared object is linked: ```output linux-vdso.so.1 (0x0000eb1eb379b000) @@ -83,23 +84,29 @@ Now you observe the `libamath.so` shared object is linked: libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000eb1eb3050000) libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000eb1eb3520000) /lib/ld-linux-aarch64.so.1 (0x0000eb1eb3762000) - libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000eb1eb34f0000 + libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000eb1eb34f0000) ``` -### What about vector operations? +### What about Vector Operations? 
-The naming convention of the Arm Performance Library for scalar operations follows that of `libm`. Hence, you are able to simply update the header file and recompile. For vector operations, one option is to rely on the compiler autovectorisation, whereby the compiler generates the vector code. This is used in the Arm Compiler for Linux (ACfL). Alternatively, you can use vector routines, which uses name mangling. Mangling is a technique used in computer programming to modify the names of vector functions to ensure uniqueness and avoid conflicts. This is particularly important in compiled languages like C++ and in environments where multiple libraries or modules may be used together. +In the Arm Performance Library, scalar operations follow the same naming convention of `libm`. This means that you can simply update the header file and recompile your code with minimal changes. -In the context of Arm's AArch64 architecture, vector name mangling follows the specific convention below to differentiate between scalar and vector versions of functions. +For vector operations, you have two options: + +1. You can rely on the compiler autovectorization, where the compiler automatically generates the vectorized code. The Arm Compiler for Linux (ACfL) uses autovectorization. + +2. You can use vector routines, which uses name mangling. Name mangling is a technique used in computer programming to modify the names of vector functions to ensure uniqueness and avoid conflicts. This is particularly important in compiled languages like C++ and in environments where multiple libraries or modules might be used together. + +In the context of Arm's AArch64 architecture, vector name mangling follows the specific convention below to differentiate between scalar and vector versions of functions: ```output '_ZGV' '_' ``` Where the values are given below: -- **original_name** : name of scalar libm function -- **ISA** : 'n' for Neon, 's' for SVE -- **Mask** : 'M' for masked/predicated version, 'N' for unmasked. Only masked routines are defined for SVE, and only unmasked for Neon. -- **vlen** : integer number representing vector length expressed as number of lanes. For Neon ='2' in double-precision and ='4' in single-precision. For SVE, ='x'. -- **signature** : 'v' for 1 input floating point or integer argument, 'vv' for 2. More details in AArch64's vector function ABI. +- `original_name` - the name of scalar `libm` function. +- `isa` - 'n' for Neon, 's' for SVE. +- `mask` - 'M' for masked/predicated version, 'N' for unmasked. Only masked routines are defined for SVE, and only unmasked for NEON. +- `vlen` - the integer number representing vector length expressed as number of lanes. For NEON, ='2' in double-precision and ='4' in single-precision. For SVE, ='x'. +- `signature` - 'v' for 1 input floating point or integer argument, 'vv' for 2. For further information, see AArch64's vector function ABI. 
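+
+To see this convention in practice, one option is to list the symbols that `libamath` exports and filter for the `_ZGV` prefix. This is only a sketch: the path below assumes the `ARMPL_DIR` environment module layout used earlier, so adjust it to match your installation:
+
+```bash
+# Print the dynamic symbol table of libamath and keep the mangled vector entry points.
+# A Neon double-precision routine would follow the _ZGV<isa><mask><vlen><signature>_<name> pattern.
+nm -D ${ARMPL_DIR}/lib/libamath.so | grep ' _ZGV' | head -n 20
+```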
diff --git a/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/4.md b/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/4.md index eb77a5a94b..b171fd9984 100644 --- a/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/4.md +++ b/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/4.md @@ -6,19 +6,25 @@ weight: 5 layout: learningpathall --- -### Port an example application that uses Intel Vector Statistics Library to Arm +## Port an example application that uses Intel Vector Statistics Library to Arm -OpenRNG is an open-source Random Number Generator (RNG) library, initially released with Arm Performance Libraries 24.04, designed to make application porting to Arm easier. It serves as a drop-in replacement for Intel's Vector Statistics Library (VSL). OpenRNG supports various RNG types, including pseudorandom, quasirandom, and nondeterministic generators, and offers tools for efficient multithreading and converting random sequences into common probability distributions. A vector of random numbers is a sequence of numbers that appear random and are used in various applications, such as simulating unpredictable natural processes, modeling financial markets, and creating unpredictable AI behaviors in gaming. +### Background + +OpenRNG is an open-source Random Number Generator (RNG) library, initially released with Arm Performance Libraries 24.04. It is designed to simplify porting by serving as a drop-in replacement for Intel's Vector Statistics Library (VSL). + +OpenRNG supports various RNG types, including pseudorandom, quasirandom, and non-deterministic generators, and provides tools for efficient multithreading and converting random sequences into common probability distributions. A vector of random numbers is a sequence of numbers that appear random and are used in various applications, such as simulating unpredictable natural processes, modeling financial markets, and creating unpredictable AI behaviors in gaming. ### Run on an x86 Instance -To demonstrate porting you can start with an application running on an x86_64, AWS `t3.2xlarge` instance with 32GB of storage. Please refer to [Getting started with Servers and Cloud computing](/learning-paths/servers-and-cloud-computing/intro/) to create an x86 instance type. +To demonstrate porting, start with an application running on an x86_64, AWS `t3.2xlarge` instance with 32GB of storage. + +See [Getting started with Servers and Cloud computing](/learning-paths/servers-and-cloud-computing/intro/) to create an x86 instance type. Install the OneAPI toolkit using the [Intel oneAPI Toolkits Installation Guide for Linux](https://www.intel.com/content/www/us/en/docs/oneapi/installation-guide-linux/2023-0/apt.html#GUID-560A487B-1B5B-4406-BB93-22BC7B526BCD). The following source code uses a classic algorithm to calculate pi. -Use a text editor to copy and paste the source code below into a file named `pi_x86.c`. +Use a text editor to copy and paste the source code below into a file named `pi_x86.c`: ```c /* @@ -48,7 +54,7 @@ int main() { // Declare and initialise the stream. // // In this example, we've selected the PHILOX4X32X10 generator and seeded it - // with 42. We can then check that the method executed succesfully by checking + // with 42. We can then check that the method executed successfully by checking // the return value for VSL_ERROR_OK. Most methods return VSL_ERROR_OK on // success. 
// @@ -79,11 +85,11 @@ int main() { // // Use the random numbers. // - // This is a classic algorithm used for estimating the value of pi. We imagine - // a unit square overlapping a quarter of a circle with unit radius. We then - // treat pairs of successive random numbers as points on the unit square. We + // This is a classic algorithm used for estimating the value of pi. You can imagine + // a unit square overlapping a quarter of a circle with unit radius. You then + // treat pairs of successive random numbers as points on the unit square. You // can check if the point is inside the quarter circle by measuring the - // distance between the point and the centre of the circle; if the distance is + // distance between the point and the center of the circle; if the distance is // less than 1, the point is inside the circle. The proportion of points // inside the circle should be // @@ -125,7 +131,7 @@ int main() { Compile the source code by running the following commands. {{% notice Note %}} -You may need to adjust the oneapi version from 2025.0 to the version installed on your system. +You might need to adjust the oneapi version from 2025.0 to the version installed on your system. {{% /notice %}} ```bash @@ -146,9 +152,9 @@ ldd ./pi_x86 ### Port the application to Arm -OpenRNG in most cases is a drop-in replacement for the Vector Statistics Library. Please refer to the reference guide for full information on which functions are supported. To enable this source code to run on Arm you need to modify the header file used. +OpenRNG in most cases is a drop-in replacement for the Vector Statistics Library. See the reference guide for information on supported functions are supported. To enable this source code to run on Arm you need to modify the header file used. -Copy the file `pi_x86.c` to `pi.c` so you can make the modification. +Copy the file `pi_x86.c` to `pi.c` so you can make the modification: ```c // from diff --git a/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/_index.md b/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/_index.md index 3b0bd336b9..1fa011ed1b 100644 --- a/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/_index.md @@ -1,15 +1,16 @@ --- -title: Start migrating applications that use performance libraries +title: Migrate applications that leverage performance libraries minutes_to_complete: 60 -who_is_this_for: C/C++ developers who are looking to migrate applications that rely on optimized performance libraries. +who_is_this_for: This Learning Path is for both C and C++ developers who want to migrate applications that rely on optimized performance libraries from x86 to Arm Architecture. learning_objectives: - - Learn how to incorporate optimized libraries. - - Learn how to port a basic application from x86 to AArch64. + - Describe the differences between standard and performance libraries. + - Incorporate optimized libraries. + - Port a basic application from x86 to AArch64. prerequisites: - - Access to an Arm and an x86-based cloud instance. + - Access to both an Arm and an x86-based cloud instance. - Intermediate understanding of C++, compilers, and Linux. 
 author: Kieran Hejmadi
diff --git a/content/learning-paths/servers-and-cloud-computing/whisper/_demo.md b/content/learning-paths/servers-and-cloud-computing/whisper/_demo.md
index 8744151484..9685aa864b 100644
--- a/content/learning-paths/servers-and-cloud-computing/whisper/_demo.md
+++ b/content/learning-paths/servers-and-cloud-computing/whisper/_demo.md
@@ -1,14 +1,20 @@
 ---
-title: Demo - Audio transcription on Arm
+title: Demo - Whisper Voice Audio transcription on Arm
 
 overview: |
-  Insert helpful overview here.
+  This Learning Path shows you how to use a c8g.8xlarge AWS Graviton4 instance, powered by an Arm Neoverse CPU, to build a simple Transcription-as-a-Service server.
+  This architecture is suitable for businesses looking to deploy the latest Generative AI technologies with audio transcription capabilities using their existing CPU compute capacity and deployment pipelines. This demo provides speech recognition using the `whisper-large-v3-turbo` model, deployed using the Hugging Face Transformers framework.
+
+  Record audio from your browser to interact with the Whisper model and send it to be transcribed, so you can see the performance for yourself. Note that no recorded audio is ever saved on our servers.
+
+  After running the demo, you can follow the Learning Path to build your own Generative AI service on Arm Neoverse.
 
 demo_steps:
-  - Record your voice (giving mic permissions to your browser).
-  - Review and send to _________________insert technical thing here_____________.
-  - Get transcription and view stats.
+  - Record your voice (grant microphone permissions to your browser).
+  - Review and send the audio file to the server for transcription.
+  - Receive the transcribed output and view stats.
+
 
 title_chatbot_area: Whisper Voice Demo
@@ -21,7 +27,8 @@ terms_and_conditions: demo-terms-and-conditions.txt
 ### Specific details to this demo
 # ================================================================================
 
-stats_description: The 'total time' for a whisper voice-to-text process refers to the complete duration taken from the moment the audio input is received until the final text output is generated. This includes several related times such as the 'pre-processing time', which is the time taken to prepare the audio data for transcription, the 'transcription time', which is the actual time spent converting the audio to text, and the 'post-processing time', which involves refining and formatting the transcribed text. Each of these stages contributes to the overall 'total time' and can vary depending on factors such as audio quality, length of the audio, and the efficiency of the transcription algorithm.
+stats_description: |
+  The 'total time' for a whisper voice-to-text process refers to the complete duration taken from the moment the audio input is received until the final text output is generated. This includes several related times such as the 'pre-processing time', which is the time taken to prepare the audio data for transcription, the 'transcription time', which is the actual time spent converting the audio to text, and the 'post-processing time', which involves refining and formatting the transcribed text. Each of these stages contributes to the overall 'total time' and can vary depending on factors such as audio quality, length of the audio, and the efficiency of the transcription algorithm.
### FIXED, DO NOT MODIFY # ================================================================================ diff --git a/content/learning-paths/servers-and-cloud-computing/whisper/_index.md b/content/learning-paths/servers-and-cloud-computing/whisper/_index.md index e2bab7ac3c..b912714b1f 100644 --- a/content/learning-paths/servers-and-cloud-computing/whisper/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/whisper/_index.md @@ -1,25 +1,22 @@ --- -title: Run OpenAI Whisper Audio Model efficiently on Arm with Hugging Face Transformers - -draft: true -cascade: - draft: true +title: Accelerate Whisper on Arm with Hugging Face Transformers minutes_to_complete: 15 -who_is_this_for: This Learning Path is for software developers looking to run the Whisper automatic speech recognition (ASR) model efficiently. You will use an Arm-based cloud instance to run and build speech transcription based applications. +who_is_this_for: This Learning Path is for software developers familiar with basic machine learning concepts and looking to run the OpenAI Whisper Automatic Speech Recognition (ASR) model efficiently, using an Arm-based cloud instance. learning_objectives: - - Install the dependencies to run the Whisper Model - - Run the OpenAI Whisper model using Hugging Face Transformers. + - Install the dependencies for the Whisper ASR Model. + - Run the Whisper model using Hugging Face Transformers. - Enable performance-enhancing features for running the model on Arm CPUs. - - Compare the total time taken to generate transcript with Whisper. + - Evaluate transcript generation times using Whisper. prerequisites: - - An [Arm-based compute instance](/learning-paths/servers-and-cloud-computing/intro/) with 32 cores, 8GB of RAM, and 32GB disk space running Ubuntu. - - Basic understanding of Python and ML concepts. - - Understanding of Whisper ASR Model fundamentals. + - An [Arm-based compute instance](/learning-paths/servers-and-cloud-computing/intro/) running Ubuntu with 32 cores, 8GB of RAM, and 32GB of disk space. + - Basic knowledge of Python. + - Familiarity with machine learning concepts. + - Familiarity with the fundamentals of the Whisper ASR Model. author: Nobel Chowdary Mandepudi diff --git a/content/learning-paths/servers-and-cloud-computing/whisper/demo-terms-and-conditions.txt b/content/learning-paths/servers-and-cloud-computing/whisper/demo-terms-and-conditions.txt index 4e9a905f19..6984572abd 100644 --- a/content/learning-paths/servers-and-cloud-computing/whisper/demo-terms-and-conditions.txt +++ b/content/learning-paths/servers-and-cloud-computing/whisper/demo-terms-and-conditions.txt @@ -1,4 +1,4 @@ -TERMS OF SERVICE FOR USE OF CHATBOT DEMO +TERMS OF SERVICE FOR USE OF WHISPER DEMO PLEASE READ THESE TERMS OF SERVICE CAREFULLY. BY USING THE SERVICE, YOU HEREBY ACKNOWLEDGE AND AGREE THAT YOU HAVE READ, FULLY UNDERSTAND, AND AGREE TO BE BOUND BY THESE TERMS OF SERVICE. IF YOU ARE ENTERING INTO THESE TERMS OF SERVICE ON BEHALF OF A COMPANY OR OTHER ENTITY, YOU REPRESENT THAT YOU HAVE THE AUTHORITY TO BIND SUCH COMPANY OR OTHER ENTITY TO THESE TERMS OF SERVICE. 
diff --git a/content/learning-paths/servers-and-cloud-computing/whisper/whisper.md b/content/learning-paths/servers-and-cloud-computing/whisper/whisper.md index 4cd24f86be..552ab5440d 100644 --- a/content/learning-paths/servers-and-cloud-computing/whisper/whisper.md +++ b/content/learning-paths/servers-and-cloud-computing/whisper/whisper.md @@ -1,8 +1,8 @@ --- # User change -title: "Setup the Whisper Model" +title: "Set up the Whisper Model" -weight: 2 +weight: 3 # Do not modify these elements layout: "learningpathall" @@ -10,26 +10,38 @@ layout: "learningpathall" ## Before you begin -This Learning Path demonstrates how to run the [whisper-large-v3-turbo model](https://huggingface.co/openai/whisper-large-v3-turbo) as an application that takes an audio input and computes the text transcript of it. The instructions in this Learning Path have been designed for Arm servers running Ubuntu 24.04 LTS. You need an Arm server instance with 32 cores, atleast 8GB of RAM and 32GB disk to run this example. The instructions have been tested on a AWS Graviton4 `c8g.8xlarge` instance. +This Learning Path demonstrates how to run the [whisper-large-v3-turbo model](https://huggingface.co/openai/whisper-large-v3-turbo) as an application that accepts an audio input and computes its text transcript. -## Overview +The instructions in this Learning Path have been designed for Arm servers running Ubuntu 24.04 LTS. You will need an Arm server instance with 32 cores, at least 8GB of RAM, and 32GB of disk space. -OpenAI Whisper is an open-source Automatic Speech Recognition (ASR) model trained on the multilingual and multitask data, which enables the transcript generation in multiple languages and translations from different languages to English. You will learn about the foundational aspects of speech-to-text transcription applications, specifically focusing on running OpenAI’s Whisper on an Arm CPU. Lastly, you will explore the implementation and performance considerations required to efficiently deploy Whisper using Hugging Face Transformers framework. +These steps have been tested on an AWS Graviton4 `c8g.8xlarge` instance. + +## Overview and Focus of Learning Path + +OpenAI Whisper is an open-source Automatic Speech Recognition (ASR) model trained on multilingual, multitask data. It can generate transcripts in multiple languages and translate various languages into English. + +In this Learning Path, you will learn about the foundational aspects of speech-to-text transcription applications, with a focus on running OpenAI’s Whisper on an Arm CPU. You will explore the implementation and performance considerations required to efficiently deploy Whisper using the Hugging Face Transformers framework. ### Speech-to-text ML applications -Speech-to-text (STT) transcription applications transform spoken language into written text, enabling voice-driven interfaces, accessibility tools, and real-time communication services. Audio is first cleaned and converted into a format suitable for processing, then passed through a deep learning model trained to recognize speech patterns. Advanced language models help refine the output, improving accuracy by predicting likely word sequences based on context. Whether running on cloud servers, STT applications must balance accuracy, latency, and computational efficiency to meet the needs of diverse use cases. 
+Speech-to-text (STT) transcription applications transform spoken language into written text, enabling voice-driven interfaces, accessibility tools, and real-time communication services. + +Audio is first cleaned and converted into a format suitable for processing, then passed through a deep learning model trained to recognize speech patterns. Advanced language models help refine the output, improving accuracy by predicting likely word sequences based on context. When deployed on cloud servers, STT applications must balance accuracy, latency, and computational efficiency to meet diverse use cases. + +## Learning Path Setup -## Install dependencies +To get set up, follow these steps, copying the code snippets at each stage. -Install the following packages on your Arm based server instance: +### Install dependencies + +Install the following packages on your Arm-based server instance: ```bash sudo apt update sudo apt install python3-pip python3-venv ffmpeg wget -y ``` -## Install Python Dependencies +### Install Python Dependencies Create a Python virtual environment: @@ -49,18 +61,22 @@ Install the required libraries using pip: pip install torch transformers accelerate ``` -## Download the sample audio file +### Download the Sample Audio File + +Download this sample audio file, which is about 33 seconds in .wav format. -Download a sample audio file, which is about 33 second audio in .wav format. You can use any .wav sound file if you'd like to try some other examples. +You can use any .wav file to try different examples: ```bash wget https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0010_8k.wav ``` -## Create a python script for audio to text transcription +### Create a Python Script for Audio-To-Text Transcription -You will use the Hugging Face `transformers` framework to help process the audio. It contains classes that configures the model, and prepares it for inference. `pipeline` is an end-to-end function for NLP tasks. In the code below, it's configured to do pre- and post-processing of the sample in this example, as well as running the actual inference. +Use the Hugging Face `Transformers` framework to process the audio. It provides classes to configure the model and prepare it for inference. -Using a file editor of your choice, create a python file named `whisper-application.py` with the content shown below: +The `pipeline` function is an end-to-end solution for NLP tasks. In the code below, it is configured to do pre- and post-processing of the sample in this example, as well as running inference. + +Using a file editor of your choice, create a Python file named `whisper-application.py` with the following content: ```python { file_name="whisper-application.py" } import torch @@ -110,10 +126,7 @@ print(f'\n{result["text"]}\n') # Calculate and print the duration of the inference duration = end_time - start_time -hours = duration // 3600 -minutes = (duration - (hours * 3600)) // 60 -seconds = (duration - ((hours * 3600) + (minutes * 60))) -msg = f'\nInferencing elapsed time: {seconds:4.2f} seconds\n' +msg = f'\nInferencing elapsed time: {duration:4.2f} seconds\n' print(msg) ``` @@ -125,9 +138,11 @@ export DNNL_VERBOSE=1 python3 whisper-application.py ``` -You should see output similar to the image below with a log output, transcript of the audio and the `Inference elapsed time`. +You should see output similar to the image below, which includes the log output, the audio transcript, and the `Inferencing elapsed time`. 
![frontend](whisper_output_no_flags.png) -You've now run the Whisper model successfully on your Arm-based CPU. Continue to the next section to configure flags that can increase the performance your running model. +You have now run the Whisper model successfully on your Arm-based CPU. + +Continue to the next section to configure flags that can boost your model's performance. diff --git a/content/learning-paths/servers-and-cloud-computing/whisper/whisper_deploy.md b/content/learning-paths/servers-and-cloud-computing/whisper/whisper_deploy.md index 2a677d55a1..61d32d6d15 100644 --- a/content/learning-paths/servers-and-cloud-computing/whisper/whisper_deploy.md +++ b/content/learning-paths/servers-and-cloud-computing/whisper/whisper_deploy.md @@ -5,13 +5,15 @@ weight: 4 layout: learningpathall --- -## Setting environment variables that impact performance +## Optimize Environment Variables to Boost Performance -Speech-to-text applications often process large amounts of audio data in real time, requiring efficient computation to balance accuracy and speed. Low-level implementations of the kernels in the neural network enhance performance by reducing processing overhead. When tailored for specific hardware architectures, such as Arm CPUs, these kernels accelerate key tasks like feature extraction and neural network inference. Optimized kernels ensure that speech models like OpenAI’s Whisper can run efficiently, making high-quality transcription more accessible across various server applications. +Speech-to-text applications often process large amounts of audio data in real time, requiring efficient computation to balance accuracy and speed. Low-level implementations of neural network kernels can enhance performance by reducing processing overhead. -Other considerations below allow us to use the memory more efficiently. Things like allocating additional memory and threads for a certain task can increase performance. By enabling these hardware-aware options, applications achieve lower latency, reduced power consumption, and smoother real-time transcription. +When tailored for specific hardware architectures, such as Arm CPUs, these kernels accelerate key tasks such as feature extraction and neural network inference. Optimized kernels ensure that speech models like OpenAI’s Whisper run efficiently, making high-quality transcription more accessible across various server applications. -Use the following flags to enable fast math BFloat16(BF16) GEMM kernels, Linux Transparent Huge Page (THP) allocations, logs to confirm kernel and set LRU cache capacity and OMP_NUM_THREADS to run the Whisper efficiently on Arm machines. +Other factors contribute to more efficient memory usage. For example, allocating additional memory and threads for specific tasks can boost performance. By leveraging these hardware-aware optimizations, applications can achieve lower latency, reduced power consumption, and smoother real-time transcription. + +Use the following flags to optimize performance on Arm machines: ```bash export DNNL_DEFAULT_FPMATH_MODE=BF16 @@ -19,13 +21,22 @@ export THP_MEM_ALLOC_ENABLE=1 export LRU_CACHE_CAPACITY=1024 export OMP_NUM_THREADS=32 ``` +These variables do the following: + +* `export DNNL_DEFAULT_FPMATH_MODE=BF16` - sets the default floating-point math mode for the oneDNN library to BF16 (bfloat16). This can improve performance and efficiency on hardware that supports BF16 precision.
+ +* `export THP_MEM_ALLOC_ENABLE=1` - enables an optimized memory allocation strategy (often leveraging transparent huge pages), which can enhance memory management and reduce fragmentation in frameworks like PyTorch. + +* `export LRU_CACHE_CAPACITY=1024` - configures the capacity of a Least Recently Used (LRU) cache to 1024 entries. This helps store and quickly retrieve recently used data, reducing redundant computations. + +* `export OMP_NUM_THREADS=32` - sets the number of threads for OpenMP-based parallel processing to 32, allowing your application to take full advantage of multi-core systems for faster performance. {{% notice Note %}} BF16 support is merged into PyTorch versions greater than 2.3.0. {{% /notice %}} ## Run Whisper File -After setting the environment variables in the previous step, now lets run the Whisper model again and analyze the performance impact. +After setting the environment variables in the previous step, run the Whisper model again and analyze the performance impact. Run the `whisper-application.py` file: @@ -35,10 +46,10 @@ python3 whisper-application.py ## Analyze output -You should now observe that the processing time has gone down compared to the last run: +You should now see that the processing time has gone down compared to the last run: ![frontend](whisper_output.png) -The output in the above image has the log containing `attr-fpmath:bf16`, which confirms that fast math BF16 kernels are used in the compute process to improve the performance. +The output in the image above includes a log entry containing `attr-fpmath:bf16`, which confirms that the compute process uses fast math BF16 kernels to improve performance. -By enabling the environment variables as described in the learning path you can see the performance uplift with the Whisper using Hugging Face Transformers framework on Arm. +You have now seen how configuring these environment variables delivers a performance uplift for OpenAI's Whisper model when using the Hugging Face Transformers framework on Arm-based systems.
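As an illustration of how these environment variables and the `pipeline` function fit together, the sketch below applies the same settings from Python with `os.environ` before importing `torch` and `transformers`, then times a transcription of the sample file. This is a minimal sketch rather than the Learning Path's `whisper-application.py`: it assumes that oneDNN and OpenMP pick these variables up when the libraries initialize, and the pipeline arguments (such as `return_timestamps=True` for the 33-second sample and `device="cpu"`) are illustrative choices, not the exact configuration used in the full script.

```python
import os
import time

# Assumption: setting these before importing torch/transformers has the same
# effect as the shell `export` commands, because oneDNN and OpenMP read them
# when the libraries initialize.
os.environ["DNNL_DEFAULT_FPMATH_MODE"] = "BF16"  # fast math BF16 GEMM kernels
os.environ["THP_MEM_ALLOC_ENABLE"] = "1"         # transparent huge page allocations
os.environ["LRU_CACHE_CAPACITY"] = "1024"        # LRU cache capacity
os.environ["OMP_NUM_THREADS"] = "32"             # one thread per core on a 32-core instance
os.environ["DNNL_VERBOSE"] = "1"                 # log kernels to confirm attr-fpmath:bf16

import torch
from transformers import pipeline

# Build an automatic-speech-recognition pipeline with the model used in this
# Learning Path; the arguments here are a simplified example.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    torch_dtype=torch.float32,
    device="cpu",
)

start_time = time.time()
result = asr("OSR_us_000_0010_8k.wav", return_timestamps=True)
end_time = time.time()

print(f'\n{result["text"]}\n')
print(f'\nInferencing elapsed time: {end_time - start_time:4.2f} seconds\n')
```

If the variables take effect, the `DNNL_VERBOSE` output should again include `attr-fpmath:bf16` entries alongside the transcript and the elapsed time.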
diff --git a/data/stats_current_test_info.yml b/data/stats_current_test_info.yml index 36da1ad1de..785074ef50 100644 --- a/data/stats_current_test_info.yml +++ b/data/stats_current_test_info.yml @@ -1,5 +1,5 @@ summary: - content_total: 334 + content_total: 335 content_with_all_tests_passing: 31 content_with_tests_enabled: 57 sw_categories: diff --git a/data/stats_weekly_data.yml b/data/stats_weekly_data.yml index 3badc1ac5b..5f67ca68db 100644 --- a/data/stats_weekly_data.yml +++ b/data/stats_weekly_data.yml @@ -5098,3 +5098,100 @@ avg_close_time_hrs: 0 num_issues: 18 percent_closed_vs_total: 0.0 +- a_date: '2025-03-10' + content: + automotive: 1 + cross-platform: 29 + embedded-and-microcontrollers: 41 + install-guides: 94 + iot: 5 + laptops-and-desktops: 35 + mobile-graphics-and-gaming: 27 + servers-and-cloud-computing: 103 + total: 335 + contributions: + external: 88 + internal: 436 + github_engagement: + num_forks: 30 + num_prs: 16 + individual_authors: + adnan-alsinan: 1 + alaaeddine-chakroun: 2 + albin-bernhardsson: 1 + alex-su: 1 + alexandros-lamprineas: 1 + annie-tallund: 2 + arm: 3 + arnaud-de-grandmaison: 3 + avin-zarlez: 1 + basma-el-gaabouri: 1 + ben-clark: 1 + bolt-liu: 2 + brenda-strech: 1 + chaodong-gong: 1 + chen-zhang: 1 + christopher-seidl: 7 + cyril-rohr: 1 + daniel-gubay: 1 + daniel-nguyen: 2 + david-spickett: 2 + dawid-borycki: 32 + diego-russo: 2 + dominica-abena-o.-amanfo: 1 + elham-harirpoush: 2 + florent-lebeau: 5 + "fr\xE9d\xE9ric--lefred--descamps": 2 + gabriel-peterson: 5 + gayathri-narayana-yegna-narayanan: 1 + georgios-mermigkis: 1 + graham-woodward: 1 + han-yin: 1 + iago-calvo-lista: 1 + james-whitaker: 1 + jason-andrews: 97 + joe-stech: 4 + johanna-skinnider: 2 + jonathan-davies: 2 + jose-emilio-munoz-lopez: 1 + julie-gaskin: 4 + julio-suarez: 5 + kasper-mecklenburg: 1 + kieran-hejmadi: 3 + koki-mitsunami: 2 + konstantinos-margaritis: 9 + kristof-beyls: 1 + leandro-nunes: 1 + liliya-wu: 1 + masoud-koleini: 1 + mathias-brossard: 1 + michael-hall: 5 + na-li: 1 + nader-zouaoui: 2 + nikhil-gupta: 1 + nobel-chowdary-mandepudi: 3 + odin-shen: 3 + owen-wu: 2 + pareena-verma: 41 + paul-howard: 2 + pranay-bakre: 5 + preema-merlin-dsouza: 1 + przemyslaw-wirkus: 2 + ravi-malhotra: 1 + rin-dobrescu: 1 + roberto-lopez-mendez: 2 + ronan-synnott: 46 + thirdai: 1 + tianyu-li: 1 + tom-pilar: 1 + uma-ramalingam: 1 + varun-chari: 2 + visualsilicon: 1 + willen-yang: 1 + ying-yu: 2 + zach-lasiuk: 2 + zhengjun-xing: 2 + issues: + avg_close_time_hrs: 0 + num_issues: 20 + percent_closed_vs_total: 0.0 diff --git a/themes/arm-design-system-hugo-theme/layouts/install-guides/installtoolsall.html b/themes/arm-design-system-hugo-theme/layouts/install-guides/installtoolsall.html index 86e841fab9..c79c57bf16 100644 --- a/themes/arm-design-system-hugo-theme/layouts/install-guides/installtoolsall.html +++ b/themes/arm-design-system-hugo-theme/layouts/install-guides/installtoolsall.html @@ -130,7 +130,11 @@

{{.Title}}

{{if not .Params.multi_install}}
- {{partial "general-formatting/feedback.html" (dict "context" . "learningpath_or_toolinstall" "tool quick-install guide") }} +
+
+ {{partial "general-formatting/feedback.html" (dict "context" . "learningpath_or_toolinstall" "tool quick-install guide") }} +
+
{{end}} diff --git a/themes/arm-design-system-hugo-theme/layouts/partials/demo-components/llm-voice-transcriber/demo-stats--llm-voice-transcriber.html b/themes/arm-design-system-hugo-theme/layouts/partials/demo-components/llm-voice-transcriber/demo-stats--llm-voice-transcriber.html index e928248e7e..6d90a8a674 100644 --- a/themes/arm-design-system-hugo-theme/layouts/partials/demo-components/llm-voice-transcriber/demo-stats--llm-voice-transcriber.html +++ b/themes/arm-design-system-hugo-theme/layouts/partials/demo-components/llm-voice-transcriber/demo-stats--llm-voice-transcriber.html @@ -131,7 +131,7 @@

Stats

-

seconds to complete transcription.

+

seconds to complete transcription of audio recorded at kHz.


diff --git a/themes/arm-design-system-hugo-theme/layouts/partials/demo-components/llm-voice-transcriber/javascript--llm-voice-transcriber.html index d3889e46a0..1111e25c76 100644 --- a/themes/arm-design-system-hugo-theme/layouts/partials/demo-components/llm-voice-transcriber/javascript--llm-voice-transcriber.html +++ b/themes/arm-design-system-hugo-theme/layouts/partials/demo-components/llm-voice-transcriber/javascript--llm-voice-transcriber.html @@ -20,7 +20,7 @@ var connectionStatus = { isActive: false }; - + var sample_rate = 0; @@ -30,6 +30,8 @@ document.getElementById('stats-initial').hidden = true; // Update time taken document.getElementById('total-time-metric').textContent = time_taken; + + document.getElementById('sample-rate-metric').textContent = sample_rate; } @@ -126,6 +128,14 @@ mediaRecorder.start(); isRecording = true; + // Create an AudioContext to get the sample rate + const audioContext = new AudioContext(); + sample_rate = audioContext.sampleRate; // update global var + sample_rate = (sample_rate / 1000).toFixed(1).replace(/\.0$/, ''); + + console.log('Recording audio at: ', sample_rate, 'kHz'); + + // Cap recording at set time audio_cap_timeout = setTimeout(() => { diff --git a/themes/arm-design-system-hugo-theme/layouts/partials/general-formatting/feedback.html index 37670f3d29..b496253c04 100644 --- a/themes/arm-design-system-hugo-theme/layouts/partials/general-formatting/feedback.html +++ b/themes/arm-design-system-hugo-theme/layouts/partials/general-formatting/feedback.html @@ -22,89 +22,84 @@ {{ $issuesURL := printf "%s/issues/new?title=Feedback - %s" $gh_repo (safeURL .context.Parent.Title ) -}}

-
-
+
-

Feedback

+

Give Feedback

- -
-

How would you rate the overall quality of this {{.learningpath_or_toolinstall}}?

-
- - - - - + +
+

How would you rate this {{.learningpath_or_toolinstall}}?

+
+ + + + + +
+
+ + + + + + -
\ No newline at end of file +
diff --git a/themes/arm-design-system-hugo-theme/layouts/partials/learning-paths/next-steps.html b/themes/arm-design-system-hugo-theme/layouts/partials/learning-paths/next-steps.html index 748a3c6944..c9f7a541bc 100644 --- a/themes/arm-design-system-hugo-theme/layouts/partials/learning-paths/next-steps.html +++ b/themes/arm-design-system-hugo-theme/layouts/partials/learning-paths/next-steps.html @@ -47,13 +47,29 @@ font-size: 16px; } + + #privacy-information { + margin-left: 16px; + margin-right: 24px; + } + .devprog-privacy-text { + color: var(--arm-light-grey); + } + + #nonchina-information { + display: block; + } + #china-information { + display: none; + } + -
+
+ +
+
+ Submit + Close +
+
+ Join now +
+ + +
diff --git a/themes/arm-design-system-hugo-theme/layouts/partials/navigation/nav-buttons.html b/themes/arm-design-system-hugo-theme/layouts/partials/navigation/nav-buttons.html index fd7fb1a7da..1585822433 100644 --- a/themes/arm-design-system-hugo-theme/layouts/partials/navigation/nav-buttons.html +++ b/themes/arm-design-system-hugo-theme/layouts/partials/navigation/nav-buttons.html @@ -10,7 +10,7 @@ -
+
{{ $counter := 0 }} {{ range .learningpathfiles }} {{ if eq .Weight $.context.Weight }} diff --git a/themes/arm-design-system-hugo-theme/static/img/arm_dev_program_half.jpg b/themes/arm-design-system-hugo-theme/static/img/arm_dev_program_half.jpg new file mode 100644 index 0000000000..ef0a7ef148 Binary files /dev/null and b/themes/arm-design-system-hugo-theme/static/img/arm_dev_program_half.jpg differ diff --git a/themes/arm-design-system-hugo-theme/static/img/dev_prog.png b/themes/arm-design-system-hugo-theme/static/img/dev_prog.png new file mode 100644 index 0000000000..5ddc9f3d84 Binary files /dev/null and b/themes/arm-design-system-hugo-theme/static/img/dev_prog.png differ