diff --git a/.github/workflows/weekly_stats_update.yml b/.github/workflows/weekly_stats_update.yml index 59ed5ad721..1cc1b21e54 100644 --- a/.github/workflows/weekly_stats_update.yml +++ b/.github/workflows/weekly_stats_update.yml @@ -17,6 +17,7 @@ on: jobs: # This workflow contains a single job update_stats: + if: github.repository == 'ArmDeveloperEcosystem/arm-learning-paths' runs-on: ubuntu-latest steps: - name: Check out current repo diff --git a/.wordlist.txt b/.wordlist.txt index 796f136e22..5afe320b18 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -4474,4 +4474,117 @@ AssetLib PerformanceStudio VkThread precompiled -rollouts \ No newline at end of file +rollouts +Bhusari +DLLAMA +FlameGraph +FlameGraphs +JSP +KBC +MMIO +Paravirtualized +PreserveFramePointer +Servlet +TDISP +VirtIO +WebSocket +agentpath +alarmtimer +aoss +apb +ata +bpf +brendangregg +chipidea +clk +cma +counterintuitive +cpuhp +cros +csd +devfreq +devlink +dma +dpaa +dwc +ecurity +edma +evice +filelock +filemap +flamegraphs +fsl +glink +gpu +hcd +hns +hw +hwmon +icmp +initcall +iomap +iommu +ipi +irq +jbd +jvmti +kmem +ksm +kvm +kyber +libata +libperf +lockd +mdio +memcg +mmc +mtu +musb +napi +ncryption +netfs +netlink +nfs +ntegrity +nterface +oom +optee +pagemap +paravirtualized +percpu +printk +pwm +qcom +qdisc +ras +rcu +regmap +rgerganov’s +rotocol +rpcgss +rpmh +rseq +rtc +sched +scmi +scsi +skb +smbus +smp +spi +spmi +sunrpc +swiotlb +tegra +thp +tlb +udp +ufs +untrusted +uring +virtio +vmalloc +vmscan +workqueue +xdp +xhci diff --git a/assets/contributors.csv b/assets/contributors.csv index 6df4525ab1..1317f49b31 100644 --- a/assets/contributors.csv +++ b/assets/contributors.csv @@ -1,96 +1,97 @@ -author,company,github,linkedin,twitter,website -Jason Andrews,Arm,jasonrandrews,jason-andrews-7b05a8,, -Pareena Verma,Arm,pareenaverma,pareena-verma-7853607,, -Ronan Synnott,Arm,,ronansynnott,, -Florent Lebeau,Arm,,,, -Brenda Strech,Remote.It,bstrech,bstrech,@remote_it,www.remote.it 
-Liliya Wu,Arm,Liliyaw,liliya-wu-8b6227216,, -Julio Suarez,Arm,jsrz,juliosuarez,, -Gabriel Peterson,Arm,gabrieldpeterson,gabrieldpeterson,@gabedpeterson,https://corteximplant.com/@gabe -Christopher Seidl,Arm,,,, -Michael Hall,Arm,,,, -Kasper Mecklenburg,Arm,,,, -Mathias Brossard,Arm,,,, -Julie Gaskin,Arm,,,, -Pranay Bakre,Arm,,,, -Elham Harirpoush,Arm,,,, -Frédéric -lefred- Descamps,OCI,,,,lefred.be -Fr�d�ric -lefred- Descamps,OCI,,,,lefred.be -Kristof Beyls,Arm,,,, -David Spickett,Arm,,,, -Uma Ramalingam,Arm,uma-ramalingam,,, -Konstantinos Margaritis,VectorCamp,markos,konstantinosmargaritis,@freevec1,https://vectorcamp.gr/ -Diego Russo,Arm,diegorusso,diegor,diegor,https://www.diegor.it -Jonathan Davies,Arm,,,, -Zhengjun Xing,Arm,,,, -Leandro Nunes,Arm,,,, -Dawid Borycki,,dawidborycki,,, -Ying Yu,Arm,,,, -Bolt Liu,Arm,,,, -Roberto Lopez Mendez,Arm,,,, -Arnaud de Grandmaison,Arm,Arnaud-de-Grandmaison-ARM,arnauddegrandmaison,, -Jose-Emilio Munoz-Lopez,Arm,,,, -James Whitaker,Arm,,,, -Johanna Skinnider,Arm,,,, -Varun Chari,Arm,,,, -Adnan AlSinan,Arm,,,, -Graham Woodward,Arm,,,, -Basma El Gaabouri,Arm,,,, -Gayathri Narayana Yegna Narayanan,Arm,,,, -Alexandros Lamprineas,Arm,,,, -Annie Tallund,Arm,annietllnd,annietallund,, -Cyril Rohr,RunsOn,crohr,cyrilrohr,, -Rin Dobrescu,Arm,,,, -Przemyslaw Wirkus,Arm,PrzemekWirkus,przemyslaw-wirkus-78b73352,, -Nader Zouaoui,Day Devs,nader-zouaoui,nader-zouaoui,@zouaoui_nader,https://daydevs.com/ -Alaaeddine Chakroun,Day Devs,Alaaeddine-Chakroun,alaaeddine-chakroun,,https://daydevs.com/ -Koki Mitsunami,Arm,,kmitsunami,, -Chen Zhang,Zilliz,,,, -Tianyu Li,Arm,,,, -Georgios Mermigkis,VectorCamp,gMerm,georgios-mermigkis,,https://vectorcamp.gr/ -Ben Clark,Arm,,,, -Han Yin,Arm,hanyin-arm,nacosiren,, -Willen Yang,Arm,,,, -Daniel Gubay,,,,, -Paul Howard,,,,, -Iago Calvo Lista,Arm,,,, -Stephen Theobald,Arm,,,, -ThirdAI,,,,, -Preema Merlin Dsouza,,,,, -Dominica Abena O. 
Amanfo,,,,, -Arm,,,,, -Albin Bernhardsson,,,,, -Przemyslaw Wirkus,,,,, -Zach Lasiuk,,,,, -Daniel Nguyen,,,,, -Joe Stech,Arm,JoeStech,joestech,, -visualSilicon,,,,, -Konstantinos Margaritis,VectorCamp,,,, -Kieran Hejmadi,,,,, -Alex Su,,,,, -Chaodong Gong,,,,, -Owen Wu,Arm,,,, -Koki Mitsunami,,,,, -Nikhil Gupta,,,,, -Nobel Chowdary Mandepudi,Arm,,,, -Ravi Malhotra,Arm,,,, -Masoud Koleini,,,,, -Na Li,Arm,,,, -Tom Pilar,,,,, -Cyril Rohr,,,,, -Odin Shen,Arm,odincodeshen,odin-shen-lmshen,, -Avin Zarlez,Arm,AvinZarlez,avinzarlez,,https://www.avinzarlez.com/ -Shuheng Deng,Arm,,,, -Yiyang Fan,Arm,,,, -Julien Jayat,Arm,JulienJayat-Arm,julien-jayat-a980a397,, -Geremy Cohen,Arm,geremyCohen,geremyinanutshell,, -Barbara Corriero,Arm,,,, -Nina Drozd,Arm,NinaARM,ninadrozd,, -Jun He,Arm,JunHe77,jun-he-91969822,, -Gian Marco Iodice,Arm,,,, -Aude Vuilliomenet,Arm,,,, -Andrew Kilroy,Arm,,,, -Peter Harris,Arm,,,, -Chenying Kuo,Adlink,evshary,evshary,, -William Liang,,wyliang,,, -Waheed Brown,Arm,https://github.com/armwaheed,https://www.linkedin.com/in/waheedbrown/,, +author,company,github,linkedin,twitter,website +Jason Andrews,Arm,jasonrandrews,jason-andrews-7b05a8,, +Pareena Verma,Arm,pareenaverma,pareena-verma-7853607,, +Ronan Synnott,Arm,,ronansynnott,, +Florent Lebeau,Arm,,,, +Brenda Strech,Remote.It,bstrech,bstrech,@remote_it,www.remote.it +Liliya Wu,Arm,Liliyaw,liliya-wu-8b6227216,, +Julio Suarez,Arm,jsrz,juliosuarez,, +Gabriel Peterson,Arm,gabrieldpeterson,gabrieldpeterson,@gabedpeterson,https://corteximplant.com/@gabe +Christopher Seidl,Arm,,,, +Michael Hall,Arm,,,, +Kasper Mecklenburg,Arm,,,, +Mathias Brossard,Arm,,,, +Julie Gaskin,Arm,,,, +Pranay Bakre,Arm,,,, +Elham Harirpoush,Arm,,,, +Frédéric -lefred- Descamps,OCI,,,,lefred.be +Fr�d�ric -lefred- Descamps,OCI,,,,lefred.be +Kristof Beyls,Arm,,,, +David Spickett,Arm,,,, +Uma Ramalingam,Arm,uma-ramalingam,,, +Konstantinos Margaritis,VectorCamp,markos,konstantinosmargaritis,@freevec1,https://vectorcamp.gr/ +Diego 
Russo,Arm,diegorusso,diegor,diegor,https://www.diegor.it +Jonathan Davies,Arm,,,, +Zhengjun Xing,Arm,,,, +Leandro Nunes,Arm,,,, +Dawid Borycki,,dawidborycki,,, +Ying Yu,Arm,,,, +Bolt Liu,Arm,,,, +Roberto Lopez Mendez,Arm,,,, +Arnaud de Grandmaison,Arm,Arnaud-de-Grandmaison-ARM,arnauddegrandmaison,, +Jose-Emilio Munoz-Lopez,Arm,,,, +James Whitaker,Arm,,,, +Johanna Skinnider,Arm,,,, +Varun Chari,Arm,,,, +Adnan AlSinan,Arm,,,, +Graham Woodward,Arm,,,, +Basma El Gaabouri,Arm,,,, +Gayathri Narayana Yegna Narayanan,Arm,,,, +Alexandros Lamprineas,Arm,,,, +Annie Tallund,Arm,annietllnd,annietallund,, +Cyril Rohr,RunsOn,crohr,cyrilrohr,, +Rin Dobrescu,Arm,,,, +Przemyslaw Wirkus,Arm,PrzemekWirkus,przemyslaw-wirkus-78b73352,, +Nader Zouaoui,Day Devs,nader-zouaoui,nader-zouaoui,@zouaoui_nader,https://daydevs.com/ +Alaaeddine Chakroun,Day Devs,Alaaeddine-Chakroun,alaaeddine-chakroun,,https://daydevs.com/ +Koki Mitsunami,Arm,,kmitsunami,, +Chen Zhang,Zilliz,,,, +Tianyu Li,Arm,,,, +Georgios Mermigkis,VectorCamp,gMerm,georgios-mermigkis,,https://vectorcamp.gr/ +Ben Clark,Arm,,,, +Han Yin,Arm,hanyin-arm,nacosiren,, +Willen Yang,Arm,,,, +Daniel Gubay,,,,, +Paul Howard,,,,, +Iago Calvo Lista,Arm,,,, +Stephen Theobald,Arm,,,, +ThirdAI,,,,, +Preema Merlin Dsouza,,,,, +Dominica Abena O. 
Amanfo,,,,, +Arm,,,,, +Albin Bernhardsson,,,,, +Przemyslaw Wirkus,,,,, +Zach Lasiuk,,,,, +Daniel Nguyen,,,,, +Joe Stech,Arm,JoeStech,joestech,, +visualSilicon,,,,, +Konstantinos Margaritis,VectorCamp,,,, +Kieran Hejmadi,,,,, +Alex Su,,,,, +Chaodong Gong,,,,, +Owen Wu,Arm,,,, +Koki Mitsunami,,,,, +Nikhil Gupta,,,,, +Nobel Chowdary Mandepudi,Arm,,,, +Ravi Malhotra,Arm,,,, +Masoud Koleini,,,,, +Na Li,Arm,,,, +Tom Pilar,,,,, +Cyril Rohr,,,,, +Odin Shen,Arm,odincodeshen,odin-shen-lmshen,, +Avin Zarlez,Arm,AvinZarlez,avinzarlez,,https://www.avinzarlez.com/ +Shuheng Deng,Arm,,,, +Yiyang Fan,Arm,,,, +Julien Jayat,Arm,JulienJayat-Arm,julien-jayat-a980a397,, +Geremy Cohen,Arm,geremyCohen,geremyinanutshell,, +Barbara Corriero,Arm,,,, +Nina Drozd,Arm,NinaARM,ninadrozd,, +Jun He,Arm,JunHe77,jun-he-91969822,, +Gian Marco Iodice,Arm,,,, +Aude Vuilliomenet,Arm,,,, +Andrew Kilroy,Arm,,,, +Peter Harris,Arm,,,, +Chenying Kuo,Adlink,evshary,evshary,, +William Liang,,wyliang,,, +Waheed Brown,Arm,https://github.com/armwaheed,https://www.linkedin.com/in/waheedbrown/,, +Aryan Bhusari,Arm,,https://www.linkedin.com/in/aryanbhusari,, \ No newline at end of file diff --git a/content/install-guides/azure-cli.md b/content/install-guides/azure-cli.md index d1625082c8..ce40f2b4b1 100644 --- a/content/install-guides/azure-cli.md +++ b/content/install-guides/azure-cli.md @@ -22,11 +22,7 @@ It is available for a variety of operating systems and Linux distributions and h ## How do I prepare to install the Azure CLI on Ubuntu? -[General installation information](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli-linux?pivots=apt) is available which covers all supported Linux distributions. Starting with version 2.46.0, Azure CLI supports Arm64 Linux distributions. 
The 'apt' package manager contains both x86_64 and Arm64 packages for the following linux distributions - -```output -Ubuntu 20.04, Ubuntu 22.04, Ubuntu 24.04 -``` +[General installation information](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli-linux?pivots=apt) is available which covers all supported Linux distributions. Starting with version 2.46.0, Azure CLI supports Arm64 Linux distributions. The 'apt' package manager contains both x86_64 and Arm64 packages for Ubuntu 20.04, Ubuntu 22.04, and Ubuntu 24.04. ## How do I install the Azure CLI using the installation script? @@ -60,8 +56,8 @@ You should see an output similar to: ```output { - "azure-cli": "2.61.0", - "azure-cli-core": "2.61.0", + "azure-cli": "2.75.0", + "azure-cli-core": "2.75.0", "azure-cli-telemetry": "1.1.0", "extensions": {} } @@ -98,8 +94,8 @@ You should see an output similar to: ```output { - "azure-cli": "2.61.0", - "azure-cli-core": "2.61.0", + "azure-cli": "2.75.0", + "azure-cli-core": "2.75.0", "azure-cli-telemetry": "1.1.0", "extensions": {} } diff --git a/content/learning-paths/automotive/openadkit2_safetyisolation/_index.md b/content/learning-paths/automotive/openadkit2_safetyisolation/_index.md index 55245d22ff..75f3bf2cbe 100644 --- a/content/learning-paths/automotive/openadkit2_safetyisolation/_index.md +++ b/content/learning-paths/automotive/openadkit2_safetyisolation/_index.md @@ -8,7 +8,7 @@ who_is_this_for: This Learning Path is for automotive engineers developing safet learning_objectives: - Apply functional safety principles, including risk prevention, fault detection, and ASIL compliance, to build robust, certifiable automotive systems - Use DDS and a publish-subscribe architecture for low-latency, scalable, and fault-tolerant communication in autonomous driving systems - - Implement distributed development by separating the simulation platform into independent, safety-isolated components + - Implement distributed development by separating the simulation 
platform into independent, safety-isolated components prerequisites: - Access to two Arm-based Neoverse cloud instances, or a local Arm Neoverse Linux system with at least 16 CPUs and 32 GB of RAM diff --git a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/2-executorch-setup.md b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/2-executorch-setup.md index a9ac71fcaa..590d49ade7 100755 --- a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/2-executorch-setup.md +++ b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/2-executorch-setup.md @@ -15,8 +15,8 @@ The best practice is to generate an isolated Python environment in which to inst ### Option 1: Create a Python virtual environment ```bash -python3.10 -m venv executorch -source executorch/bin/activate +python3.10 -m venv executorch-venv +source executorch-venv/bin/activate ``` The prompt of your terminal has `executorch` as a prefix to indicate the virtual environment is active. 
@@ -28,8 +28,8 @@ Install Miniconda on your development machine by following the [Installing conda Once `conda` is installed, create the environment: ```bash -conda create -yn executorch python=3.10.0 -conda activate executorch +conda create -yn executorch-venv python=3.10.0 +conda activate executorch-venv ``` ### Clone ExecuTorch and install the required dependencies @@ -40,7 +40,7 @@ From within the conda environment, run the commands below to download the ExecuT git clone https://github.com/pytorch/executorch.git cd executorch git submodule sync -git submodule update --init +git submodule update --init --recursive ./install_executorch.sh ./examples/models/llama/install_requirements.sh ``` diff --git a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/4-prepare-llama-models.md b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/4-prepare-llama-models.md index 116bb3f364..a6c05c4e64 100755 --- a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/4-prepare-llama-models.md +++ b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/4-prepare-llama-models.md @@ -46,7 +46,8 @@ python3 -m examples.models.llama.export_llama \ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001, 128006, 128007]}' \ --embedding-quantize 4,32 \ --output_name="llama3_1B_kv_sdpa_xnn_qe_4_64_1024_embedding_4bit.pte" \ ---max_seq_length 1024 +--max_seq_length 1024 \ +--max_context_length 1024 ``` Due to the larger vocabulary size of Llama 3, you should quantize the embeddings with `--embedding-quantize 4,32` to further reduce the model size. 
diff --git a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/5-run-benchmark-on-android.md b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/5-run-benchmark-on-android.md index fe1bd9981e..4f07675eac 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/5-run-benchmark-on-android.md +++ b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/5-run-benchmark-on-android.md @@ -38,18 +38,23 @@ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_LLM=ON \ + -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_XNNPACK_ENABLE_KLEIDI=ON \ -DXNNPACK_ENABLE_ARM_BF16=OFF \ + -DBUILD_TESTING=OFF \ -Bcmake-out-android . cmake --build cmake-out-android -j7 --target install --config Release ``` {{% notice Note %}} -Make sure you add -DEXECUTORCH_XNNPACK_ENABLE_KLEIDI=ON option to enable support for KleidiAI kernels in ExecuTorch with XNNPack. +Starting with Executorch version 0.7 beta, KleidiAI is enabled by default. The -DEXECUTORCH_XNNPACK_ENABLE_KLEIDI=ON option is enabled and adds default support for KleidiAI kernels in ExecuTorch with XNNPack. {{% /notice %}} ### 3. 
Build Llama runner for Android @@ -67,7 +72,8 @@ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_USE_TIKTOKEN=ON \ + -DSUPPORT_REGEX_LOOKAHEAD=ON \ + -DBUILD_TESTING=OFF \ -Bcmake-out-android/examples/models/llama \ examples/models/llama @@ -144,13 +150,13 @@ Reached to the end of generation I 00:00:05.399314 executorch:runner.cpp:257] RSS after finishing text generation: 1269.445312 MiB (0 if unsupported) PyTorchObserver {"prompt_tokens":54,"generated_tokens":51,"model_load_start_ms":1710296339487,"model_load_end_ms":1710296343047,"inference_start_ms":1710296343370,"inference_end_ms":1710296344877,"prompt_eval_end_ms":1710296343556,"first_token_ms":1710296343556,"aggregate_sampling_time_ms":49,"SCALING_FACTOR_UNITS_PER_SECOND":1000} -I 00:00:05.399342 executorch:stats.h:111] Prompt Tokens: 54 Generated Tokens: 51 -I 00:00:05.399344 executorch:stats.h:117] Model Load Time: 3.560000 (seconds) -I 00:00:05.399346 executorch:stats.h:127] Total inference time: 1.507000 (seconds) Rate: 33.842070 (tokens/second) -I 00:00:05.399348 executorch:stats.h:135] Prompt evaluation: 0.186000 (seconds) Rate: 290.322581 (tokens/second) -I 00:00:05.399350 executorch:stats.h:146] Generated 51 tokens: 1.321000 (seconds) Rate: 38.607116 (tokens/second) -I 00:00:05.399352 executorch:stats.h:154] Time to first generated token: 0.186000 (seconds) -I 00:00:05.399354 executorch:stats.h:161] Sampling time over 105 tokens: 0.049000 (seconds) +I 00:00:04.530945 executorch:stats.h:108] Prompt Tokens: 54 Generated Tokens: 69 +I 00:00:04.530947 executorch:stats.h:114] Model Load Time: 1.196000 (seconds) +I 00:00:04.530949 executorch:stats.h:124] Total inference time: 1.934000 (seconds) Rate: 35.677353 (tokens/second) +I 00:00:04.530952 executorch:stats.h:132] Prompt evaluation: 0.176000 (seconds) Rate: 306.818182 (tokens/second) +I 
00:00:04.530954 executorch:stats.h:143] Generated 69 tokens: 1.758000 (seconds) Rate: 39.249147 (tokens/second) +I 00:00:04.530956 executorch:stats.h:151] Time to first generated token: 0.176000 (seconds) +I 00:00:04.530959 executorch:stats.h:158] Sampling time over 123 tokens: 0.067000 (seconds) ``` You have successfully run the Llama 3.1 1B Instruct model on your Android smartphone with ExecuTorch using KleidiAI kernels. diff --git a/content/learning-paths/mobile-graphics-and-gaming/godot_packages/_index.md b/content/learning-paths/mobile-graphics-and-gaming/godot_packages/_index.md index 9b272c76ab..b9fbecd3f9 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/godot_packages/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/godot_packages/_index.md @@ -1,17 +1,13 @@ --- -title: Use the Arm Performance Studio Integration extension in Godot - -draft: true -cascade: - draft: true - +title: Profile Android game performance in Godot with Arm Performance Studio + minutes_to_complete: 15 -who_is_this_for: This is an introductory topic for Godot developers who are targeting Android devices and want to get more insight into how their game performs on devices with Arm CPUs and GPUs. +who_is_this_for: This is an introductory topic for Godot developers targeting Android devices who want to optimize game performance on Arm CPUs and Mali GPUs using Arm Performance Studio tools. 
learning_objectives: - Install the Arm Performance Studio Integration extension in Godot - - Annotate your Godot game with markers that give context to a profile in Arm Performance Studio tools + - Annotate your Godot game with performance markers for profiling in Streamline and Performance Advisor prerequisites: - Familiarity with Godot diff --git a/content/learning-paths/mobile-graphics-and-gaming/godot_packages/add-markers.md b/content/learning-paths/mobile-graphics-and-gaming/godot_packages/add-markers.md new file mode 100644 index 0000000000..176283b81b --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/godot_packages/add-markers.md @@ -0,0 +1,43 @@ +--- +title: Annotate Game Events for Profiling in Godot +weight: 5 +layout: learningpathall +--- + +## Use the Performance Studio extension in your project + +All annotation features are provided through the `PerformanceStudio` class. To begin, create an instance in your script: + +```gdscript +var performance_studio = PerformanceStudio.new() + + +## Add single markers to highlight key game events + +The simplest annotations are single markers. These appear in the Streamline timeline and help you correlate game behavior with performance data. + +To emit a basic marker, use the `marker()` method with a descriptive label: + +```gdscript +performance_studio.marker("Game Started") +``` + +This creates a timestamped marker labeled **Game Started**. When you capture a profile in Streamline, you’ll see this marker at the point the game starts. + +![Marker annotation in Streamline#center](sl_marker.png "Marker annotation in Streamline") + + +## Assign a custom color + +You can assign a color to the marker using the `marker_color()` method: + +```gdscript +performance_studio.marker_color("Game Started", Color8(0, 255, 0)) +``` + +This example displays the Game Started marker in green. Use different colors to visually distinguish important game events. 
+ + + + + diff --git a/content/learning-paths/mobile-graphics-and-gaming/godot_packages/arm_mobile_studio_integrations.md b/content/learning-paths/mobile-graphics-and-gaming/godot_packages/arm_mobile_studio_integrations.md deleted file mode 100644 index f345ac8828..0000000000 --- a/content/learning-paths/mobile-graphics-and-gaming/godot_packages/arm_mobile_studio_integrations.md +++ /dev/null @@ -1,155 +0,0 @@ ---- -title: Arm Performance Studio Godot integrations -weight: 3 - -### FIXED, DO NOT MODIFY -layout: learningpathall ---- - -## Arm Performance Studio Godot integrations - -[Arm Performance Studio](https://developer.arm.com/Tools%20and%20Software/Arm%20Performance%20Studio) is a free suite of analysis tools to help you profile game performance on mobile devices with Arm CPUs and GPUs. Arm provides a Godot extension to make data from [Godot games](https://godotengine.org/) visible in the Arm Performance Studio tools, [Streamline](https://developer.arm.com/Tools%20and%20Software/Streamline%20Performance%20Analyzer) and [Performance Advisor](https://developer.arm.com/Tools%20and%20Software/Performance%20Advisor). - -This package provides a simple way to incorporate annotations into your Godot project. These annotations enable you to mark the timeline with events or custom counters which provides valuable context alongside the performance data in Streamline, so you can see what was happening in the game when bottlenecks occur. For example, here you can see markers that highlight where a wave of enemies is spawning: - -![Marker annotations in Streamline](sl_annotation.png "Figure 1. Marker annotations in Streamline") - -{{% notice Note %}} -The Arm Performance Studio Integration extension is supported with Godot version 4.3 and later. -{{% /notice %}} - -### How to install the Arm Performance Studio Integration extension - -1. In Godot, click **AssetLib** to see the available extensions. - -2. 
Find the **Arm Performance Studio Integration** extension, then double-click to open the extension. - -3. The extension opens in a dialog box. Click **Download**. - -![Installing the Arm Performance Studio Integration extension in Godot](godot_install_performance_studio_extension.png "Figure 2. Installing the Arm Performance Studio Integration extension in Godot") - -4. A new dialog box opens where you can change the install folder if required. Click Install. - -### Using the extension - -All functionality in the extension is provided by the PerformanceStudio class, so first create an instance of it: - -```console -var performance_studio = PerformanceStudio.new() -``` - -### Adding single markers to a Godot project - -The simplest annotations are single markers, which can have a name and a color. To use them in a Godot project where you have installed this extension, simply call into the Performance Studio library as follows: - -```console -performance_studio.marker("Game Started") -``` - -This will emit a timestamped marker with the label "Game Started". When you capture a profile in Streamline, you can see this marker along the top of the timeline at the point that the game starts. - -![Marker annotation in Streamline](sl_marker.png "Figure 4. Marker annotation in Streamline") - -To give the annotation a color, use the `marker_color` method: - -```console -performance_studio.marker_color("Game Started", Color8(0, 255, 0)) -``` - -### Defining regions in a Godot project - -To define regions of interest within the game, you can specify a pair of markers prefixed with “Region Start” and “Region End”, for example: - -```console -performance_studio.marker("Region Start Times Square") -# Do work -performance_studio.marker("Region End Times Square") -``` - -These regions are shown on the frame rate analysis chart in the Performance Advisor report. - -![Regions in Performance Advisor](pa_frame_rate_regions.png "Figure 5. 
Regions in Performance Advisor") - -Also, dedicated charts for each region are appended to the end of the report, so you can analyze each region independently. - -![Dedicated region charts in Performance Advisor](pa_dedicated_region_charts.png "Figure 6. Dedicated region charts in Performance Advisor") - -### Using channels in a Godot project - -Channels are custom event timelines associated with a software thread. You can create channels and place annotations within them. A channel annotation has a text label and a color but, unlike markers, they span a range of time. - -To create a channel called "Spawner" and insert an annotation called "Spawning Wave", with the color red: - -```console -var channel : PerformanceStudio_Channel - -func _ready() -> void: - channel = performance_studio.create_channel("Spawner") - -# Annotations can then be inserted into a channel: -func _on_new_wave_started() -> void: - channel.annotate_color("Spawning Wave", Color8(255, 0, 0)) - -func _on_wave_completed() -> void: - channel.end() -``` - -To see channels in Streamline, select the **Core Map** view, and expand the **VkThread** thread: - -![Channel annotations in Streamline](sl_channel.png "Figure 7. Channel annotations in Streamline") - -### Creating counters - -Counters are numerical data points that can be plotted as a chart in the Streamline timeline view. Counters can be created as either absolute counters, where every value is an absolute value, or as a delta counter, where values are the number of instances of an event since the last value was emitted. All values are floats and will be presented to 2 decimal places. - -When charts are first defined, you can specify a title and series name. The title names the chart, the series names the data series. - -Multiple counter series can use the same title, which means that they will be plotted on the same chart in the Streamline timeline. 
- -To create a counter: - -```console -var counter = performance_studio.create_counter("Title", "Series", false) -``` - -Counter values are set easily as shown below: - -```console -counter.setValue(42.2) -``` - -### Custom Activity Maps - -[Custom Activity Map (CAM)](https://developer.arm.com/documentation/101816/latest/Annotate-your-code/User-space-annotations/Custom-Activity-Map-annotations) views allow execution information to be plotted on a hierarchical set of timelines. Like channel annotations, CAM views plot jobs on tracks, but unlike channel annotations, CAM views are not associated with a specific thread. Each CAM view contains one or more tracks and each track contains one or more jobs. - -![Custom activity maps in Streamline](sl_cam.png "Figure 8. Custom activity maps in Streamline") - -To create a custom activity map and add tracks to it: - -```console -var game_cam : PerformanceStudio_CAM -var wave_track : PerformanceStudio_CAMTrack -var ui_track : PerformanceStudio_CAMTrack - -func _ready() -> void: - # Create the CAM - game_cam = performance_studio.create_cam("Game Activity") - - # Add tracks to the CAM - wave_track = game_cam.create_track("Wave Activity") - ui_track = game_cam.create_track("UI Activity") -``` - -To create a job within a track: - -```console -var wave_job : PerformanceStudio_CAMJob - -func _on_new_wave_started() -> void: - wave_job = wave_track.create_job("Spawning Wave", Color8(255, 0, 0)) - -func _on_wave_completed() -> void: - wave_job.stop() -``` - -You can now annotate your Godot game and analyze the performance with markers that give context to a profile in Arm Performance Studio tools. 
diff --git a/content/learning-paths/mobile-graphics-and-gaming/godot_packages/create-counters.md b/content/learning-paths/mobile-graphics-and-gaming/godot_packages/create-counters.md new file mode 100644 index 0000000000..86929e4a6d --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/godot_packages/create-counters.md @@ -0,0 +1,43 @@ +--- +title: Create and track custom counters in Godot +weight: 8 +layout: learningpathall +--- +## What are counters? + +Counters are floating-point values plotted as line charts in Streamline. Each value appears with two decimal places of precision. + +There are two types of counters: + +- Absolute counters: every value is treated as an independent measurement + +- Delta counters: each value represents the change since the last measurement (for example, the number of enemy spawns since the last update) + +## Define your counter chart + +When charts are first defined, you can specify: + +- A title: this names the chart in Streamline + +- A series name: this labels the specific data stream within the chart + +You can group multiple counter series under the same title to plot them on the same chart. + +## Create and update a counter + +Use the `create_counter()` method to define a counter in your script. For example: + +```console +var counter = performance_studio.create_counter("Title", "Series", false) +``` + +The third parameter sets whether the counter is a delta counter `(true)` or absolute counter `(false)`. + +To update the counter value, use: + +```console +counter.setValue(42.2) +``` + +This value will appear in the timeline alongside other profiling data during a Streamline capture. 
+ diff --git a/content/learning-paths/mobile-graphics-and-gaming/godot_packages/define-regions.md b/content/learning-paths/mobile-graphics-and-gaming/godot_packages/define-regions.md new file mode 100644 index 0000000000..73e3846105 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/godot_packages/define-regions.md @@ -0,0 +1,28 @@ +--- +title: Define performance regions in Godot +weight: 6 +layout: learningpathall +--- + +## Defining regions in your Godot project + +To define regions of interest within the game, you can specify a pair of markers prefixed with **Region Start** and **Region End**, for example: + +```console +performance_studio.marker("Region Start Times Square") +# Do work +performance_studio.marker("Region End Times Square") +``` + +These regions are shown on the frame rate analysis chart in the Performance Advisor report. + +![Regions in Performance Advisor#center](pa_frame_rate_regions.png "Regions in Performance Advisor") + +Performance Advisor also includes dedicated charts for each region at the end of the report, allowing you to analyze them independently. + +![Dedicated region charts in Performance Advisor#center](pa_dedicated_region_charts.png "Dedicated region charts in Performance Advisor") + + + + + diff --git a/content/learning-paths/mobile-graphics-and-gaming/godot_packages/install-extension.md b/content/learning-paths/mobile-graphics-and-gaming/godot_packages/install-extension.md new file mode 100644 index 0000000000..18938803a3 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/godot_packages/install-extension.md @@ -0,0 +1,26 @@ +--- +title: Install the Arm Performance Studio extension in Godot +weight: 4 +layout: learningpathall +--- + +## Install the Arm Performance Studio extension in Godot + +To profile performance in your Godot game, first install the Arm Performance Studio extension using the Godot Asset Library. 
+ +Start by opening your project in Godot, then select **AssetLib** from the top menu to browse available extensions. + +Search for **Arm Performance Studio Integration**, then double-click the result to open its details. + +In the extension dialog that appears, select **Download** to begin the installation. + +![Extension download dialog in Godot#center](godot_install_performance_studio_extension.png "Download dialog for the Arm Performance Studio Integration extension in Godot") + +When prompted, you can change the install folder if needed. To complete the setup, select **Install**. + +The extension will now be added to your project and ready to use for adding markers and counters. + + + + + diff --git a/content/learning-paths/mobile-graphics-and-gaming/godot_packages/profile-godot-with-aps.md b/content/learning-paths/mobile-graphics-and-gaming/godot_packages/profile-godot-with-aps.md new file mode 100644 index 0000000000..ce97cd51a1 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/godot_packages/profile-godot-with-aps.md @@ -0,0 +1,35 @@ +--- +title: Profile your Godot game with Arm Performance Studio +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Overview + +In this Learning Path, you'll learn how to annotate and analyze performance in your Godot game using the Arm Performance Studio extension. You’ll add markers, counters, and timelines to capture game events and visualize them in Streamline and Performance Advisor. These tools help you identify CPU and GPU bottlenecks and optimize performance on Arm-based Android devices. + +{{% notice Note %}} + This extension is compatible with **Godot 4.3 and later**. +{{% /notice %}} + +## What is Arm Performance Studio? + +[Arm Performance Studio](https://developer.arm.com/Tools%20and%20Software/Arm%20Performance%20Studio) is a free suite of analysis tools to profile game performance on mobile devices with Arm CPUs and GPUs. 
It includes: + +- [Streamline](https://developer.arm.com/Tools%20and%20Software/Streamline%20Performance%20Analyzer): a performance analyzer that collects CPU and GPU metrics. + +- [Performance Advisor](https://developer.arm.com/Tools%20and%20Software/Performance%20Advisor): a report generator that offers optimization suggestions. + +Arm provides a Godot extension from [Godot games](https://godotengine.org/) that integrates with these tools, making it easier to capture performance data directly from your game. + +## Add annotations to your Godot project + +The Arm Performance Studio extension lets you add custom annotations to your Godot project. These annotations include timeline markers and counters that describe what's happening during gameplay,such as loading a level or spawning enemies. + +When you record a capture in Streamline, these annotations appear in the timeline alongside CPU and GPU metrics. This context makes it easier to correlate performance issues with in-game events. + +For example, here’s a capture showing a marker for when a wave of enemies spawns: + +![Marker annotations in Streamline#center](sl_annotation.png "Marker annotations in Streamline") diff --git a/content/learning-paths/mobile-graphics-and-gaming/godot_packages/use-channels.md b/content/learning-paths/mobile-graphics-and-gaming/godot_packages/use-channels.md new file mode 100644 index 0000000000..056d2e78c0 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/godot_packages/use-channels.md @@ -0,0 +1,37 @@ +--- +title: Use channels for threaded performance annotations +weight: 7 +layout: learningpathall +--- +## Use channels for threaded annotations in Godot + +Channels are custom event timelines associated with a specific software thread. Unlike single-point markers, channel annotations span a duration and include a label and optional color. You can use them to trace task execution or track long-running operations, such as asset loading or enemy spawning. 
+ +## Create and annotate a channel + +To define a new channel named **Spawner** and insert an annotation labeled **Spawning Wave**, use the following approach: + +```console +var channel : PerformanceStudio_Channel + +func _ready() -> void: + channel = performance_studio.create_channel("Spawner") + +# Annotations can then be inserted into a channel: +func _on_new_wave_started() -> void: + channel.annotate_color("Spawning Wave", Color8(255, 0, 0)) + +func _on_wave_completed() -> void: + channel.end() +``` +In this example: + +- The `annotate_color()` method begins a red-colored annotation labeled Spawning Wave + +- The end() method marks when the annotation finishes + +## View channels in Streamline + +To see channels in Streamline, select the **Core Map** view, and expand the **VkThread** thread: + +![Channel annotations in Streamline#center](sl_channel.png "Channel annotations in Streamline") \ No newline at end of file diff --git a/content/learning-paths/mobile-graphics-and-gaming/godot_packages/use-custom-activity-maps.md b/content/learning-paths/mobile-graphics-and-gaming/godot_packages/use-custom-activity-maps.md new file mode 100644 index 0000000000..166724ee88 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/godot_packages/use-custom-activity-maps.md @@ -0,0 +1,51 @@ +--- +title: Use custom activity maps in Godot profiling +weight: 9 +layout: learningpathall +--- + +## Use custom activity maps to organize game profiling + +[Custom Activity Map (CAM)](https://developer.arm.com/documentation/101816/latest/Annotate-your-code/User-space-annotations/Custom-Activity-Map-annotations) views allow execution information to be plotted on a hierarchical set of timelines. + +Like channel annotations, CAM views plot jobs on tracks, but unlike channel annotations, CAM views are not associated with a specific thread. Each CAM view contains one or more tracks and each track contains one or more jobs. 
+ +![Custom activity maps in Streamline#center](sl_cam.png "Custom activity maps in Streamline") + +## Create a CAM and add tracks + +To define a custom activity map and add tracks for different systems: + +```console +var game_cam : PerformanceStudio_CAM +var wave_track : PerformanceStudio_CAMTrack +var ui_track : PerformanceStudio_CAMTrack + +func _ready() -> void: + # Create the CAM + game_cam = performance_studio.create_cam("Game Activity") + + # Add tracks to the CAM + wave_track = game_cam.create_track("Wave Activity") + ui_track = game_cam.create_track("UI Activity") +``` + +## Add jobs to tracks + +You can use jobs to represent specific time-bound tasks on a track. Here’s how to create and stop a job: + +```console +var wave_job : PerformanceStudio_CAMJob + +func _on_new_wave_started() -> void: + wave_job = wave_track.create_job("Spawning Wave", Color8(255, 0, 0)) + +func _on_wave_completed() -> void: + wave_job.stop() +``` + +## Analyze custom activity maps in Streamline + +Once you capture a profiling session, you can view your CAM data in Streamline under the Custom Activity Map section. Each job appears on its assigned track, enabling you to inspect how long each task ran and when overlapping operations occurred. + +Use CAMs to structure your profiling data around gameplay concepts, not just threads, making your performance analysis more intuitive and actionable. 
\ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/_index.md b/content/learning-paths/servers-and-cloud-computing/_index.md index 2cf5628cc6..878d7bd782 100644 --- a/content/learning-paths/servers-and-cloud-computing/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/_index.md @@ -8,8 +8,8 @@ key_ip: maintopic: true operatingsystems_filter: - Android: 2 -- Linux: 154 -- macOS: 10 +- Linux: 157 +- macOS: 11 - Windows: 14 pinned_modules: - module: @@ -22,8 +22,8 @@ subjects_filter: - Containers and Virtualization: 29 - Databases: 15 - Libraries: 9 -- ML: 28 -- Performance and Architecture: 60 +- ML: 29 +- Performance and Architecture: 62 - Storage: 1 - Web: 10 subtitle: Optimize cloud native apps on Arm for performance and cost @@ -47,6 +47,8 @@ tools_software_languages_filter: - ASP.NET Core: 2 - Assembly: 4 - assembly: 1 +- Async-profiler: 1 +- AWS: 1 - AWS CDK: 2 - AWS CodeBuild: 1 - AWS EC2: 2 @@ -65,7 +67,7 @@ tools_software_languages_filter: - C++: 8 - C/C++: 2 - Capstone: 1 -- CCA: 6 +- CCA: 7 - Clair: 1 - Clang: 10 - ClickBench: 1 @@ -77,10 +79,11 @@ tools_software_languages_filter: - Daytona: 1 - Demo: 3 - Django: 1 -- Docker: 17 +- Docker: 18 - Envoy: 2 - ExecuTorch: 1 - FAISS: 1 +- FlameGraph: 1 - Flink: 1 - Fortran: 1 - FunASR: 1 @@ -88,7 +91,7 @@ tools_software_languages_filter: - GCC: 22 - gdb: 1 - Geekbench: 1 -- GenAI: 11 +- GenAI: 12 - GitHub: 6 - GitLab: 1 - Glibc: 1 @@ -114,7 +117,7 @@ tools_software_languages_filter: - Linaro Forge: 1 - Litmus7: 1 - Llama.cpp: 1 -- LLM: 9 +- LLM: 10 - llvm-mca: 1 - LSE: 1 - MariaDB: 1 @@ -132,6 +135,7 @@ tools_software_languages_filter: - Ollama: 1 - ONNX Runtime: 1 - OpenBLAS: 1 +- OpenJDK-21: 1 - OpenShift: 1 - OrchardCore: 1 - PAPI: 1 @@ -144,7 +148,7 @@ tools_software_languages_filter: - RAG: 1 - Redis: 3 - Remote.It: 2 -- RME: 6 +- RME: 7 - Runbook: 71 - Rust: 2 - snappy: 1 @@ -161,6 +165,7 @@ tools_software_languages_filter: - TensorFlow: 2 - 
Terraform: 11 - ThirdAI: 1 +- Tomcat: 1 - Trusted Firmware: 1 - TSan: 1 - TypeScript: 1 @@ -173,6 +178,7 @@ tools_software_languages_filter: - Whisper: 1 - WindowsPerf: 1 - WordPress: 3 +- wrk2: 1 - x265: 1 - zlib: 1 - Zookeeper: 1 diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/00_overview.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/00_overview.md index 2e3ddadafe..2fbf1b961e 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/00_overview.md +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/00_overview.md @@ -8,9 +8,9 @@ layout: learningpathall ## The AFM-4.5B model -AFM-4.5B is a 4.5-billion-parameter foundation model designed to balance accuracy, efficiency, and broad language coverage. Trained on nearly 7 trillion tokens of carefully filtered data, it performs well across a wide range of languages, including Arabic, English, French, German, Hindi, Italian, Korean, Mandarin, Portuguese, Russian, and Spanish. +[AFM-4.5B](https://huggingface.co/arcee-ai/AFM-4.5B) is a 4.5-billion-parameter foundation model designed to balance accuracy, efficiency, and broad language coverage. Trained on nearly 8 trillion tokens of carefully filtered data, it performs well across a wide range of languages, including Arabic, English, French, German, Hindi, Italian, Korean, Mandarin, Portuguese, Russian, and Spanish. -In this Learning Path, you'll deploy AFM-4.5B using [Llama.cpp](https://github.com/ggerganov/llama.cpp) on an Arm-based AWS Graviton4 instance. You’ll walk through the full workflow, from setting up your environment and compiling the runtime, to downloading, quantizing, and running inference on the model. You'll also evaluate model quality using perplexity, a common metric for measuring how well a language model predicts text. 
+In this Learning Path, you'll deploy [AFM-4.5B](https://huggingface.co/arcee-ai/AFM-4.5B) using [Llama.cpp](https://github.com/ggerganov/llama.cpp) on an Arm-based AWS Graviton4 instance. You’ll walk through the full workflow, from setting up your environment and compiling the runtime, to downloading, quantizing, and running inference on the model. You'll also evaluate model quality using perplexity, a common metric for measuring how well a language model predicts text. This hands-on guide helps developers build cost-efficient, high-performance LLM applications on modern Arm server infrastructure using open-source tools and real-world deployment practices. diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/02_setting_up_the_instance.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/02_setting_up_the_instance.md index 8b8c53c779..a81fd12c81 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/02_setting_up_the_instance.md +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/02_setting_up_the_instance.md @@ -6,7 +6,7 @@ weight: 4 layout: learningpathall --- -In this step, you'll set up the Graviton4 instance with the tools and dependencies required to build and run the Arcee Foundation Model. This includes installing system packages and a Python environment. +In this step, you'll set up the Graviton4 instance with the tools and dependencies required to build and run the AFM-4.5B model. This includes installing system packages and a Python environment. 
## Update the package list diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md index 95fa1e416f..1cdd269a23 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md @@ -7,7 +7,7 @@ layout: learningpathall --- ## Build the Llama.cpp inference engine -In this step, you'll build Llama.cpp from source. Llama.cpp is a high-performance C++ implementation of the LLaMA model, optimized for inference on a range of hardware platforms,including Arm-based processors like AWS Graviton4. +In this step, you'll build Llama.cpp from source. Llama.cpp is a high-performance C++ implementation of the LLaMA model, optimized for inference on a range of hardware platforms, including Arm-based processors like AWS Graviton4. Even though AFM-4.5B uses a custom model architecture, you can still use the standard Llama.cpp repository - Arcee AI has contributed the necessary modeling code upstream. 
diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/04_install_python_dependencies_for_llama_cpp.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/04_install_python_dependencies_for_llama_cpp.md index b680dcf7eb..db8d79dc36 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/04_install_python_dependencies_for_llama_cpp.md +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/04_install_python_dependencies_for_llama_cpp.md @@ -32,7 +32,7 @@ This command does the following: - Runs the activation script, which modifies your shell environment - Updates your shell prompt to show `env-llama-cpp`, indicating the environment is active -- Updates `PATH` to use so the environment’s Python interpreter +- Updates `PATH` to use the environment’s Python interpreter - Ensures all `pip` commands install packages into the isolated environment ## Upgrade pip to the latest version @@ -72,7 +72,8 @@ After the installation completes, your virtual environment includes: - **NumPy**: for numerical computations and array operations - **Requests**: for HTTP operations and API calls - **Other dependencies**: additional packages required by llama.cpp's Python bindings and utilities -Your environment is now ready to run Python scripts that integrate with the compiled Llama.cpp binaries + +Your environment is now ready to run Python scripts that integrate with the compiled Llama.cpp binaries. {{< notice Tip >}} Before running any Python commands, make sure your virtual environment is activated. 
{{< /notice >}} diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/05_downloading_and_optimizing_afm45b.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/05_downloading_and_optimizing_afm45b.md index 3feb85647e..5c68008870 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/05_downloading_and_optimizing_afm45b.md +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/05_downloading_and_optimizing_afm45b.md @@ -6,10 +6,20 @@ weight: 7 layout: learningpathall --- -In this step, you’ll download the AFM-4.5B model from Hugging Face, convert it to the GGUF format for compatibility with `llama.cpp`, and generate quantized versions to optimize memory usage and improve inference speed. +In this step, you’ll download the [AFM-4.5B](https://huggingface.co/arcee-ai/AFM-4.5B) model from Hugging Face, convert it to the GGUF format for compatibility with `llama.cpp`, and generate quantized versions to optimize memory usage and improve inference speed. + +{{% notice Note %}} +If you want to skip the model optimization process, [GGUF](https://huggingface.co/arcee-ai/AFM-4.5B-GGUF) versions are available. {{% /notice %}} Make sure to activate your virtual environment before running any commands. The instructions below walk you through downloading and preparing the model for efficient use on AWS Graviton4. +## Signing up to Hugging Face + +In order to download AFM-4.5B, you will need: +- a Hugging Face account: you can sign up at [https://huggingface.co](https://huggingface.co) +- a read-only Hugging Face token: once logged in, you can create one at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens). Don't forget to store it, as you will only be able to view it once. 
+- to accept the terms of AFM-4.5B at [https://huggingface.co/arcee-ai/AFM-4.5B](https://huggingface.co/arcee-ai/AFM-4.5B) + ## Install the Hugging Face libraries ```bash @@ -19,14 +29,31 @@ pip install huggingface_hub hf_xet This command installs: - `huggingface_hub`: Python client for downloading models and datasets -- `hf_xet`: Git extension for fetching large model files stored on Hugging Face +- `hf_xet`: Git extension for fetching large model files hosted on Hugging Face + +These tools include the `hf` command-line interface you'll use next. + +## Log in to the Hugging Face Hub + +```bash +hf auth login + + _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_| + _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| + _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_| + _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| + _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_| + + To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens . +Enter your token (input will not be visible): +``` -These tools include the `huggingface-cli` command-line interface you'll use next. +Please enter the token you created above, and answer 'n' to "Add token as git credential? (Y/n)". ## Download the AFM-4.5B model ```bash -huggingface-cli download arcee-ai/afm-4.5B --local-dir models/afm-4-5b +hf download arcee-ai/afm-4.5B --local-dir models/afm-4-5b ``` This command downloads the model to the `models/afm-4-5b` directory: @@ -60,7 +87,7 @@ This command creates a 4-bit quantized version of the model: - `llama-quantize` is the quantization tool from Llama.cpp. - `afm-4-5B-F16.gguf` is the input GGUF model file in 16-bit precision. - `Q4_0` applies zero-point 4-bit quantization. -- This reduces the model size by approximately 45% (from ~15GB to ~8GB). +- This reduces the model size by approximately ~70% (from ~15GB to ~4.4GB). 
- The quantized model will use less memory and run faster, though with a small reduction in accuracy. - The output file will be `afm-4-5B-Q4_0.gguf`. @@ -78,7 +105,7 @@ bin/llama-quantize models/afm-4-5b/afm-4-5B-F16.gguf models/afm-4-5b/afm-4-5B-Q8 This command creates an 8-bit quantized version of the model: - `Q8_0` specifies 8-bit quantization with zero-point compression. -- This reduces the model size by approximately 70% (from ~15GB to ~4.4GB). +- This reduces the model size by approximately ~45% (from ~15GB to ~8GB). - The 8-bit version provides a better balance between memory usage and accuracy than 4-bit quantization. - The output file is named `afm-4-5B-Q8_0.gguf`. - Commonly used in production scenarios where memory resources are available. @@ -89,7 +116,7 @@ Similar to Q4_0, Arm has contributed optimized kernels for Q8_0 quantization tha ## Model files ready for inference -After completing these steps, you'll have three versions of the AFM-4.5B model: +After completing these steps, you'll have three versions of the AFM-4.5B model in `models/afm-4-5b`: - `afm-4-5B-F16.gguf` - The original full-precision model (~15GB) - `afm-4-5B-Q4_0.gguf` - 4-bit quantized version (~4.4GB) for memory-constrained environments - `afm-4-5B-Q8_0.gguf` - 8-bit quantized version (~8GB) for balanced performance and memory usage diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/06_running_inference.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/06_running_inference.md index 0e84e97f56..a4fd44830e 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/06_running_inference.md +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/06_running_inference.md @@ -6,7 +6,7 @@ weight: 8 layout: learningpathall --- -Now that you have the AFM-4.5B models in GGUF format, you can run inference using various Llama.cpp tools. 
In this step, you'll explore how to generate text, benchmark performance, and interact with the model through both command-line and HTTP APIs. +Now that you have the [AFM-4.5B](https://huggingface.co/arcee-ai/AFM-4.5B) models in GGUF format, you can run inference using various Llama.cpp tools. In this step, you'll explore how to generate text, benchmark performance, and interact with the model through both command-line and HTTP APIs. ## Use llama-cli for interactive text generation @@ -55,14 +55,15 @@ To exit the session, type `Ctrl+C` or `/bye`. You'll then see performance metrics like this: ```bash -llama_perf_sampler_print: sampling time = 26.66 ms / 356 runs ( 0.07 ms per token, 13352.84 tokens per second) -llama_perf_context_print: load time = 782.72 ms -llama_perf_context_print: prompt eval time = 392.40 ms / 24 tokens ( 16.35 ms per token, 61.16 tokens per second) -llama_perf_context_print: eval time = 13173.66 ms / 331 runs ( 39.80 ms per token, 25.13 tokens per second) -llama_perf_context_print: total time = 129945.08 ms / 355 tokens +llama_perf_sampler_print: sampling time = 9.47 ms / 119 runs ( 0.08 ms per token, 12569.98 tokens per second) +llama_perf_context_print: load time = 616.69 ms +llama_perf_context_print: prompt eval time = 344.39 ms / 23 tokens ( 14.97 ms per token, 66.79 tokens per second) +llama_perf_context_print: eval time = 9289.81 ms / 352 runs ( 26.39 ms per token, 37.89 tokens per second) +llama_perf_context_print: total time = 17446.13 ms / 375 tokens +llama_perf_context_print: graphs reused = 0 ``` -In this example, the 8-bit model running on 16 threads generated 355 tokens, at ~25 tokens per second (`eval time`). +In this example, the 8-bit model running on 16 threads generated 375 tokens, at ~37 tokens per second (`eval time`). 
## Run a non-interactive prompt @@ -77,7 +78,7 @@ This command: - Sends a one-time prompt using `-p` - Prints the generated response and exits -The 4-bit model delivers faster generation—expect around 40 tokens per second on Graviton4. This shows how a more aggressive quantization recipe helps deliver faster performance. +The 4-bit model delivers faster generation—expect around 60 tokens per second on Graviton4. This shows how a more aggressive quantization recipe helps deliver faster performance. ## Use llama-server for API access @@ -130,29 +131,29 @@ The response includes the model’s reply and performance metrics: "index": 0, "message": { "role": "assistant", - "content": "Quantum computing uses quantum-mechanical phenomena, such as superposition and entanglement, to perform calculations. It allows for multiple possibilities to exist simultaneously, which can speed up certain processes. Unlike classical computers, quantum computers can solve complex problems and simulate systems more efficiently. Quantum bits (qubits) store information, and quantum gates perform operations. Quantum computing has potential applications in fields like cryptography, optimization, and materials science. Its development is an active area of research, with companies like IBM, Google, and Microsoft investing in quantum computing technology." + "content": "Quantum computing uses quantum-mechanical phenomena like superposition and entanglement to solve complex problems much faster than classical computers. Instead of binary bits (0 or 1), quantum bits (qubits) can exist in multiple states simultaneously, allowing for parallel processing of vast combinations of possibilities. This enables quantum computers to perform certain calculations exponentially faster, particularly in areas like cryptography, optimization, and drug discovery. However, quantum systems are fragile and prone to errors, requiring advanced error correction techniques. 
Current quantum computers are still in early stages but show promise for transformative applications." } } ], - "created": 1750929895, + "created": 1753876147, "model": "afm-4-5b", - "system_fingerprint": "b5757-716301d1", + "system_fingerprint": "b6030-1e15bfd4", "object": "chat.completion", "usage": { - "completion_tokens": 111, + "completion_tokens": 115, "prompt_tokens": 20, - "total_tokens": 131 + "total_tokens": 135 }, - "id": "chatcmpl-tb93ww9iYCErwLJmsV0YLrIadVvpBk4m", + "id": "chatcmpl-0Zwzu03zbu77MFx4ogBsqz8E4IdxHOLU", "timings": { - "prompt_n": 11, - "prompt_ms": 105.651, - "prompt_per_token_ms": 9.604636363636363, - "prompt_per_second": 104.11638318615064, - "predicted_n": 111, - "predicted_ms": 2725.982, - "predicted_per_token_ms": 24.558396396396397, - "predicted_per_second": 40.719271073690145 + "prompt_n": 20, + "prompt_ms": 68.37, + "prompt_per_token_ms": 3.4185000000000003, + "prompt_per_second": 292.525961679099, + "predicted_n": 115, + "predicted_ms": 1884.943, + "predicted_per_token_ms": 16.390808695652172, + "predicted_per_second": 61.00980241842857 } } ``` @@ -161,7 +162,7 @@ The response includes the model’s reply and performance metrics: You’ve now successfully: -- Run AFM-4.5B in interactive and non-interactive modes +- Run [AFM-4.5B](https://huggingface.co/arcee-ai/AFM-4.5B) in interactive and non-interactive modes - Tested performance with different quantized models - Served the model as an OpenAI-compatible API endpoint diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/07_evaluating_the_quantized_models.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/07_evaluating_the_quantized_models.md index f23b06cc3a..b4d3de7eab 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/07_evaluating_the_quantized_models.md +++ 
b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/07_evaluating_the_quantized_models.md @@ -26,9 +26,9 @@ bin/llama-bench -m models/afm-4-5b/afm-4-5B-Q4_0.gguf ``` Typical results on a 16 vCPU instance: -- **F16 model**: ~15-16 tokens/second, ~15GB memory usage -- **Q8_0 model**: ~25 tokens/second, ~8GB memory usage -- **Q4_0 model**: ~40 tokens/second, ~4.4GB memory usage +- **F16 model**: ~25 tokens/second, ~9GB memory usage +- **Q8_0 model**: ~40 tokens/second, ~5GB memory usage +- **Q4_0 model**: ~60 tokens/second, ~3GB memory usage Your actual results might vary depending on your specific instance configuration and system load. @@ -40,28 +40,31 @@ Use this command to benchmark performance across prompt sizes and thread counts: bin/llama-bench -m models/afm-4-5b/afm-4-5B-Q4_0.gguf \ -p 128,256,512 \ -n 128 \ - -t 8,16,24 + -t 4,8,16 ``` This command does the following: - Loads the 4-bit model and runs inference benchmarks - `-p`: evaluates prompt lengths of 128, 256, and 512 tokens - `-n`: generates 128 tokens -- `-t`: runs inference using 4, 8, and 24 threads +- `-t`: runs inference using 4, 8, and 16 threads -Here’s an example of how performance scales across threads and prompt sizes: +Here’s an example of how performance scales across threads and prompt sizes (pp = prompt processing, tg = text generation): | model | size | params | backend | threads | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | ------: | --------------: | -------------------: | -| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 4 | pp128 | 62.90 ± 0.08 | -| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 4 | pp512 | 57.63 ± 0.06 | -| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 4 | tg128 | 15.18 ± 0.02 | -| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 8 | pp128 | 116.23 ± 0.04 | -| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 8 | pp512 | 106.39 ± 0.03 | -| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 8 | tg128 | 25.29 ± 0.05 | -| llama 8B 
Q4_0 | 4.33 GiB | 8.03 B | CPU | 16 | pp128 | 206.67 ± 0.10 | -| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 16 | pp512 | 190.18 ± 0.03 | -| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 16 | tg128 | 40.99 ± 0.36 | +| arcee 4B Q4_0 | 2.50 GiB | 4.62 B | CPU | 4 | pp128 | 106.03 ± 0.21 | +| arcee 4B Q4_0 | 2.50 GiB | 4.62 B | CPU | 4 | pp256 | 102.82 ± 0.05 | +| arcee 4B Q4_0 | 2.50 GiB | 4.62 B | CPU | 4 | pp512 | 95.41 ± 0.18 | +| arcee 4B Q4_0 | 2.50 GiB | 4.62 B | CPU | 4 | tg128 | 24.15 ± 0.02 | +| arcee 4B Q4_0 | 2.50 GiB | 4.62 B | CPU | 8 | pp128 | 196.02 ± 0.42 | +| arcee 4B Q4_0 | 2.50 GiB | 4.62 B | CPU | 8 | pp256 | 190.23 ± 0.34 | +| arcee 4B Q4_0 | 2.50 GiB | 4.62 B | CPU | 8 | pp512 | 177.14 ± 0.31 | +| arcee 4B Q4_0 | 2.50 GiB | 4.62 B | CPU | 8 | tg128 | 40.86 ± 0.11 | +| arcee 4B Q4_0 | 2.50 GiB | 4.62 B | CPU | 16 | pp128 | 346.08 ± 0.62 | +| arcee 4B Q4_0 | 2.50 GiB | 4.62 B | CPU | 16 | pp256 | 336.72 ± 1.43 | +| arcee 4B Q4_0 | 2.50 GiB | 4.62 B | CPU | 16 | pp512 | 315.83 ± 0.22 | +| arcee 4B Q4_0 | 2.50 GiB | 4.62 B | CPU | 16 | tg128 | 62.39 ± 0.20 | Even with just four threads, the Q4_0 model achieves comfortable generation speeds. On larger instances, you can run multiple concurrent model processes to support parallel workloads. @@ -102,7 +105,7 @@ To reduce runtime, add the `--chunks` flag to evaluate a subset of the data. For ## Run the evaluation as a background script -Running a full perplexity evaluation on all three models takes about 5 hours. To avoid SSH timeouts and keep the process running after logout, wrap the commands in a shell script and run it in the background. +Running a full perplexity evaluation on all three models takes about 3 hours. To avoid SSH timeouts and keep the process running after logout, wrap the commands in a shell script and run it in the background. 
Create a script named ppl.sh: @@ -119,13 +122,13 @@ bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-Q4_0.gguf -f wikitext-2-raw/wik tail -f ppl.sh.log ``` -Here are the full results. +| Model | Generation speed (batch size 1, 16 vCPUs) | Memory Usage | Perplexity (Wikitext-2) | Perplexity Increase | +|:-------:|:----------------------:|:------------:|:----------:|:----------------------:| +| F16 | ~25 tokens per second | ~9 GB | 8.4612 +/- 0.06112 | 0 (baseline) | +| Q8_0 | ~40 tokens per second | ~5 GB | 8.4776 +/- 0.06128 | +0.19% | +| Q4_0 | ~60 tokens per second | ~3 GB | 9.1897 +/- 0.06604 | +8.6% | -| Model | Generation speed (tokens/s, 16 vCPUs) | Memory Usage | Perplexity (Wikitext-2) | -|:-------:|:----------------------:|:------------:|:----------:| -| F16 | ~15–16 | ~15 GB | TODO | -| Q8_0 | ~25 | ~8 GB | TODO | -| Q4_0 | ~40 | ~4.4 GB | TODO | +We can see that 8-bit quantization introduces negligible degradation. The 4-bit model does suffer more, but may still serve its purpose for simpler use cases. As always, you should run your own tests and make up your own mind. When you have finished your benchmarking and evaluation, make sure to terminate your AWS EC2 instance in the AWS Management Console to avoid incurring unnecessary charges for unused compute resources. diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/08_conclusion.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/08_conclusion.md index 2faa4f371d..fa343771a4 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/08_conclusion.md +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/08_conclusion.md @@ -9,7 +9,7 @@ layout: learningpathall ## Wrap up your AFM-4.5B deployment -Congratulations! You have completed the process of deploying the Arcee AFM-4.5B foundation model on AWS Graviton4. +Congratulations! 
You have completed the process of deploying the Arcee [AFM-4.5B](https://huggingface.co/arcee-ai/AFM-4.5B) foundation model on AWS Graviton4. Here’s a summary of what you built and how you can take your knowledge forward. @@ -29,8 +29,8 @@ Using this Learning Path, you have: The benchmarking results demonstrate the power of quantization and Arm-based computing: -- **Memory efficiency** – the 4-bit model uses only ~4.4 GB of RAM compared to ~15 GB for the full-precision version -- **Speed improvements** – inference with Q4_0 is 2–3x faster (40+ tokens/sec vs. 15–16 tokens/sec) +- **Memory efficiency** – the 4-bit model uses only ~3 GB of RAM compared to ~9 GB for the full-precision version +- **Speed improvements** – inference with Q4_0 is 2.5x faster (~60+ tokens/sec vs. 25 tokens/sec) - **Cost optimization** – lower memory needs enable smaller, more affordable instances - **Quality preservation** – the quantized models maintain strong perplexity scores, showing minimal quality loss @@ -63,4 +63,4 @@ Together, Arcee AI’s foundation models, Llama.cpp’s efficient runtime, and G From chatbots and content generation to research tools, this stack strikes a balance between performance, cost, and developer control. -For more information on Arcee AI, and how you can build high-quality, secure, and cost-efficient AI solutions, please visit www.arcee.ai. +For more information on Arcee AI, and how you can build high-quality, secure, and cost-efficient AI solutions, please visit [www.arcee.ai](https://www.arcee.ai). 
diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md index 7a7b23f235..9988ed3cb3 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md @@ -1,10 +1,6 @@ --- title: Deploy Arcee AFM-4.5B on Arm-based AWS Graviton4 with Llama.cpp -draft: true -cascade: - draft: true - minutes_to_complete: 30 who_is_this_for: This Learning Path is for developers and ML engineers who want to deploy Arcee's AFM-4.5B small language model on AWS Graviton4 instances using Llama.cpp. diff --git a/content/learning-paths/servers-and-cloud-computing/azure-vm/_index.md b/content/learning-paths/servers-and-cloud-computing/azure-vm/_index.md index 8a7c2f5afa..357a8bdcd5 100644 --- a/content/learning-paths/servers-and-cloud-computing/azure-vm/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/azure-vm/_index.md @@ -17,7 +17,7 @@ learning_objectives: prerequisites: - A [Microsoft Azure](https://azure.microsoft.com/) account with permission to create resources, including instances using Cobalt 100 processors. - - A local Linux machine with [QEMU](https://www.qemu.org/download/) and the [Azure CLI](/install-guides/azure-cli/) installed and authenticated. + - A Linux machine with [QEMU](https://www.qemu.org/download/) and the [Azure CLI](/install-guides/azure-cli/) installed and authenticated. 
author: Jason Andrews diff --git a/content/learning-paths/servers-and-cloud-computing/azure-vm/azure-vm.md b/content/learning-paths/servers-and-cloud-computing/azure-vm/azure-vm.md index f7c1a0e446..0159ccae7d 100644 --- a/content/learning-paths/servers-and-cloud-computing/azure-vm/azure-vm.md +++ b/content/learning-paths/servers-and-cloud-computing/azure-vm/azure-vm.md @@ -8,7 +8,7 @@ layout: learningpathall You can view the Azure Linux 3.0 project on [GitHub](https://github.com/microsoft/azurelinux). There are links to the ISO downloads in the project README. -Using QEMU, you can create a raw disk image and boot a virtual machine with the ISO to install the OS onto the disk. +Using QEMU, you can create a raw disk image and boot a virtual machine with the ISO to install the OS on the disk. Once the installation is complete, you can convert the raw disk to a fixed-size VHD, upload it to Azure Blob Storage, and then use the Azure CLI to create a custom Arm image. @@ -20,9 +20,9 @@ Use `wget` to download the Azure Linux ISO image file. wget https://aka.ms/azurelinux-3.0-aarch64.iso ``` -Use `qemu-img` to create a 32 GB raw disk image. +Use `qemu-img` to create a 32 GB empty raw disk image to install the OS. -This step creates a 32 GB empty raw disk image to install the OS. You can increase the disk size by modifying the value passed to `qemu-img`. +You can increase the disk size by modifying the value passed to `qemu-img`. ```bash qemu-img create -f raw azurelinux-arm64.raw 34359738368 @@ -46,7 +46,14 @@ qemu-system-aarch64 \ -device virtio-net-device,netdev=net0 ``` -Once the OS boots successfully, install the Azure Linux Agent for VM provisioning, and power off the VM. +Navigate through the installer by entering the hostname, username, and password for the custom image. +You should use the username of `azureuser` if you want to match the instructions on the following pages. + +Be patient, it takes some time to complete the full installation. 
+ +At the end of installation you are prompted for confirmation to reboot the system. + +Once the newly installed OS boots successfully, install the Azure Linux Agent for VM provisioning, and power off the VM. ```bash sudo dnf install WALinuxAgent -y @@ -55,6 +62,8 @@ sudo systemctl start waagent sudo poweroff ``` +Be patient, it takes some time to install the packages and power off. + ## Convert the raw disk to VHD Format Now that the raw disk image is ready to be used, convert the image to fixed-size VHD, making it compatible with Azure. @@ -64,7 +73,7 @@ qemu-img convert -f raw -o subformat=fixed,force_size -O vpc azurelinux-arm64.ra ``` {{% notice Note %}} -VHD files have 512 bytes of footer attached at the end. The `force_size` flag ensures that the exact virtual size specified is used for the final VHD file. Without this, QEMU may round the size or adjust for footer overhead (especially when converting from raw to VHD). The `force_size` flag forces the final image to match the original size. This flag helps make the final VHD size a clean, whole number in MB or GiB, which is required for Azure. +VHD files have 512 bytes of footer attached at the end. The `force_size` flag ensures that the exact virtual size specified is used for the final VHD file. Without this, QEMU may round the size or adjust for footer overhead (especially when converting from raw to VHD). The `force_size` flag forces the final image to match the original size. This flag makes the final VHD size a whole number in MB or GB, which is required for Azure. {{% /notice %}} Next, you can save the image in your Azure account. 
diff --git a/content/learning-paths/servers-and-cloud-computing/azure-vm/background.md b/content/learning-paths/servers-and-cloud-computing/azure-vm/background.md index dda13eaf11..fa9b4854f7 100644 --- a/content/learning-paths/servers-and-cloud-computing/azure-vm/background.md +++ b/content/learning-paths/servers-and-cloud-computing/azure-vm/background.md @@ -36,8 +36,8 @@ You should see an output similar to: ```output { - "azure-cli": "2.61.0", - "azure-cli-core": "2.61.0", + "azure-cli": "2.75.0", + "azure-cli-core": "2.75.0", "azure-cli-telemetry": "1.1.0", "extensions": {} } diff --git a/content/learning-paths/servers-and-cloud-computing/azure-vm/save-image.md b/content/learning-paths/servers-and-cloud-computing/azure-vm/save-image.md index a9f52f461b..ab66336077 100644 --- a/content/learning-paths/servers-and-cloud-computing/azure-vm/save-image.md +++ b/content/learning-paths/servers-and-cloud-computing/azure-vm/save-image.md @@ -37,6 +37,14 @@ VM_SIZE="Standard_D4ps_v6" You can modify the environment variables such as RESOURCE_GROUP, VM_NAME, and LOCATION based on your naming preferences, region, and resource requirements. {{% /notice %}} +Make sure to login to Azure using the CLI. + +```bash +az login +``` + +If a link is printed, open it in a browser and enter the provided code to authenticate. + Create a new resource group. If you are using an existing resource group for the RESOURCE_GROUP environment variable you can skip this step. ```bash @@ -88,41 +96,41 @@ az sig create \ Create the image definition. 
```bash -az sig image-definition create - --resource-group "$RESOURCE_GROUP" - --gallery-name "$GALLERY_NAME" - --gallery-image-definition "$IMAGE_DEF_NAME" - --publisher "$PUBLISHER" - --offer "$OFFER" - --sku "$SKU" - --os-type "$OS_TYPE" - --architecture "$ARCHITECTURE" +az sig image-definition create \ + --resource-group "$RESOURCE_GROUP" \ + --gallery-name "$GALLERY_NAME" \ + --gallery-image-definition "$IMAGE_DEF_NAME" \ + --publisher "$PUBLISHER" \ + --offer "$OFFER" \ + --sku "$SKU" \ + --os-type "$OS_TYPE" \ + --architecture "$ARCHITECTURE" \ --hyper-v-generation "$HYPERV_GEN" ``` Create the image version to register the VHD as a version of the custom image. ```bash -az sig image-version create - --resource-group "$RESOURCE_GROUP" - --gallery-name "$GALLERY_NAME" - --gallery-image-definition "$IMAGE_DEF_NAME" - --gallery-image-version "$IMAGE_VERSION" - --location "$LOCATION" - --os-vhd-uri "[https://${STORAGE_ACCOUNT}.blob.core.windows.net/${CONTAINER_NAME}/${VHD_NAME](https://${storage_account}.blob.core.windows.net/$%7BCONTAINER_NAME%7D/$%7BVHD_NAME)}" - --os-vhd-storage-account "$STORAGE_ACCOUNT" +az sig image-version create \ + --resource-group "$RESOURCE_GROUP" \ + --gallery-name "$GALLERY_NAME" \ + --gallery-image-definition "$IMAGE_DEF_NAME" \ + --gallery-image-version "$IMAGE_VERSION" \ + --location "$LOCATION" \ + --os-vhd-uri "https://${STORAGE_ACCOUNT}.blob.core.windows.net/${CONTAINER_NAME}/${VHD_NAME}" \ + --os-vhd-storage-account "$STORAGE_ACCOUNT" \ --storage-account-type "$STORAGE_ACCOUNT_TYPE" ``` Once the image has been versioned, you can retrieve the unique image ID for use in VM creation. 
```bash -IMAGE_ID=$(az sig image-version show - --resource-group "$RESOURCE_GROUP" - --gallery-name "$GALLERY_NAME" - --gallery-image-definition "$IMAGE_DEF_NAME" - --gallery-image-version "$IMAGE_VERSION" +IMAGE_ID=$(az sig image-version show \ + --resource-group "$RESOURCE_GROUP" \ + --gallery-name "$GALLERY_NAME" \ + --gallery-image-definition "$IMAGE_DEF_NAME" \ + --gallery-image-version "$IMAGE_VERSION" \ --query "id" -o tsv) ``` -Next, you can create a virtual machine with the new image using the image ID. \ No newline at end of file +Next, you can create a virtual machine with the new image using the image ID. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/azure-vm/start-vm.md b/content/learning-paths/servers-and-cloud-computing/azure-vm/start-vm.md index cb28839882..c8592c1f96 100644 --- a/content/learning-paths/servers-and-cloud-computing/azure-vm/start-vm.md +++ b/content/learning-paths/servers-and-cloud-computing/azure-vm/start-vm.md @@ -38,4 +38,16 @@ Use the public IP address to SSH to the VM. Replace `` with t ssh azureuser@ ``` +After you login, print the machine information. + +```bash +uname -a +``` + +The output is similar to: + +```output +Linux MyAzureLinuxARMVM 6.6.92.2-2.azl3 #1 SMP Wed Jul 2 02:43:35 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux +``` + You are ready to use your Azure Linux virtual machine. diff --git a/content/learning-paths/servers-and-cloud-computing/cca-device-attach/1-introduction.md b/content/learning-paths/servers-and-cloud-computing/cca-device-attach/1-introduction.md new file mode 100644 index 0000000000..364ce03d32 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/cca-device-attach/1-introduction.md @@ -0,0 +1,30 @@ +--- +title: "About CCA Realms" +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +Arm’s *Realm Management Extension (RME)* is a key security feature introduced in +the Armv9-A architecture. 
It enables a new form of hardware-enforced isolation +designed to support Confidential Computing. It defines the set of hardware +features and properties that are required to comply with Arm's Confidential +Computing Architecture (CCA). + +At the heart of RME is the concept of a Realm, a protected execution +environment that operates independently from the conventional Normal World +(used by operating systems and applications) and the Secure World (used by +trusted firmware or TEE). Realms are managed by a new privileged entity called +the Realm Management Monitor (RMM) and are enforced by the hardware via the +Granule Protection Table (GPT) and Granule Transitioning mechanism. + +Realms allow lower-privileged software, such as an application or a virtual +machine, to protect its content and execution from attacks by higher-privileged +software, such as an OS or a hypervisor. Realms provide an environment for +confidential computing, without requiring the Realm owner to trust the software +components that manage the resources that the Realm uses. + +To be useful, a Realm has to interact with the rest of the world at some point. +For example, a network interface is likely to be needed. This Learning Path will +show you how devices are attached and used by Realms. diff --git a/content/learning-paths/servers-and-cloud-computing/cca-device-attach/2-virtio.md b/content/learning-paths/servers-and-cloud-computing/cca-device-attach/2-virtio.md new file mode 100644 index 0000000000..d9c1aeef5a --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/cca-device-attach/2-virtio.md @@ -0,0 +1,339 @@ +--- +title: "Device attach: virtio" +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +This section provides a high-level overview of VirtIO and Bounce Buffers, and how they +relate to CCA Realms. + +A Realm has to use physical devices at some point to interact with the external +and/or physical world. 
The easiest way to do this is by using VirtIO, which +provides a fast high level emulation layer. This can be viewed as the first level of +device attach. + +More evolved device attach features can be +performed leveraging hardware security features like PCIe-TDISP (**T**EE +**D**evice **I**nterface **S**ecurity **P**rotocol) and PCIe-IDE (**I**ntegrity +and **D**ata **E**ncryption), where the host OS can assign a physical device to +a realm, which will be able to make security measurements on the physical device +and include it in its base and measurements. + +## VirtIO + +### What is VirtIO ? + +VirtIO is an abstraction layer for virtual devices in virtualized environments. +It provides standardized and efficient interfaces between guest virtual machines +(VMs) and host devices, making it easier to develop paravirtualized drivers. +Paravirtualized means that the guest OS is aware it’s running in a virtualized +environment and can use optimized drivers (VirtIO) to communicate with virtual +hardware. Emulating hardware devices (like NICs or disks) for VMs is slow and +inefficient. VirtIO provides a standardized and efficient interface that allows +VMs to bypass full device emulation and instead use optimized drivers. + +VirtIO is most commonly used with KVM/QEMU virtualization. Example drivers are: +- `virtio-net`: Paravirtualized networking +- `virtio-blk`: Block device (disk) access +- `virtio-fs`: File sharing (host ↔ guest) +- `virtio-balloon`: Dynamic memory management +- `virtio-rng`: Random number source +- `virtio-console`: Simple console interface +- ... + +### How VirtIO works in VMs + +Here is an overview of how VirtIO works in Virtual Machines: + +1. The Host Hypervisor (e.g., QEMU/KVM) exposes VirtIO “backend” devices. +2. The guest OS loads VirtIO _frontend_ drivers (e.g., `virtio_net`, + `virtio_blk`) that communicate using the VirtIO protocol. +3. 
Communication happens via shared memory (`virtqueues`) for I/O operations, + avoiding full device emulation. +4. Devices are exposed over the PCI or MMIO bus to the guest. + +For example, instead of emulating an Intel e1000 NIC, the host exposes a +`virtio-net` interface to the guest OS and the guest OS uses the `virtio-net` +driver to send/receive packets via shared buffers. + +## Bounce buffers + +### What are bounce buffers? + +Bounce buffers are temporary memory buffers used in the Linux kernel to handle situations where direct memory access (DMA) can’t be performed directly on the original data buffer. This often happens because: +1. The original buffer is not physically contiguous. +2. The buffer is in high memory or not accessible to the device. +3. The buffer doesn’t meet alignment or boundary requirements of the device. + +### Why bounce buffers? + +Data _bounces_ between: +- The original buffer (in user/kernel space) and +- The DMA-capable bounce buffer (used for I/O with the device) + +This ensures that data transfers can still happen even when the original memory +is not suitable or accessible for transfers. + +## CCA Realms, VirtIO and bounce buffers + +The defining feature of a Realm is that its memory (called *Realm memory*) is +cryptographically isolated from both the Normal and Secure Worlds. This means +that: +- Realm memory is encrypted using keys that are unique to each Realm. +- Non-Realm entities (like the host OS or hypervisor) cannot directly read or + write Realm memory. +- Even Direct Memory Access (DMA) from peripherals or untrusted drivers cannot + access Realm data. + +This design ensures confidentiality but introduces a problem: How can Realms +interact with untrusted components, such as: +- Network stacks in the host OS, +- Storage subsystems, +- I/O devices managed by untrusted drivers? + +The solution to safely exchange data between a Realm and the outside World is to +use bounce buffers as an intermediary. 
+ +### How bounce buffers are used with RME + +1. Exporting Data: + - A Realm application prepares some data (e.g., results of computation). + - It copies this data from protected Realm memory into a bounce buffer. + - The Realm notifies the untrusted host or hypervisor that the data is ready. + - The host retrieves the data from the bounce buffer. + +2. Importing Data: + - The host places data (e.g., input from a file or device) into a bounce buffer. + - The Realm is notified and validates the source. + - The Realm copies the data from the bounce buffer into its protected memory. + +This pattern preserves confidentiality and integrity of Realm data, since: +- The Realm never allows direct access to its memory. +- It can validate and sanitize any data received via bounce buffers. +- No sensitive data is exposed without explicit copying. + +### Confidentiality preserved with bounce buffers, really? + +In the previous section, it was mentioned that bounce buffers preserve +confidentiality. Let's dive a little deeper into that. Bounce buffers +are nothing more than an explicitly shared temporary area between the Realm +world and the outside world. This does indeed preserve the confidentiality of +all the rest of the Realm data. On the other hand, for the data being +transferred, it is leaving the Realm world and will only remain confidential if it +is encrypted in some way, e.g. for network traffic, TLS should be used. + +## Seeing a Realm's bounce buffers at work + +Let's put this to work and check for ourselves that bounce buffers are used. The +steps in this section will build on the Key Broker demo that was used in the [CCA +Essentials Learning Path](/learning-paths/servers-and-cloud-computing/cca-essentials/example/), +demonstrating an end-to-end attestation. 
+ +### Start the Key Broker Server (KBS) + +First, pull the docker container image with the pre-built KBS, and then run the container: + +```bash +docker pull armswdev/cca-learning-path:cca-key-broker-v2 +docker run --rm -it armswdev/cca-learning-path:cca-key-broker-v2 +``` + +Now within your running docker container, get a list of network interfaces: + +```bash +ip -c a +``` + +The output should look like: + +```output +1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 + link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 + inet 127.0.0.1/8 scope host lo + valid_lft forever preferred_lft forever + inet6 ::1/128 scope host + valid_lft forever preferred_lft forever +20: eth0@if21: mtu 1500 qdisc noqueue state UP group default + link/ether 02:42:ac:11:00:02 brd ff:ff:ff:ff:ff:ff link-netnsid 0 + inet 172.17.0.2/16 brd 172.17.255.255 scope global eth0 + valid_lft forever preferred_lft forever +``` + +Start the KBS on the `eth0` network interface, and replace 172.17.0.2 shown in +the command below with the IP address corresponding to eth0 in the output of `ip +-c a` above. + +```bash +./keybroker-server -v --addr 172.17.0.2 +``` + +The output should look like: + +```output +INFO starting 16 workers +INFO Actix runtime found; starting in Actix runtime +INFO starting service: "actix-web-service-172.17.0.2:8088", workers: 16, listening on: 172.17.0.2:8088 +``` + +### Get into a Realm + +With the Key Broker Server running in one terminal, open up a new terminal in +which you will run the Key Broker Client (KBC). The intent is to +observe that the data transmitted over the network (thru `virtio_net`) are +indeed using bounce buffers. 
+ +Pull the docker container image with the pre-built KBC, and then run the container: + +```bash +docker pull armswdev/cca-learning-path:cca-simulation-v2 +docker run --rm -it armswdev/cca-learning-path:cca-simulation-v2 +``` + +Within the running container, launch the `run-cca-fvp.sh` script to run the Arm +CCA pre-built binaries on the FVP: + +```bash +./run-cca-fvp.sh +``` +The `run-cca-fvp.sh` script uses the screen command to connect to the different +UARTs in the FVP. + +You should see the host Linux kernel boot on your terminal and you will be +prompted to log in to the host. Enter root as the username: + +```output +[ 4.169458] Run /sbin/init as init process +[ 4.273748] EXT4-fs (vda): re-mounted 64d1bcff-5d03-412c-83c6-48ec4253590e r/w. Quota mode: none. +Starting syslogd: OK +Starting klogd: OK +Running sysctl: OK +Starting network: [ 5.254843] smc91x 1a000000.ethernet eth0: link up, 10Mbps, half-duplex, lpa 0x0000 +udhcpc: started, v1.36.1 +udhcpc: broadcasting discover +udhcpc: broadcasting select for 172.20.51.1, server 172.20.51.254 +udhcpc: lease of 172.20.51.1 obtained from 172.20.51.254, lease time 86400 +deleting routers +adding dns 172.20.51.254 +OK + +Welcome to the CCA host +host login: root +(host) # +``` + +Change directory to `/cca` and use `lkvm` to launch a guest Linux in a Realm: +```bash +cd /cca +./lkvm run --realm --disable-sve --irqchip=gicv3-its --firmware KVMTOOL_EFI.fd -c 1 -m 512 --no-pvtime --disk guest-disk.img --restricted_mem --virtio-transport pci --pmu --network mode=user +``` + +You should see the realm boot. Note that `lkvm` is invoked with `--network +mode=user`, which makes the guest see the network through a VirtIO device. + +After boot up, which might take some time, you will be prompted to log in at the +guest Linux prompt. 
Use root again as the username: + +```output +Starting syslogd: OK +Starting klogd: OK +Running sysctl: OK +Starting network: udhcpc: started, v1.36.1 +udhcpc: broadcasting discover +udhcpc: broadcasting select for 192.168.33.15, server 192.168.33.1 +udhcpc: lease of 192.168.33.15 obtained from 192.168.33.1, lease time 14400 +deleting routers +adding dns 172.20.51.254 +OK + +Welcome to the CCA realm +realm login: root +(realm) # +``` + +### Observe bounce buffer usage in the realm + +First, check that the Linux kernel has tracing support: + +```bash { output_lines="2-46" } +ls /sys/kernel/debug/tracing/events/ +9p i2c_slave qcom_glink +alarmtimer icmp qcom_smp2p +asoc initcall qdisc +block interconnect ras +bpf_test_run io_uring raw_syscalls +bpf_trace iomap rcu +bridge iommu regmap +capability ipi regulator +cgroup irq rpcgss +chipidea jbd2 rpm +clk kmem rpmh +cma ksm rseq +compaction kvm rtc +cpuhp kyber sched +cros_ec libata scmi +csd lock scsi +dev lockd signal +devfreq maple_tree skb +devlink mdio smbus +dma memcg sock +dma_fence migrate spi +dpaa2_eth mmap spmi +dpaa_eth mmap_lock sunrpc +dwc3 mmc swiotlb +e1000e_trace module task +enable mtu3 tcp +error_report musb tegra_apb_dma +ext4 napi thermal +fib neigh thermal_power_allocator +filelock net thp +filemap netfs timer +fsl_edma netlink timer_migration +ftrace nfs timestamp +gadget nfs4 tlb +gpio notifier udp +gpu_mem oom ufs +handshake optee vmalloc +header_event page_isolation vmscan +header_page page_pool watchdog +hns3 pagemap workqueue +huge_memory percpu writeback +hugetlbfs power xdp +hw_pressure printk xhci-hcd +hwmon pwm +i2c qcom_aoss +``` + +As shown above, you should see a list of the available trace +points. 
+ +Now, enable the kernel tracing infrastructure together with bounce buffer +tracing, read the trace in the background (filtering on `keybroker-app-`) and +run the Key Broker Client application in the realm, using the endpoint address +that the Key Broker Server is listening on (from the other terminal): + +```bash +echo 1 > /sys/kernel/debug/tracing/tracing_on +echo 1 > /sys/kernel/debug/tracing/events/swiotlb/enable +grep keybroker-app- /sys/kernel/debug/tracing/trace_pipe & +keybroker-app -v --endpoint http://172.17.0.2:8088 skywalker +``` + +In the `keybroker-app` command above, `skywalker` is the key name that is +requested from the KBS. + +The output should look like: + +```output +INFO Requesting key named 'skywalker' from the keybroker server with URL http://172.17.0.2:8088/keys/v1/key/skywalker +INFO Challenge (64 bytes) = [5c, ec, 1e, f5, 93, 54, 4a, 8a, ee, 2e, 46, a0, 50, 0d, 41, dd, d4, 60, b0, 58, 5b, 51, 71, 76, d1, 66, d3, b7, 38, e8, af, ae, 0a, 07, 4e, c5, 60, dc, 4a, c0, b8, 73, 98, d9, bd, af, 41, 96, 99, 6d, 74, cc, 19, 70, 24, c4, c9, 5c, 21, 61, 1a, cb, 76, 75] +INFO Submitting evidence to URL http://172.17.0.2:8088/keys/v1/evidence/1928844131 +INFO Attestation success :-) ! The key returned from the keybroker is 'May the force be with you.' + keybroker-app-143 [000] b..2. 1772.607321: swiotlb_bounced: dev_name: 0000:00:00.0 dma_mask=ffffffffffffffff dev_addr=80b6717e size=66 FORCE + keybroker-app-143 [000] b..2. 1772.644478: swiotlb_bounced: dev_name: 0000:00:00.0 dma_mask=ffffffffffffffff dev_addr=80b6717e size=66 FORCE +``` + +Note that the interleaving of the trace messages and KBC messages might differ +from one run to another. With the `swiotlb_bounced` messages above you can observe that the bounce buffers are being used in the realm. 
+ diff --git a/content/learning-paths/servers-and-cloud-computing/cca-device-attach/_index.md b/content/learning-paths/servers-and-cloud-computing/cca-device-attach/_index.md new file mode 100644 index 0000000000..922c1278fa --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/cca-device-attach/_index.md @@ -0,0 +1,61 @@ +--- +title: CCA device attach + +draft: true +cascade: + draft: true + +minutes_to_complete: 90 + +who_is_this_for: This is an advanced topic for developers who want to learn about Arm CCA Realms, VirtIO, Bounce Buffers and Device Attach. + +learning_objectives: + - Have a high level understanding of VirtIO and Bounce Buffers in CCA Realms + - Understand Device Attach in general, and specifically for different types of devices + +prerequisites: + - An AArch64 or x86_64 computer running Linux or macOS. You can use cloud instances, see this list of [Arm cloud service providers](/learning-paths/servers-and-cloud-computing/csp/). + - Completion of [Get Started with CCA Attestation and Veraison](/learning-paths/servers-and-cloud-computing/cca-veraison) Learning Path. + - Completion of the [Run an application in a Realm using the Arm Confidential Computing Architecture (CCA)](/learning-paths/servers-and-cloud-computing/cca-container/) Learning Path. + - Completion of the [Run an end-to-end Attestation Flow](/learning-paths/servers-and-cloud-computing/cca-essentials/) Learning Path. 
+ +author: Arnaud de Grandmaison + +### Tags +skilllevels: Advanced +subjects: Performance and Architecture +armips: + - Neoverse + - Cortex-A +operatingsystems: + - Linux + - macOS +tools_software_languages: + - CCA + - RME + - Docker + +further_reading: + - resource: + title: Arm Confidential Compute Architecture + link: https://www.arm.com/architecture/security-features/arm-confidential-compute-architecture + type: website + - resource: + title: Arm Confidential Compute Architecture open source enablement + link: https://www.youtube.com/watch?v=JXrNkYysuXw + type: video + - resource: + title: Learn the architecture - Realm Management Extension + link: https://developer.arm.com/documentation/den0126 + type: documentation + - resource: + title: Realm Management Monitor specification + link: https://developer.arm.com/documentation/den0137/latest/ + type: documentation + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/cca-device-attach/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/cca-device-attach/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/cca-device-attach/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. 
+title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/cca-essentials/example.md b/content/learning-paths/servers-and-cloud-computing/cca-essentials/example.md index 276279ced7..2d7283e55d 100644 --- a/content/learning-paths/servers-and-cloud-computing/cca-essentials/example.md +++ b/content/learning-paths/servers-and-cloud-computing/cca-essentials/example.md @@ -12,17 +12,17 @@ layout: "learningpathall" #### Background -The concept of a Key Broker Server (KBS) is a common one in confidential computing, and there are multiple open-source implementations, including the [Trustee](https://github.com/confidential-containers/trustee) from the [CNCF Confidential Containers](https://confidentialcontainers.org/) project. +The concept of a Key Broker Server (KBS) is a common one in confidential computing, and there are multiple open-source implementations, including the [Trustee](https://github.com/confidential-containers/trustee) from the [CNCF Confidential Containers](https://confidentialcontainers.org/) project. -The KBS in this Learning Path is part of the [Veraison](https://github.com/veraison) project. It has been created specifically for educational purposes, so is intentionally small and simple to understand, and is not designed for production use. +The KBS in this Learning Path is part of the [Veraison](https://github.com/veraison) project. It has been created specifically for educational purposes, so is intentionally small and simple to understand, and is not designed for production use. 
#### Get started First, pull the docker container image with the pre-built KBS, and then run the container: ```bash -docker pull armswdev/cca-learning-path:cca-key-broker-v1 -docker run --rm -it armswdev/cca-learning-path:cca-key-broker-v1 +docker pull armswdev/cca-learning-path:cca-key-broker-v2 +docker run --rm -it armswdev/cca-learning-path:cca-key-broker-v2 ``` Now within your running docker container, get a list of network interfaces: @@ -45,7 +45,7 @@ The output should look like: inet 172.17.0.2/16 brd 172.17.255.255 scope global eth0 valid_lft forever preferred_lft forever ``` -Start the KBS on the `eth0` network interface, and replace 172.17.0.2 shown in the command below with the IP address corresponding to eth0 in the output of "ip -c a" above. +Start the KBS on the `eth0` network interface, and replace 172.17.0.2 shown in the command below with the IP address corresponding to eth0 in the output of `ip -c a` above. ```bash ./keybroker-server -v --addr 172.17.0.2 @@ -66,12 +66,12 @@ With the Key Broker Server running in one terminal, open up a new terminal in wh In the new terminal that you have just opened, pull the docker container image that contains the FVP and pre-built software binaries to run the Key Broker Client in a realm. ```bash -docker pull armswdev/cca-learning-path:cca-simulation-v1 +docker pull armswdev/cca-learning-path:cca-simulation-v2 ``` Now run the docker container: ```bash -docker run --rm -it armswdev/cca-learning-path:cca-simulation-v1 +docker run --rm -it armswdev/cca-learning-path:cca-simulation-v2 ``` Within your running container, launch the `run-cca-fvp.sh` script to run the Arm CCA pre-built binaries on the FVP: @@ -79,11 +79,11 @@ Within your running container, launch the `run-cca-fvp.sh` script to run the Arm ```bash ./run-cca-fvp.sh ``` -The run-cca-fvp.sh script uses the screen command to connect to the different UARTs in the FVP. 
+The `run-cca-fvp.sh` script uses the screen command to connect to the different UARTs in the FVP. -You should see the host Linux kernel boot on your terminal. +You should see the host Linux kernel boot on your terminal. -You will be prompted to log in to the host. +You will be prompted to log in to the host. Enter root as the username: @@ -109,11 +109,11 @@ host login: root Use kvmtool to launch guest Linux in a Realm: ```bash cd /cca -./lkvm run --realm --disable-sve --irqchip=gicv3-its --firmware KVMTOOL_EFI.fd -c 1 -m 512 --no-pvtime --force-pci --disk guest-disk.img --measurement-algo=sha256 --restricted_mem +./lkvm run --realm --disable-sve --irqchip=gicv3-its --firmware KVMTOOL_EFI.fd -c 1 -m 512 --no-pvtime --force-pci --disk guest-disk.img --measurement-algo=sha256 --restricted_mem ``` -You should see the realm boot. +You should see the realm boot. -After boot up, you will be prompted to log in at the guest Linux prompt. +After boot up, which might take some time, you will be prompted to log in at the guest Linux prompt. Use root again as the username: @@ -134,15 +134,14 @@ realm login: root (realm) # ``` -Now run the Key Broker Client application in the realm. +Now run the Key Broker Client application in the realm. Use the endpoint address that the Key Broker Server is listening in on the other terminal: ```bash -cd /cca -./keybroker-app -v --endpoint http://172.17.0.2:8088 skywalker +keybroker-app -v --endpoint http://172.17.0.2:8088 skywalker ``` -In the command above, `skywalker` is the key name that is requested from the Key Broker Server. +In the command above, `skywalker` is the key name that is requested from the Key Broker Server. After some time, you should see the following output: ``` @@ -151,7 +150,7 @@ INFO Challenge (64 bytes) = [0f, ea, c4, e2, 24, 4e, fa, dc, 1d, ea, ea, 3d, 60, INFO Submitting evidence to URL http://172.17.0.2:8088/keys/v1/evidence/3974368321 INFO Attestation failure :-( ! 
In a production environment, the known good reference value is generated using a deployment-specific process, but for demonstration purposes and simplification, you will use the value proposed by the Key Broker Server.
-Notice here that you need to copy the `--reference-values` argument directly from the previous error message reported by the Key Broker. +Notice here that you need to copy the `--reference-values` argument directly from the previous error message reported by the Key Broker. When running the next command, ensure that you are copying the exact value reported, for example: @@ -183,7 +182,7 @@ When running the next command, ensure that you are copying the exact value repor On the terminal with the running realm, rerun the Key Broker Client application with the exact same command line parameters as before: ```bash -./keybroker-app -v --endpoint http://172.17.0.2:8088 skywalker +keybroker-app -v --endpoint http://172.17.0.2:8088 skywalker ``` You should now get a successful attestation as shown: diff --git a/content/learning-paths/servers-and-cloud-computing/distributed-inference-with-llama-cpp/_index.md b/content/learning-paths/servers-and-cloud-computing/distributed-inference-with-llama-cpp/_index.md new file mode 100644 index 0000000000..4871cd65e6 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/distributed-inference-with-llama-cpp/_index.md @@ -0,0 +1,50 @@ +--- +title: Distributed inference using llama.cpp + +draft: true +cascade: + draft: true + +minutes_to_complete: 30 + +who_is_this_for: This learning path is for developers with some experience using llama.cpp who want to learn about distributed inference. + +learning_objectives: + - Set up the main host and worker nodes using llama.cpp + - Run a large quantized model (e.g., Llama 3.1 405B) on CPUs in a distributed manner on Arm machines + +prerequisites: + - An AWS Graviton4 c8g.16xlarge instance to test Arm performance optimizations, or any [Arm based instance](/learning-paths/servers-and-cloud-computing/csp/) from a cloud service provider or an on-premise Arm server. 
 - Familiarity with [Deploy a Large Language Model (LLM) chatbot with llama.cpp using KleidiAI on Arm servers](/learning-paths/servers-and-cloud-computing/llama-cpu)
+--- diff --git a/content/learning-paths/servers-and-cloud-computing/distributed-inference-with-llama-cpp/example-picture.png b/content/learning-paths/servers-and-cloud-computing/distributed-inference-with-llama-cpp/example-picture.png new file mode 100644 index 0000000000..c69844bed4 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/distributed-inference-with-llama-cpp/example-picture.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/distributed-inference-with-llama-cpp/how-to-1.md b/content/learning-paths/servers-and-cloud-computing/distributed-inference-with-llama-cpp/how-to-1.md new file mode 100644 index 0000000000..6838a42e06 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/distributed-inference-with-llama-cpp/how-to-1.md @@ -0,0 +1,64 @@ +--- +title: Overview and Worker Node Configuration +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Before you begin +The instructions in this Learning Path are for any Arm server running Ubuntu 24.04.2 LTS. You will need at least three Arm server instances with at least 64 cores and 128GB of RAM to run this example. The instructions have been tested on an AWS Graviton4 c8g.16xlarge instance + +## Overview +llama.cpp is a C++ library that enables efficient inference of LLaMA and similar large language models on CPUs, optimized for local and embedded environments. Just over a year ago from its publication date, rgerganov’s RPC code was merged into llama.cpp, enabling distributed inference of large LLMs across multiple CPU-based machines—even when the models don’t fit into the memory of a single machine. In this learning path, we’ll explore how to run a 405B parameter model on Arm-based CPUs. 
2. Now we can build the llama.cpp library with the RPC feature enabled by compiling it with the -DGGML_RPC=ON flag
Use the following command to start listening on the worker nodes:
master_ip="172.31.110.10"
+The model file for this experiment is hosted on Arm’s private AWS S3 bucket. If you don’t have access to it, you can find a publicly available version of the model on Hugging Face. +The output: +```output +build: 5935 (2adf8d83) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for aarch64-linux-gnu +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device RPC[172.31.110.11:50052] (RPC[172.31.110.11:50052]) - 126497 MiB free +llama_model_load_from_file_impl: using device RPC[172.31.110.12:50052] (RPC[172.31.110.12:50052]) - 126497 MiB free +llama_model_loader: loaded meta data with 30 key-value pairs and 1138 tensors from /home/ubuntu/Llama-3.1-405B_Q4_0.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama Hf +llama_model_loader: - kv 3: general.size_label str = 406B +llama_model_loader: - kv 4: general.license str = llama3.1 +llama_model_loader: - kv 5: general.tags arr[str,6] = ["facebook", "meta", "pytorch", "llam... +llama_model_loader: - kv 6: general.languages arr[str,8] = ["en", "de", "fr", "it", "pt", "hi", ... 
+llama_model_loader: - kv 7: llama.block_count u32 = 126 +llama_model_loader: - kv 8: llama.context_length u32 = 131072 +llama_model_loader: - kv 9: llama.embedding_length u32 = 16384 +llama_model_loader: - kv 10: llama.feed_forward_length u32 = 53248 +llama_model_loader: - kv 11: llama.attention.head_count u32 = 128 +llama_model_loader: - kv 12: llama.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 13: llama.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 14: llama.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 15: llama.attention.key_length u32 = 128 +llama_model_loader: - kv 16: llama.attention.value_length u32 = 128 +llama_model_loader: - kv 17: llama.vocab_size u32 = 128256 +llama_model_loader: - kv 18: llama.rope.dimension_count u32 = 128 +llama_model_loader: - kv 19: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 20: tokenizer.ggml.pre str = llama-bpe +llama_model_loader: - kv 21: tokenizer.ggml.tokens arr[str,128256] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,128256] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 23: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... 
+llama_model_loader: - kv 24: tokenizer.ggml.bos_token_id u32 = 128000 +llama_model_loader: - kv 25: tokenizer.ggml.eos_token_id u32 = 128001 +llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 27: tokenizer.ggml.add_sep_token bool = false +llama_model_loader: - kv 28: general.quantization_version u32 = 2 +llama_model_loader: - kv 29: general.file_type u32 = 2 +llama_model_loader: - type f32: 254 tensors +llama_model_loader: - type q4_0: 883 tensors +llama_model_loader: - type q6_K: 1 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q4_0 +print_info: file size = 213.13 GiB (4.51 BPW) +load: special tokens cache size = 256 +load: token to piece cache size = 0.7999 MB +print_info: arch = llama +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 16384 +print_info: n_layer = 126 +print_info: n_head = 128 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 16 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 53248 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = ?B +print_info: model params = 405.85 B +print_info: general.name = Llama Hf +print_info: vocab type = BPE +print_info: n_vocab = 128256 +print_info: n_merges = 280147 +print_info: BOS token = 128000 '<|begin_of_text|>' 
+print_info: EOS token = 128001 '<|end_of_text|>' +print_info: EOT token = 128009 '<|eot_id|>' +print_info: EOM token = 128008 '<|eom_id|>' +print_info: LF token = 198 'Ċ' +print_info: EOG token = 128001 '<|end_of_text|>' +print_info: EOG token = 128008 '<|eom_id|>' +print_info: EOG token = 128009 '<|eot_id|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = true) +.................................................................................................... +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 0 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: CPU output buffer size = 0.49 MiB +llama_kv_cache_unified: RPC[172.31.110.11:50052] KV buffer size = 800.00 MiB +llama_kv_cache_unified: RPC[172.31.110.12:50052] KV buffer size = 784.00 MiB +llama_kv_cache_unified: CPU KV buffer size = 432.00 MiB +llama_kv_cache_unified: size = 2016.00 MiB ( 4096 cells, 126 layers, 1/ 1 seqs), K (f16): 1008.00 MiB, V (f16): 1008.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: RPC[172.31.110.11:50052] compute buffer size = 1160.00 MiB +llama_context: RPC[172.31.110.12:50052] compute buffer size = 1160.00 MiB +llama_context: CPU compute buffer size = 1160.01 MiB +llama_context: graph nodes = 4668 +llama_context: graph splits = 4 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eom_id|> logit bias = -inf 
+common_init_from_params: added <|eot_id|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) +main: llama threadpool init, n_threads = 64 + +system_info: n_threads = 64 (n_threads_batch = 64) / 64 | CPU : NEON = 1 | ARM_FMA = 1 | FP16_VA = 1 | MATMUL_INT8 = 1 | SVE = 1 | DOTPROD = 1 | SVE_CNT = 16 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 4077122424 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 128, n_keep = 1 + +Tell me a joke! (or a funny story) +Thread starter Fiver +This thread is for any jokes you may want to share with other members. Please keep them clean! +Reactions: Fiver +A duck walks into a bar, and asks the bartender, "Have you got any bread?" +The bartender says, "No, we don't have any bread." +The duck leaves. +A few minutes later, the duck returns, and asks the bartender, "Have you got any bread?" +The bartender says, "No, I told you, we don't have any bread." 
That's it! You have successfully run the Llama-3.1-405B model on CPUs with the power of llama.cpp RPC functionality.

+ +| Log Line | Description | +|-------------------|-----------------------------------------------------------------------------| +| sampling time | Time spent choosing next tokens using sampling strategy (e.g., top-k, top-p). | +| load time | Time to load the model into memory and initialize weights/buffers. | +| prompt eval time | Time to process the input prompt tokens before generation (fills KV cache). | +| eval time | Time to generate output tokens by forward-passing through the model. | +| total time | Total time for both prompt processing and token generation (excludes model load). | + +Lastly to set up OpenAI compatible API, you can use the `llama-server` functionality. The process of implementing this is described [here](/learning-paths/servers-and-cloud-computing/llama-cpu) under the "Access the chatbot using the OpenAI-compatible API" section. Here is a snippet, for how to set up llama-server for distributed inference: +```bash +bin/llama-server -m /home/ubuntu/model.gguf --port 8080 --rpc "$worker_ips" -ngl 99 +``` +At the very end of the output to the above command, you will see something like the following: +```output +main: server is listening on http://127.0.0.1:8080 - starting the main loop +srv update_slots: all slots are idle +``` + diff --git a/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/1_setup.md b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/1_setup.md new file mode 100644 index 0000000000..6fbe8aeb81 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/1_setup.md @@ -0,0 +1,123 @@ +--- +title: Setup Tomcat Benchmark Environment +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + + +## Overview +There are numerous performance analysis methods and tools for Java applications, among which the call stack flame graph method is regarded as a conventional entry-level approach. 
2. Next, you can install Tomcat by either [building it from source](https://github.com/apache/tomcat) or simply downloading the pre-built package from [the official website](https://tomcat.apache.org/whichversion.html)
If you intend to access the built-in examples of Tomcat via an intranet IP or even an external IP, you need to modify a configuration file as shown: +```bash +vi apache-tomcat-11.0.9/webapps/examples/META-INF/context.xml +``` +Then change the allow value as shown and save the changes: +```output +# change +# to + +``` +Now you can start Tomcat Server: +```bash +./apache-tomcat-11.0.9/bin/startup.sh +``` + +The output from starting the server should look like: + +```output +Using CATALINA_BASE: /home/ubuntu/apache-tomcat-11.0.9 +Using CATALINA_HOME: /home/ubuntu/apache-tomcat-11.0.9 +Using CATALINA_TMPDIR: /home/ubuntu/apache-tomcat-11.0.9/temp +Using JRE_HOME: /usr +Using CLASSPATH: /home/ubuntu/apache-tomcat-11.0.9/bin/bootstrap.jar:/home/ubuntu/apache-tomcat-11.0.9/bin/tomcat-juli.jar +Using CATALINA_OPTS: +Tomcat started. +``` + +4. If you can access the page at "http://${tomcat_ip}:8080/examples" via a browser, you can proceed to the next benchmarking step. + +![example image alt-text#center](./_images/lp-tomcat-homepage.png "Tomcat-HomePage") + +![example image alt-text#center](./_images/lp-tomcat-examples.png "Tomcat-Examples") + +Make sure port 8080 is open in the security group of the IP address for your Arm-based Linux machine. + +## Setup Benchmark Client - [wrk2](https://github.com/giltene/wrk2) +`wrk2` is a high-performance HTTP benchmarking tool specialized in generating constant throughput loads and measuring latency percentiles for web services. `wrk2` is an enhanced version of `wrk` that provides accurate latency statistics under controlled request rates, ideal for performance testing of HTTP servers. + +Currently `wrk2` is only supported on x86 machines. You will run the Benchmark Client steps shown below on an x86_64 server running Ubuntu. + + +1. To use `wrk2`, you will need to install some essential tools before you can build it: +```bash +sudo apt-get update +sudo apt-get install -y build-essential libssl-dev git zlib1g-dev +``` + +2. 
Now you can clone and build it from source: +```bash +sudo git clone https://github.com/giltene/wrk2.git +cd wrk2 +sudo make +``` +Move the executable to somewhere in your PATH: +```bash +sudo cp wrk /usr/local/bin +``` + +3. Finally, you can run the benchmark of Tomcat through wrk2. +```bash +wrk -c32 -t16 -R50000 -d60 http://${tomcat_ip}:8080/examples/servlets/servlet/HelloWorldExample +``` +Shown below is the output of wrk2: + +```console +Running 1m test @ http://172.26.203.139:8080/examples/servlets/servlet/HelloWorldExample + 16 threads and 32 connections + Thread calibration: mean lat.: 0.986ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 0.984ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 0.999ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 0.994ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 0.983ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 0.989ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 0.991ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 0.993ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 0.985ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 0.990ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 0.987ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 0.990ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 0.984ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 0.991ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 0.978ms, rate sampling interval: 10ms + Thread calibration: mean lat.: 0.976ms, rate sampling interval: 10ms + Thread Stats Avg Stdev Max +/- Stdev + Latency 1.00ms 454.90us 5.09ms 63.98% + Req/Sec 3.31k 241.68 4.89k 63.83% + 2999817 requests in 1.00m, 1.56GB read +Requests/sec: 49997.08 +Transfer/sec: 26.57MB +``` + + diff --git 
a/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/2_async-profiler.md b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/2_async-profiler.md new file mode 100644 index 0000000000..5346d45fac --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/2_async-profiler.md @@ -0,0 +1,32 @@ +--- +title: Java FlameGraph - Async-profiler +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Java Flame Graph Generation using [async-profiler](https://github.com/async-profiler/async-profiler) +`async-profiler` is a low-overhead sampling profiler for JVM applications, capable of capturing CPU, allocation, and lock events to generate actionable performance insights. +A lightweight tool for Java performance analysis, `async-profiler` produces flame graphs and detailed stack traces with minimal runtime impact, suitable for production environments. In this section, you will learn how to install and use it to profile your Tomcat instance being benchmarked. + +You should deploy `async-profiler` on the same Arm Linux machine where Tomcat is running to ensure accurate performance profiling. +1. Download async-profiler-4.0 and uncompress +```bash +wget -c https://github.com/async-profiler/async-profiler/releases/download/v4.0/async-profiler-4.0-linux-arm64.tar.gz +tar xzf async-profiler-4.0-linux-arm64.tar.gz +``` + +2. Run async-profiler to profile the Tomcat instance under benchmarking +```bash +cd async-profiler-4.0-linux-arm64/bin +./asprof -d 10 -f profile.html $(jps | awk /Bootstrap/'{print $1}') +``` +You can also run: +``` +./asprof -d 10 -f profile.html ${tomcat_process_id} +``` + +3. 
Now launch `profile.html` in a browser to analyse your profiling result. + +![example image alt-text#center](_images/lp-flamegraph-async.png "Java Flame Graph via async-profiler") diff --git a/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/3_agent.md b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/3_agent.md new file mode 100644 index 0000000000..96ff1ea117 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/3_agent.md @@ -0,0 +1,48 @@ +--- +title: Java FlameGraph - Java Agent +weight: 4 + + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Java Flame Graph Generation using Java agent and perf +To profile a Java application with perf and ensure proper symbol resolution, you must include `libperf-jvmti.so` when launching the Java application. +- `libperf-jvmti.so` is a JVM TI agent library enabling perf to resolve Java symbols, facilitating accurate profiling of Java applications. +- A specialized shared library, `libperf-jvmti.so` bridges perf and the JVM, enabling proper translation of memory addresses to Java method names during profiling. + +1. Find where `libperf-jvmti.so` is installed on your Arm-based Linux server: +```bash +pushd /usr/lib +find . -name libperf-jvmti.so +``` +The output will show the path of the library that you will then include in your Tomcat setup file: +```bash +vi apache-tomcat-11.0.9/bin/catalina.sh +``` +Add JAVA_OPTS="$JAVA_OPTS -agentpath:/usr/lib/linux-tools-6.8.0-63/libperf-jvmti.so -XX:+PreserveFramePointer" to `catalina.sh`. Make sure the path matches the location on your machine from the previous step. + +Now shutdown and restart Tomcat: +```bash +cd apache-tomcat-11.0.9/bin +./shutdown.sh +./startup.sh +``` + +2. 
Use perf to profile Tomcat, and restart the wrk benchmark on your x86 instance if necessary: +```bash +sudo perf record -g -k1 -p $(jps | awk /Bootstrap/'{print $1}') -- sleep 10 +``` +This command will record the collected data in a file named `perf.data`. + +3. Convert the collected `perf.data` into a Java flame graph using FlameGraph +```bash +git clone https://github.com/brendangregg/FlameGraph.git +export PATH=$PATH:`pwd`/FlameGraph +sudo perf inject -j -i perf.data | perf script | stackcollapse-perf.pl | flamegraph.pl > profile.svg +``` + +4. You can now launch `profile.svg` in a browser to analyse the profiling result. + +![example image alt-text#center](_images/lp-flamegraph-agent.png "Java Flame Graph via Java agent and perf") diff --git a/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_images/lp-flamegraph-agent.png b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_images/lp-flamegraph-agent.png new file mode 100644 index 0000000000..d06da3f822 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_images/lp-flamegraph-agent.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_images/lp-flamegraph-async.png b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_images/lp-flamegraph-async.png new file mode 100644 index 0000000000..d78ac14934 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_images/lp-flamegraph-async.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_images/lp-tomcat-examples.png b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_images/lp-tomcat-examples.png new file mode 100644 index 0000000000..2590c08f65 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_images/lp-tomcat-examples.png differ diff 
--git a/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_images/lp-tomcat-homepage.png b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_images/lp-tomcat-homepage.png new file mode 100644 index 0000000000..56b23a733a Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_images/lp-tomcat-homepage.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_index.md b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_index.md new file mode 100644 index 0000000000..06a3c9281c --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_index.md @@ -0,0 +1,57 @@ +--- +title: Analyze Java Performance on Arm servers using FlameGraphs + +draft: true +cascade: + draft: true + +minutes_to_complete: 30 + +who_is_this_for: This is an introductory topic for software developers looking to analyze the performance of their Java applications on the Arm Neoverse based servers using flame graphs. + +learning_objectives: + - How to set up tomcat benchmark environment + - How to generate flame graphs for Java applications using async-profiler + - How to generate flame graphs for Java applications using Java agent + +prerequisites: + - An Arm-based and x86 computer running Ubuntu. You can use a server instance from a cloud service provider of your choice. 
+ - Basic familiarity with Java applications and flame graphs + +author: Ying Yu, Martin Ma + +### Tags +skilllevels: Introductory +subjects: Performance and Architecture +armips: + - Neoverse + +tools_software_languages: + - OpenJDK-21 + - Tomcat + - Async-profiler + - FlameGraph + - wrk2 +operatingsystems: + - Linux + + +further_reading: + - resource: + title: OpenJDK Wiki + link: https://wiki.openjdk.org/ + type: documentation + - resource: + title: Java FlameGraphs + link: https://www.brendangregg.com/flamegraphs.html + type: website + + + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/java-perf-flamegraph/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/data/stats_current_test_info.yml b/data/stats_current_test_info.yml index 48c5619c87..5e020828c9 100644 --- a/data/stats_current_test_info.yml +++ b/data/stats_current_test_info.yml @@ -1,5 +1,5 @@ summary: - content_total: 387 + content_total: 391 content_with_all_tests_passing: 0 content_with_tests_enabled: 61 sw_categories: @@ -196,3 +196,4 @@ sw_categories: zlib: readable_title: Learn how to build and use Cloudflare zlib on Arm servers tests_and_status: [] + diff --git a/data/stats_weekly_data.yml b/data/stats_weekly_data.yml index ad533ae6fa..12463a4ab0 100644 --- a/data/stats_weekly_data.yml +++ b/data/stats_weekly_data.yml @@ -6895,3 +6895,120 @@ avg_close_time_hrs: 0 num_issues: 21 percent_closed_vs_total: 0.0 +- a_date: '2025-07-28' + content: + automotive: 3 + cross-platform: 34 + embedded-and-microcontrollers: 43 + install-guides: 105 + iot: 6 + laptops-and-desktops: 38 + mobile-graphics-and-gaming: 34 + servers-and-cloud-computing: 128 + total: 391 + contributions: + external: 98 + internal: 516 + github_engagement: + num_forks: 30 + num_prs: 14 + individual_authors: + adnan-alsinan: 2 + alaaeddine-chakroun: 2 + albin-bernhardsson: 1 + alex-su: 1 + alexandros-lamprineas: 1 + andrew-choi: 2 + andrew-kilroy: 1 + annie-tallund: 4 + arm: 3 + arnaud-de-grandmaison: 5 + aude-vuilliomenet: 1 + avin-zarlez: 1 + barbara-corriero: 1 + basma-el-gaabouri: 1 + ben-clark: 1 + bolt-liu: 2 + brenda-strech: 1 + bright-edudzi-gershon-kordorwu: 1 + chaodong-gong: 1 + chen-zhang: 1 + chenying-kuo: 1 + christophe-favergeon: 1 + christopher-seidl: 7 + cyril-rohr: 1 + daniel-gubay: 1 + daniel-nguyen: 2 + david-spickett: 2 + dawid-borycki: 33 + diego-russo: 2 + dominica-abena-o.-amanfo: 1 + elham-harirpoush: 2 + florent-lebeau: 5 + "fr\xE9d\xE9ric--lefred--descamps": 2 + gabriel-peterson: 5 + gayathri-narayana-yegna-narayanan: 2 + georgios-mermigkis: 1 + geremy-cohen: 3 + gian-marco-iodice: 1 + graham-woodward: 1 + han-yin: 1 + iago-calvo-lista: 1 + 
james-whitaker: 1 + jason-andrews: 105 + jeff-young: 1 + joana-cruz: 1 + joe-stech: 6 + johanna-skinnider: 2 + jonathan-davies: 2 + jose-emilio-munoz-lopez: 1 + julie-gaskin: 5 + julien-jayat: 1 + julio-suarez: 6 + jun-he: 1 + kasper-mecklenburg: 1 + kieran-hejmadi: 12 + koki-mitsunami: 2 + konstantinos-margaritis: 8 + kristof-beyls: 1 + leandro-nunes: 1 + liliya-wu: 1 + mark-thurman: 1 + masoud-koleini: 1 + mathias-brossard: 1 + michael-hall: 5 + na-li: 1 + nader-zouaoui: 2 + nikhil-gupta: 1 + nina-drozd: 1 + nobel-chowdary-mandepudi: 6 + odin-shen: 9 + owen-wu: 2 + pareena-verma: 46 + paul-howard: 3 + peter-harris: 1 + pranay-bakre: 5 + preema-merlin-dsouza: 1 + przemyslaw-wirkus: 2 + qixiang-xu: 1 + rani-chowdary-mandepudi: 1 + rin-dobrescu: 1 + roberto-lopez-mendez: 2 + ronan-synnott: 45 + shuheng-deng: 1 + thirdai: 1 + tianyu-li: 2 + tom-pilar: 1 + uma-ramalingam: 1 + varun-chari: 2 + visualsilicon: 1 + willen-yang: 1 + william-liang: 1 + ying-yu: 2 + yiyang-fan: 1 + zach-lasiuk: 2 + zhengjun-xing: 2 + issues: + avg_close_time_hrs: 0 + num_issues: 21 + percent_closed_vs_total: 0.0 \ No newline at end of file