diff --git a/README.md b/README.md index 1c0742370de39..28f91edd389c0 100644 --- a/README.md +++ b/README.md @@ -1,610 +1,169 @@ -# llama.cpp +# llama.cpp-gfx906: AMD MI50/MI60/Vega VII fork

-![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) +This fork is specifically optimized for the AMD GFX906 architecture (MI50, MI60, Vega VII). The aim of this fork is to maximize prompt-processing and inference speed on a single card. Compatibility is currently tested with Qwen3 30B-A3B Thinking 2507 (Q4_0) and Qwen3 4B Instruct 2507 (Q4_0).

-[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) -[![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases) -[![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml) +---

-[Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) / [ops](https://github.com/ggml-org/llama.cpp/blob/master/docs/ops.md) +## Key Features (forked from b6628)

-LLM inference in C/C++

-## Recent API changes

-- [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289) -- [Changelog for `llama-server` REST API](https://github.com/ggml-org/llama.cpp/issues/9291) +- **Replaced `bpermute` instructions with `ds_swizzle`** for AMD-native warp reductions (main contribution)

-## Hot topics +---

-- **[guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)** +## Target Hardware & Models

-- **[[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)** +### Supported GPUs
-- Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095) +- **AMD MI50** (Vega 20) (the only card tested so far)
-- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen) +- **AMD MI60** (Vega 20)
-- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md) +- **AMD Vega VII** (Vega 20)
-- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
-- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
-- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
-- Hugging Face Inference Endpoints now support GGUF out of the box! 
https://github.com/ggml-org/llama.cpp/discussions/9669 -- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) +### Supported Models
+- **All models supported by upstream llama.cpp**
+- Tested extensively with **Qwen3-30B-A3B** (Q4_0, Q4_1)

-----

+### Performance comparison (llama-bench)

+- the `-d` option was not used because long prompt processing makes the GPU reach 80 °C and throttle, which makes the comparison difficult
+- all models were tested with:
+
+| backend | ngl | threads | n_batch | type_k | type_v | fa |
+| ------- | --- | ------- | ------- | ------ | ------ | -- |
+| ROCm    | 99  | 12      | 1024    | q8_0   | q8_0   | 1  |

-## Quick start

+| **normal:**   |      size |  params |   test |            t/s |
+| ------------- | --------: | ------: | -----: | -------------: |
+| qwen3 4B Q4_0 |  2.21 GiB |  4.02 B |  pp512 | 1768.68 ± 0.86 |
+| qwen3 4B Q4_0 |  2.21 GiB |  4.02 B | pp1024 | 1728.56 ± 0.33 |
+| qwen3 4B Q4_0 |  2.21 GiB |  4.02 B | pp2048 | 1636.15 ± 0.57 |
+| qwen3 4B Q4_0 |  2.21 GiB |  4.02 B | pp4096 | 1469.47 ± 1.09 |
+| qwen3 4B Q4_0 |  2.21 GiB |  4.02 B |  tg128 |  116.76 ± 0.02 |
+| qwen3 4B Q4_0 |  2.21 GiB |  4.02 B |  tg256 |  115.45 ± 1.11 |

-Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
-- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
-- Run with Docker - see our [Docker documentation](docs/docker.md)
-- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
-- Build from source by cloning this repository - check out [our build guide](docs/build.md)

+| **swizzle:**  |      size |  params |   test |            t/s |
+| ------------- | --------: | ------: | -----: | -------------: |
+| qwen3 4B Q4_0 |  2.21 GiB |  4.02 B |  pp512 | 1777.11 ± 0.65 |
+| qwen3 4B Q4_0 |  2.21 GiB |  4.02 B | pp1024 | 1734.32 ± 0.24 |
+| qwen3 4B Q4_0 |  2.21 GiB |  4.02 B | pp2048 | 1643.62 ± 0.25 |
+| qwen3 4B Q4_0 |  2.21 GiB |  4.02 B | pp4096 | 1479.31 ± 0.17 |
+| qwen3 4B Q4_0 |  2.21 GiB |  4.02 B |  tg128 |  116.94 ± 0.04 |
+| qwen3 4B Q4_0 |  2.21 GiB |  4.02 B |  tg256 |  116.66 ± 0.04 |

-Once installed, you'll need a model to work with. Head to the [Obtaining and quantizing models](#obtaining-and-quantizing-models) section to learn more. 
-Example command: +| **normal:** | size | params | test | t/s | +| ------------------------------ | ---------: | ---------: | --------------: | -------------------: | +| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | pp512 | 1269.93 ± 9.69 | +| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | pp1024 | 1255.27 ± 6.57 | +| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | pp2048 | 1196.97 ± 2.63 | +| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | pp4096 | 1081.50 ± 1.17 | +| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | tg128 | 92.84 ± 0.10 | +| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | tg256 | 92.69 ± 0.05 | -```sh -# Use a local model file -llama-cli -m my_model.gguf -# Or download and run a model directly from Hugging Face -llama-cli -hf ggml-org/gemma-3-1b-it-GGUF +| **swizzle:** | size | params | test | t/s | +| ------------------------------ | ---------: | ---------: | --------------: | -------------------: | +| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | pp512 | 1272.79 ± 7.94 | +| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | pp1024 | 1257.33 ± 6.35 | +| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | pp2048 | 1200.32 ± 2.16 | +| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | pp4096 | 1087.70 ± 1.32 | +| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | tg128 | 93.41 ± 0.09 | +| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | tg256 | 93.27 ± 0.05 | -# Launch OpenAI-compatible API server -llama-server -hf ggml-org/gemma-3-1b-it-GGUF -``` - -## Description - -The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide -range of hardware - locally and in the cloud. - -- Plain C/C++ implementation without any dependencies -- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks -- AVX, AVX2, AVX512 and AMX support for x86 architectures -- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use -- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA) -- Vulkan and SYCL backend support -- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity - -The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggml-org/ggml) library. - -
-Models - -Typically finetunes of the base models below are supported as well. - -Instructions for adding support for new models: [HOWTO-add-model.md](docs/development/HOWTO-add-model.md) - -#### Text-only - -- [X] LLaMA 🦙 -- [x] LLaMA 2 🦙🦙 -- [x] LLaMA 3 🦙🦙🦙 -- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) -- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral) -- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct) -- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon) -- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) -- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) -- [X] [BERT](https://github.com/ggml-org/llama.cpp/pull/5423) -- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) -- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft) -- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila) -- [X] [Starcoder models](https://github.com/ggml-org/llama.cpp/pull/3187) -- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim) -- [X] [MPT](https://github.com/ggml-org/llama.cpp/pull/3417) -- [X] [Bloom](https://github.com/ggml-org/llama.cpp/pull/3553) -- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi) -- [X] [StableLM models](https://huggingface.co/stabilityai) -- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek) -- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen) -- [x] [PLaMo-13B](https://github.com/ggml-org/llama.cpp/pull/3557) -- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi) -- [x] [PhiMoE](https://github.com/ggml-org/llama.cpp/pull/11003) -- [x] [GPT-2](https://huggingface.co/gpt2) -- [x] [Orion 14B](https://github.com/ggml-org/llama.cpp/pull/5118) -- [x] [InternLM2](https://huggingface.co/models?search=internlm2) -- [x] [CodeShell](https://github.com/WisdomShell/codeshell) -- [x] [Gemma](https://ai.google.dev/gemma) -- [x] [Mamba](https://github.com/state-spaces/mamba) -- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf) -- [x] [Xverse](https://huggingface.co/models?search=xverse) -- [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r) -- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) -- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B) -- [x] [OLMo](https://allenai.org/olmo) -- [x] [OLMo 2](https://allenai.org/olmo) -- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924) -- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330) -- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia) -- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520) -- [x] [Smaug](https://huggingface.co/models?search=Smaug) -- [x] [Poro 34B](https://huggingface.co/LumiOpen/Poro-34B) -- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM) -- [x] [Flan T5](https://huggingface.co/models?search=flan-t5) -- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca) -- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + 
[ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat) -- [x] [GLM-4-0414](https://huggingface.co/collections/THUDM/glm-4-0414-67f3cbcb34dd9d252707cb2e) -- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) -- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) -- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a) -- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat) -- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a) -- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM) -- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1) -- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct) -- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview) -- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32) -- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38) -- [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7) - -#### Multimodal - -- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) -- [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava) -- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5) -- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V) -- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM) -- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL) -- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM) -- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2) -- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny) -- [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge) -- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d) -- [x] [LFM2-VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa) - -
- -
-Bindings - -- Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama) -- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) -- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) -- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) -- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) -- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli) -- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm) -- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama) -- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) -- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs) -- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) -- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) -- Rust (automated build from crates.io): [ShelbyJenkins/llm_client](https://github.com/ShelbyJenkins/llm_client) -- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) -- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html) -- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) -- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj) -- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn) -- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp) -- Java: [QuasarByte/llama-cpp-jna](https://github.com/QuasarByte/llama-cpp-jna) -- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig) -- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) -- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama) -- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggml-org/llama.cpp/pull/6326) -- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp) -- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift) -- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama) -- Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi) - -
- -
-UIs - -*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* - -- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT) -- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT) -- [Dot](https://github.com/alexpinel/Dot) (GPL) -- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT) -- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0) -- [janhq/jan](https://github.com/janhq/jan) (AGPL) -- [johnbean393/Sidekick](https://github.com/johnbean393/Sidekick) (MIT) -- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0) -- [KodiBot](https://github.com/firatkiral/kodibot) (GPL) -- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT) -- [LARS](https://github.com/abgulati/LARS) (AGPL) -- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL) -- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT) -- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT) -- [LMStudio](https://lmstudio.ai/) (proprietary) -- [LocalAI](https://github.com/mudler/LocalAI) (MIT) -- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) -- [MindMac](https://mindmac.app) (proprietary) -- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT) -- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT) -- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) (Apache-2.0) -- [nat/openplayground](https://github.com/nat/openplayground) (MIT) -- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) (MIT) -- [ollama/ollama](https://github.com/ollama/ollama) (MIT) -- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL) -- [PocketPal AI](https://github.com/a-ghorbani/pocketpal-ai) (MIT) -- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) (MIT) -- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) (MIT) -- [pythops/tenere](https://github.com/pythops/tenere) (AGPL) -- [ramalama](https://github.com/containers/ramalama) (MIT) -- [semperai/amica](https://github.com/semperai/amica) (MIT) -- [withcatai/catai](https://github.com/withcatai/catai) (MIT) -- [Autopen](https://github.com/blackhole89/autopen) (GPL) - -
- -
-Tools - -- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML -- [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp -- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption -- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage -- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example) - -
- -
-Infrastructure - -- [Paddler](https://github.com/intentee/paddler) - Open-source LLMOps platform for hosting and scaling AI in your own infrastructure -- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs -- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly -- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server -- [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale -- [llmaz](https://github.com/InftyAI/llmaz) - ☸️ Easy, advanced inference platform for large language models on Kubernetes. -
- -
-Games - -- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you. - -
- - -## Supported backends - -| Backend | Target devices | -| --- | --- | -| [Metal](docs/build.md#metal-build) | Apple Silicon | -| [BLAS](docs/build.md#blas-build) | All | -| [BLIS](docs/backend/BLIS.md) | All | -| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU | -| [MUSA](docs/build.md#musa) | Moore Threads GPU | -| [CUDA](docs/build.md#cuda) | Nvidia GPU | -| [HIP](docs/build.md#hip) | AMD GPU | -| [Vulkan](docs/build.md#vulkan) | GPU | -| [CANN](docs/build.md#cann) | Ascend NPU | -| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU | -| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE | -| [WebGPU [In Progress]](docs/build.md#webgpu) | All | -| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All | - -## Obtaining and quantizing models - -The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`: - -- [Trending](https://huggingface.co/models?library=gguf&sort=trending) -- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf) - -You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf /[:quant]`. For example: - -```sh -llama-cli -hf ggml-org/gemma-3-1b-it-GGUF -``` - -By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`. - -After downloading a model, use the CLI tools to run it locally - see below. - -`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo. - -The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`: - -- Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes -- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggml-org/llama.cpp/discussions/10123) -- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268) -- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669) - -To learn more about model quantization, [read this documentation](tools/quantize/README.md) - -## [`llama-cli`](tools/main) - -#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality. - --
- Run in conversation mode - - Models with a built-in chat template will automatically activate conversation mode. If this doesn't occur, you can manually enable it by adding `-cnv` and specifying a suitable chat template with `--chat-template NAME` - - ```bash - llama-cli -m model.gguf - - # > hi, who are you? - # Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today? - # - # > what is 1+1? - # Easy peasy! The answer to 1+1 is... 2! - ``` - -
- --
- Run in conversation mode with custom chat template - - ```bash - # use the "chatml" template (use -h to see the list of supported templates) - llama-cli -m model.gguf -cnv --chat-template chatml - - # use a custom template - llama-cli -m model.gguf -cnv --in-prefix 'User: ' --reverse-prompt 'User:' - ``` - -
- --
- Run simple text completion - - To disable conversation mode explicitly, use `-no-cnv` - - ```bash - llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv - - # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey. - ``` - -
- --
- Constrain the output with a custom grammar - - ```bash - llama-cli -m model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' - - # {"appointmentTime": "8pm", "appointmentDetails": "schedule a a call"} - ``` - - The [grammars/](grammars/) folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](grammars/README.md). - - For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/ - -
- - -## [`llama-server`](tools/server) - -#### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs. - --
- Start a local HTTP server with default configuration on port 8080 - - ```bash - llama-server -m model.gguf --port 8080 - - # Basic web UI can be accessed via browser: http://localhost:8080 - # Chat completion endpoint: http://localhost:8080/v1/chat/completions - ``` - -
- --
- Support multiple-users and parallel decoding - - ```bash - # up to 4 concurrent requests, each with 4096 max context - llama-server -m model.gguf -c 16384 -np 4 - ``` - -
- --
- Enable speculative decoding +### Performance comparison (llama-server)

+- prompt: write a 1000-word story
+- it took a few tries to get a similar token count from both runs, but the slight speed increase is still visible

- ```bash
- # the draft.gguf model should be a small variant of the target model.gguf
- llama-server -m model.gguf -md draft.gguf
- ```
+|**normal:** |
+| ---------------------------------------------------------------------------------------------------|
+|prompt eval time =      61.27 ms /    15 tokens (    4.08 ms per token,   244.83 tokens per second)  |
+| eval time       =   27459.54 ms /  2238 tokens (   12.27 ms per token,    81.50 tokens per second)  |
+| total time      =   27520.80 ms /  2253 tokens                                                      |

-
+|**swizzle:** | +| ---------------------------------------------------------------------------------------------------| +|prompt eval time = 60.72 ms / 15 tokens ( 4.05 ms per token, 247.03 tokens per second) | +| eval time = 26540.24 ms / 2240 tokens ( 11.85 ms per token, 84.40 tokens per second) | +| total time = 26600.97 ms / 2255 tokens | --
- Serve an embedding model - ```bash - # use the /embedding endpoint - llama-server -m model.gguf --embedding --pooling cls -ub 8192 - ``` -
+## Quick Start --
- Serve a reranking model +### Prerequisites - ```bash - # use the /reranking endpoint - llama-server -m model.gguf --reranking - ``` +- **ROCm 7.0.1** (tested version - other versions may work) +- **CMake 3.21+** +- **HIP compiler toolchain** +- **AMD GFX906 GPU** (MI50/MI60/Vega VII) +- **UBUNTU 24.04** (should work with other systems, not tested) -
+### System Dependencies --
- Constrain all outputs with a grammar - - ```bash - # custom grammar - llama-server -m model.gguf --grammar-file grammar.gbnf - - # JSON - llama-server -m model.gguf --grammar-file grammars/json.gbnf - ``` - -
- - -## [`llama-perplexity`](tools/perplexity) - -#### A tool for measuring the [perplexity](tools/perplexity/README.md) [^1] (and other quality metrics) of a model over a given text. - --
- Measure the perplexity over a text file - - ```bash - llama-perplexity -m model.gguf -f file.txt - - # [1]15.2701,[2]5.4007,[3]5.3073,[4]6.2965,[5]5.8940,[6]5.6096,[7]5.7942,[8]4.9297, ... - # Final estimate: PPL = 5.4007 +/- 0.67339 - ``` - -
- --
- Measure KL divergence - - ```bash - # TODO - ``` - -
- -[^1]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity) - -## [`llama-bench`](tools/llama-bench) - -#### Benchmark the performance of the inference for various parameters. - --
- Run default benchmark - - ```bash - llama-bench -m model.gguf - - # Output: - # | model | size | params | backend | threads | test | t/s | - # | ------------------- | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: | - # | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | pp512 | 5765.41 ± 20.55 | - # | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | tg128 | 197.71 ± 0.81 | - # - # build: 3e0ba0e60 (4229) - ``` - -
- -## [`llama-run`](tools/run) - -#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3]. - --
- Run a model with a specific prompt (by default it's pulled from Ollama registry) - - ```bash - llama-run granite-code - ``` - -
- -[^3]: [RamaLama](https://github.com/containers/ramalama) - -## [`llama-simple`](examples/simple) - -#### A minimal example for implementing apps with `llama.cpp`. Useful for developers. +```bash +# Ubuntu +sudo apt update +sudo apt install cmake build-essential --
- Basic text completion +# Install ROCm 7.0.1 following AMD's official guide +# Tensile library for gfx906 must be imported to use this ROCM version - ```bash - llama-simple -m model.gguf +# Verify ROCm installation +/opt/rocm/bin/rocm-smi +``` - # Hello my name is Kaitlyn and I am a 16 year old girl. I am a junior in high school and I am currently taking a class called "The Art of - ``` +### Build Instructions -
+#### 1. Clone the repository +```bash +git clone https://github.com/iacopPBK/llama.cpp-gfx906.git +cd llama.cpp-gfx906 +``` -## Contributing +#### 2. Compile using the provided script -- Contributors can open PRs -- Collaborators will be invited based on contributions -- Maintainers can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch -- Any help with managing issues, PRs and projects is very appreciated! -- See [good first issues](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions -- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information -- Make sure to read this: [Inference at the edge](https://github.com/ggml-org/llama.cpp/discussions/205) -- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532) +```bash +chmod +x SCRIPT_compile_MI50.sh +./SCRIPT_compile_MI50.sh +``` -## Other documentation +The compilation script automatically: +- Sets GFX906-specific compiler flags +- Enables HIP backend with GFX906 optimizations +- Builds with flash attention support +- Links against ROCm libraries (rocBLAS, hipBLAS) -- [main (cli)](tools/main/README.md) -- [server](tools/server/README.md) -- [GBNF grammars](grammars/README.md) +#### 3. Launch the server -#### Development documentation +```bash +# Edit SCRIPT_launch_server_MI50.sh to set your model path +vim SCRIPT_launch_server_MI50.sh -- [How to build](docs/build.md) -- [Running on Docker](docs/docker.md) -- [Build on Android](docs/android.md) -- [Performance troubleshooting](docs/development/token_generation_performance_tips.md) -- [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks) +# Launch server with FA and KV quantizations +./SCRIPT_launch_server_MI50.sh +``` -#### Seminal papers and background on the models +### Environment Variables -If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: -- LLaMA: - - [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) - - [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) -- GPT-3 - - [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) -- GPT-3.5 / InstructGPT / ChatGPT: - - [Aligning language models to follow instructions](https://openai.com/research/instruction-following) - - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155) +The optimized build sets these automatically: -## XCFramework -The XCFramework is a precompiled version of the library for iOS, visionOS, tvOS, -and macOS. It can be used in Swift projects without the need to compile the -library from source. For example: -```swift -// swift-tools-version: 5.10 -// The swift-tools-version declares the minimum version of Swift required to build this package. 
+```bash +export HSA_OVERRIDE_GFX_VERSION=9.0.6 +export HIP_VISIBLE_DEVICES=0 +export ROCR_VISIBLE_DEVICES=0 +export GGML_BACKEND_HIP=1 +export HCC_AMDGPU_TARGET=gfx906 +``` -import PackageDescription +--- -let package = Package( - name: "MyLlamaPackage", - targets: [ - .executableTarget( - name: "MyLlamaPackage", - dependencies: [ - "LlamaFramework" - ]), - .binaryTarget( - name: "LlamaFramework", - url: "https://github.com/ggml-org/llama.cpp/releases/download/b5046/llama-b5046-xcframework.zip", - checksum: "c19be78b5f00d8d29a25da41042cb7afa094cbf6280a225abe614b03b20029ab" - ) - ] -) -``` -The above example is using an intermediate build `b5046` of the library. This can be modified -to use a different version by changing the URL and checksum. +## Build Configuration -## Completions -Command-line completion is available for some environments. +The build enables these optimizations: -#### Bash Completion -```bash -$ build/bin/llama-cli --completion-bash > ~/.llama-completion.bash -$ source ~/.llama-completion.bash -``` -Optionally this can be added to your `.bashrc` or `.bash_profile` to load it -automatically. For example: -```console -$ echo "source ~/.llama-completion.bash" >> ~/.bashrc -``` +- `GGML_HIP=ON` - Enable HIP backend +- `GGML_HIP_GFX906_OPTIMIZED=ON` - GFX906-specific optimizations +- `CMAKE_HIP_ARCHITECTURES=gfx906` - Target GFX906 architecture +- Flash attention with F16 precision (hardcoded) -## Dependencies +--- -- [yhirose/cpp-httplib](https://github.com/yhirose/cpp-httplib) - Single-header HTTP server, used by `llama-server` - MIT license -- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain -- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License -- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License -- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License -- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html) -- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain +*Built with care for the AMD GFX906 community ❤️‍🔥 x 1000* diff --git a/SCRIPT_compile_MI50.sh b/SCRIPT_compile_MI50.sh new file mode 100755 index 0000000000000..2a3ffc09a266c --- /dev/null +++ b/SCRIPT_compile_MI50.sh @@ -0,0 +1,202 @@ +#!/bin/bash +# +# SCRIPT MI50 Compilation Script for llama.cpp +# Optimized build for AMD MI50 (gfx906) with ROCm/HIP support +# +# This script compiles llama.cpp with maximum optimizations for the MI50 GPU +# including server support, flash attention, and all performance features +# + +set -e # Exit on any error + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +echo -e "${BLUE}======================================${NC}" +echo -e "${BLUE} SCRIPT MI50 llama.cpp Builder ${NC}" +echo -e "${BLUE}======================================${NC}" + +# Check if we're in the right directory +if [[ ! 
-f "CMakeLists.txt" ]]; then + echo -e "${RED}Error: Not in llama.cpp root directory${NC}" + echo "Please run this script from the llama.cpp root directory" + exit 1 +fi + +# Verify ROCm installation +echo -e "${YELLOW}Checking ROCm installation...${NC}" +if ! command -v rocm_agent_enumerator &> /dev/null; then + echo -e "${RED}Error: ROCm not found. Please install ROCm first.${NC}" + exit 1 +fi + +# Check for gfx906 support +GPUS=$(rocm_agent_enumerator) +if [[ ! "$GPUS" =~ "gfx906" ]]; then + echo -e "${RED}Warning: gfx906 (MI50) not detected in system${NC}" + echo "Available GPUs: $GPUS" + read -p "Continue anyway? (y/N): " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + exit 1 + fi +fi + +echo -e "${GREEN}✓ ROCm installation verified${NC}" +echo -e "${GREEN}✓ Available GPUs: $GPUS${NC}" + +# Set ROCm environment variables for optimal gfx906 compilation +echo -e "${YELLOW}Setting ROCm environment variables for gfx906...${NC}" +export ROCM_PATH=${ROCM_PATH:-/opt/rocm} +export HCC_AMDGPU_TARGET=gfx906 +export HSA_OVERRIDE_GFX_VERSION=9.0.6 +export AMDGPU_TARGETS=gfx906 +export GPU_TARGETS=gfx906 + +# Clean previous build +echo -e "${YELLOW}Cleaning previous build...${NC}" +rm -rf build +mkdir -p build + +# Configure with maximum optimizations +echo -e "${YELLOW}Configuring CMake with MI50 optimizations...${NC}" +cd build + +cmake .. \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER=gcc \ + -DCMAKE_CXX_COMPILER=g++ \ + -DCMAKE_HIP_COMPILER_FORCED=1 \ + -DCMAKE_HIP_ARCHITECTURES=gfx906 \ + -DCMAKE_C_FLAGS="-O3 -march=native -mtune=native -DNDEBUG -ffast-math -fno-finite-math-only -ffp-contract=fast" \ + -DCMAKE_CXX_FLAGS="-O3 -march=native -mtune=native -DNDEBUG -DGGML_HIP_GFX906_OPTIMIZED -ffast-math -fno-finite-math-only -ffp-contract=fast" \ + -DCMAKE_HIP_FLAGS=" --offload-arch=gfx906 -DGGML_HIP_GFX906_OPTIMIZED -Wno-ignored-attributes -Wno-cuda-compat -Wno-unused-result -mllvm -amdgpu-simplify-libcall -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-enable-lower-module-lds=false -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -ffast-math -ffp-contract=fast" \ + -DGGML_HIP=ON \ + -DGGML_HIP_MMQ_MFMA=ON \ + -DGGML_HIP_GRAPHS=ON \ + -DGGML_HIP_NO_VMM=ON \ + -DGGML_HIP_EXPORT_METRICS=ON \ + -DGGML_HIP_GFX906_OPTIMIZED=ON \ + -DGGML_NATIVE=ON \ + -DGGML_CUDA_FA=ON \ + -DGGML_CUDA_FA_ALL_QUANTS=ON \ + -DGGML_CUDA_FORCE_MMQ=OFF \ + -DGGML_CUDA_FORCE_CUBLAS=OFF \ + -DGGML_CUDA_NO_PEER_COPY=ON \ + -DLLAMA_BUILD_SERVER=ON \ + -DLLAMA_BUILD_EXAMPLES=ON \ + -DLLAMA_BUILD_TOOLS=ON \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_CURL=ON \ + -DLLAMA_STATIC=OFF + +if [[ $? 
-ne 0 ]]; then + echo -e "${RED}✗ CMake configuration failed${NC}" + exit 1 +fi + +echo -e "${GREEN}✓ CMake configuration successful${NC}" + +# Compile with all CPU cores and dump detailed logs +NPROC=$(nproc) +LOG_FILE="compilation_log.txt" +echo -e "${YELLOW}Compiling with $NPROC cores...${NC}" +echo -e "${YELLOW}This may take several minutes...${NC}" +echo -e "${YELLOW}Detailed compilation log will be saved to: $LOG_FILE${NC}" + +# Clear previous log +> $LOG_FILE + +# Run make with detailed output and save to log file +make -j$NPROC 2>&1 | tee $LOG_FILE + +if [[ ${PIPESTATUS[0]} -ne 0 ]]; then + echo -e "${RED}✗ Compilation failed${NC}" + echo -e "${RED}Check $LOG_FILE for detailed error information${NC}" + exit 1 +fi + +echo -e "${GREEN}✓ Compilation successful!${NC}" + +# Verify the build +echo -e "${YELLOW}Verifying build...${NC}" + +# Check if main executables were built +EXECUTABLES=( + "bin/llama-cli" + "bin/llama-server" + "bin/llama-bench" + "bin/libggml-hip.so" +) + +ALL_GOOD=true +for exec in "${EXECUTABLES[@]}"; do + if [[ -f "$exec" ]]; then + echo -e "${GREEN}✓ $exec built successfully${NC}" + + # Check HIP linking for executables (not libraries) + if [[ "$exec" =~ ^bin/llama- && ! "$exec" =~ \.so$ ]]; then + if ldd "$exec" | grep -q "libggml-hip.so"; then + echo -e "${GREEN} ✓ HIP backend linked${NC}" + else + echo -e "${RED} ✗ HIP backend not linked${NC}" + ALL_GOOD=false + fi + fi + else + echo -e "${RED}✗ $exec not found${NC}" + ALL_GOOD=false + fi +done + +if [[ "$ALL_GOOD" = false ]]; then + echo -e "${RED}✗ Build verification failed${NC}" + exit 1 +fi + +# Display ROCm libraries linked +echo -e "${YELLOW}ROCm libraries linked:${NC}" +ldd bin/llama-cli | grep -E "(hip|roc)" | head -5 + +# Quick functionality test +echo -e "${YELLOW}Testing HIP backend availability...${NC}" +if ./bin/llama-cli --help 2>/dev/null | grep -q "backend"; then + echo -e "${GREEN}✓ llama-cli responding correctly${NC}" +else + echo -e "${RED}✗ llama-cli test failed${NC}" +fi + +# Success message +echo +echo -e "${GREEN}======================================${NC}" +echo -e "${GREEN} ✓ BUILD COMPLETED SUCCESSFULLY ${NC}" +echo -e "${GREEN}======================================${NC}" +echo +echo -e "${BLUE}Built executables:${NC}" +echo " • CLI: ./build/bin/llama-cli" +echo " • Server: ./build/bin/llama-server" +echo " • Bench: ./build/bin/llama-bench" +echo +echo -e "${BLUE}Optimizations enabled:${NC}" +echo " • Target GPU: AMD MI50 (gfx906)" +echo " • HIP/ROCm backend with MFMA support" +echo " • Flash Attention kernels" +echo " • All quantization formats" +echo " • Performance metrics export" +echo " • Native CPU optimizations" +echo " • Optimization 5: GFX906 compiler flags (-ffast-math, early-inline, function-calls=false)" +echo +echo -e "${BLUE}Ready to run:${NC}" +echo " ./SCRIPT_launch_server_MI50.sh " +echo +echo -e "${YELLOW}Note: Make sure to set proper ROCm environment variables before running!${NC}" +echo +echo -e "${BLUE}For debugging with maximum HIP logging:${NC}" +echo " export AMD_LOG_LEVEL=8" +echo " export AMD_LOG_MASK=0xFFFFFFFF" +echo " export AMD_SERIALIZE_KERNEL=3" +echo " ./SCRIPT_launch_server_MI50.sh 2>&1 | tee hip_debug.log" diff --git a/SCRIPT_launch_server_MI50.sh b/SCRIPT_launch_server_MI50.sh new file mode 100755 index 0000000000000..122aeb3434d0d --- /dev/null +++ b/SCRIPT_launch_server_MI50.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# +# Launch llama.cpp server with AMD MI50 ROCm support +# Built for gfx906 architecture +# + +# Set ROCm environment variables for MI50 ONLY 
(optimal configuration) +export HSA_OVERRIDE_GFX_VERSION=9.0.6 +export HIP_VISIBLE_DEVICES=0 # ONLY MI50 (Device 0) +export CUDA_VISIBLE_DEVICES=0 # Additional CUDA compatibility +export ROCR_VISIBLE_DEVICES=0 # ROCr runtime device selection +export GGML_BACKEND_HIP=1 +export HCC_AMDGPU_TARGET=gfx906 + +# Path to your model file - update this to your actual model path + MODEL_PATH="/home/iacopo/Downloads/Qwen3-30B-A3B-Thinking-2507-Q4_0.gguf" + +PARAMS=( + -m "$MODEL_PATH" + -ngl 99 # Offload all layers to GPU + -c 32000 # Context size + -np 1 # Parallel requests + -t $(nproc) # Use all CPU threads + --port 8090 # Server port + --host 0.0.0.0 # Listen on all interfaces + #--mlock # Lock model in memory + #--no-mmap # Don't use memory mapping + -b 512 # Batch size + #--cont-batching # Enable continuous batching + --flash-attn on # Enable flash attention + --cache-type-k q8_0 # q8_0 quantized K cache (50% memory savings) + --cache-type-v q8_0 # q8_0 quantized V cache (50% memory savings) + --main-gpu 0 # Force MI50 as main GPU + --device "ROCm0" # Explicit ROCm device + # --no-warmup # Skip warmup for consistent profiling +) + +# Check if model file exists +if [ ! -f "$MODEL_PATH" ]; then + echo "Error: Model file not found at: $MODEL_PATH" + echo "Usage: $0 [model_path] [additional_args...]" + echo "" + echo "Example: $0 ./models/llama-2-7b-chat.q4_0.gguf --ctx-size 8192" + exit 1 +fi + +# Display GPU info +echo "=== ROCm GPU Information ===" +rocm-smi --showproductname --showtemp --showmeminfo --showuse --showpower +echo "" + +# Launch llama.cpp server +echo "=== Launching llama.cpp server with MI50 optimization ===" +echo "Model: $MODEL_PATH" +echo "GPU: MI50 (gfx906)" +echo "Server will be available at: http://localhost:8080" +echo "Parameters: ${PARAMS[*]} ${@:2}" +echo "" + +cd "$(dirname "$0")" +./build/bin/llama-server "${PARAMS[@]}" "${@:2}" diff --git a/SCRIPT_llama_bench.sh b/SCRIPT_llama_bench.sh new file mode 100755 index 0000000000000..e70804280fcda --- /dev/null +++ b/SCRIPT_llama_bench.sh @@ -0,0 +1,152 @@ +#!/bin/bash +# +# Run llama-bench with AMD MI50 ROCm support and GFX906 optimizations +# Built for gfx906 architecture - matches SCRIPT_launch_server configuration +# + +# Set ROCm environment variables for MI50 ONLY (optimal configuration) +export HSA_OVERRIDE_GFX_VERSION=9.0.6 +export HIP_VISIBLE_DEVICES=0 # ONLY MI50 (Device 0) +export CUDA_VISIBLE_DEVICES=0 # Additional CUDA compatibility +export ROCR_VISIBLE_DEVICES=0 # ROCr runtime device selection +export GGML_BACKEND_HIP=1 +export HCC_AMDGPU_TARGET=gfx906 + +# Path to your model file - update this to your actual model path +MODEL_PATH="/home/iacopo/Downloads/Qwen3-30B-A3B-Thinking-2507-Q4_0.gguf" +# MODEL_PATH="/home/iacopo/Downloads/Qwen_Qwen3-4B-Instruct-2507-Q4_0.gguf" + +# Default benchmark parameters (matching server configuration) +BENCH_PARAMS=( + -m "$MODEL_PATH" + -ngl 99 # Offload all layers to GPU + -b 1024 # Batch size (matches server) + -t $(nproc) # Use all CPU threads + -fa 1 # Enable flash attention (GFX906 optimized + -ctk q8_0 # q8_0 quantized K cache (matches server) + -ctv q8_0 # q8_0 quantized V cache (matches server) + -d 512 + --main-gpu 0 # Force MI50 as main GPU + --progress # Show progress indicators +) + +# Benchmark configurations +QUICK_TEST="-p 512 -n 128" # Quick test +STANDARD_TEST="-p 512,1024,2048,4096 -n 128" # Standard comprehensive test +PROMPT_FOCUS="-p 512,1024,2048,4096,8192 -n 64" # Focus on prompt processing +GENERATION_FOCUS="-p 512 -n 128,256,512,1024" # Focus on text 
generation +EXTENSIVE_TEST="-p 512,1024,2048,4096,8192 -n 128,256,512" # Extensive testing + +# Function to display usage +usage() { + echo "Usage: $0 [test_type] [additional_llama-bench_args...]" + echo "" + echo "Test types:" + echo " quick - Quick test (512 prompt, 128 generation)" + echo " standard - Standard test (multiple prompt sizes, 2 gen sizes) [DEFAULT]" + echo " prompt - Focus on prompt processing (up to 8K prompts)" + echo " generation - Focus on text generation (multiple lengths)" + echo " extensive - Extensive testing (all combinations)" + echo " custom - Use your own parameters (provide as additional args)" + echo "" + echo "Examples:" + echo " $0 # Run standard benchmark" + echo " $0 quick # Run quick benchmark" + echo " $0 prompt # Test prompt processing" + echo " $0 custom -p 1024 -n 256 # Custom benchmark" + echo "" + echo "Model path: $MODEL_PATH" + echo "Output format: markdown (default), add -o csv for CSV output" +} + +# Check if model file exists +if [ ! -f "$MODEL_PATH" ]; then + echo "Error: Model file not found at: $MODEL_PATH" + echo "Please update MODEL_PATH in this script or ensure the model exists." + exit 1 +fi + +# Parse command line arguments +TEST_TYPE="${1:-standard}" +shift # Remove first argument, rest will be passed to llama-bench + +case "$TEST_TYPE" in + "help"|"-h"|"--help") + usage + exit 0 + ;; + "quick") + TEST_PARAMS="$QUICK_TEST" + echo "=== Running Quick Benchmark ===" + ;; + "standard") + TEST_PARAMS="$STANDARD_TEST" + echo "=== Running Standard Benchmark ===" + ;; + "prompt") + TEST_PARAMS="$PROMPT_FOCUS" + echo "=== Running Prompt Processing Focused Benchmark ===" + ;; + "generation") + TEST_PARAMS="$GENERATION_FOCUS" + echo "=== Running Text Generation Focused Benchmark ===" + ;; + "extensive") + TEST_PARAMS="$EXTENSIVE_TEST" + echo "=== Running Extensive Benchmark (this will take a while) ===" + ;; + "custom") + TEST_PARAMS="" + echo "=== Running Custom Benchmark ===" + echo "Custom parameters: $@" + ;; + *) + echo "Unknown test type: $TEST_TYPE" + usage + exit 1 + ;; +esac + +# Display system info +echo "Model: $(basename "$MODEL_PATH")" +echo "GPU: MI50 (gfx906) - Device 0 only" +echo "Flash Attention: ENABLED (GFX906 optimized)" +echo "KV Cache: q8_0 quantized" +echo "" + +# Display GPU info +echo "=== ROCm GPU Information ===" +rocm-smi --showproductname --showtemp --showmeminfo --showuse --showpower +echo "" + +# Change to script directory +cd "$(dirname "$0")" + +# Check if llama-bench exists +if [ ! -f "./build/bin/llama-bench" ]; then + echo "Error: llama-bench not found. Please compile the project first:" + echo " ./SCRIPT_compile_MI50.sh" + exit 1 +fi + +# Run the benchmark +echo "=== Starting llama-bench with GFX906 Flash Attention ===" +echo "Command: ./build/bin/llama-bench ${BENCH_PARAMS[*]} $TEST_PARAMS $@" +echo "" + +./build/bin/llama-bench "${BENCH_PARAMS[@]}" $TEST_PARAMS "$@" + +BENCH_EXIT_CODE=$? 
+ +echo "" +echo "=== Benchmark Complete ===" +if [ $BENCH_EXIT_CODE -eq 0 ]; then + echo "✓ Benchmark completed successfully" + echo "" + echo "Tip: Add '-o csv' to get CSV output for analysis" + echo "Tip: Add '-r 10' to run more repetitions for better accuracy" +else + echo "✗ Benchmark failed with exit code: $BENCH_EXIT_CODE" +fi + +exit $BENCH_EXIT_CODE diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index c4246b65eb788..f87e2b1398496 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -387,6 +387,99 @@ struct ggml_cuda_unroll<1> { } }; +//-----------------------------------------------------------------------------------------------------------------// +// AMD GFX906-optimized DS_SWIZZLE reduction primitives +#ifdef GGML_USE_HIP +__device__ __forceinline__ __attribute__((always_inline)) float ds_swizzle_xor_isa(float value, int xor_mask) { + int int_value = __float_as_int(value); + int result_int; + if (xor_mask == 1) { + result_int = __builtin_amdgcn_ds_swizzle(int_value, 0x041F); + } else if (xor_mask == 2) { + result_int = __builtin_amdgcn_ds_swizzle(int_value, 0x081F); + } else if (xor_mask == 4) { + result_int = __builtin_amdgcn_ds_swizzle(int_value, 0x101F); + } else if (xor_mask == 8) { + result_int = __builtin_amdgcn_ds_swizzle(int_value, 0x201F); + } else if (xor_mask == 16) { + result_int = __builtin_amdgcn_ds_swizzle(int_value, 0x401F); + } else { + result_int = int_value; // No swizzle for unsupported masks + } + return __int_as_float(result_int); +} + +template +__device__ __forceinline__ __attribute__((always_inline)) T amd_reduce_sum(T value) { + if constexpr (sizeof(T) == 4 && logical_width <= 32) { + if constexpr (logical_width >= 2) value += ds_swizzle_xor_isa(value, 1); + if constexpr (logical_width >= 4) value += ds_swizzle_xor_isa(value, 2); + if constexpr (logical_width >= 8) value += ds_swizzle_xor_isa(value, 4); + if constexpr (logical_width >= 16) value += ds_swizzle_xor_isa(value, 8); + if constexpr (logical_width >= 32) value += ds_swizzle_xor_isa(value, 16); + } else { +#pragma unroll + for (int offset = logical_width/2; offset > 0; offset >>= 1) { + value += __shfl_xor_sync(0xffffffff, value, offset, logical_width); + } + } + return value; +} + +template +__device__ __forceinline__ __attribute__((always_inline)) T amd_reduce_max(T value) { + if constexpr (sizeof(T) == 4 && logical_width <= 32) { + T shuffled; + if constexpr (logical_width >= 2) { + shuffled = ds_swizzle_xor_isa(value, 1); + value = (value > shuffled) ? value : shuffled; + } + if constexpr (logical_width >= 4) { + shuffled = ds_swizzle_xor_isa(value, 2); + value = (value > shuffled) ? value : shuffled; + } + if constexpr (logical_width >= 8) { + shuffled = ds_swizzle_xor_isa(value, 4); + value = (value > shuffled) ? value : shuffled; + } + if constexpr (logical_width >= 16) { + shuffled = ds_swizzle_xor_isa(value, 8); + value = (value > shuffled) ? value : shuffled; + } + if constexpr (logical_width >= 32) { + shuffled = ds_swizzle_xor_isa(value, 16); + value = (value > shuffled) ? value : shuffled; + } + } else { +#pragma unroll + for (int offset = logical_width/2; offset > 0; offset >>= 1) { + T shuffled = __shfl_xor_sync(0xffffffff, value, offset, logical_width); + value = (value > shuffled) ? 
value : shuffled; + } + } + return value; +} + +template +__device__ __forceinline__ half amd_reduce_max_half(half value) { + if constexpr (logical_width <= 32) { + float float_val = __half2float(value); + float_val = amd_reduce_max(float_val); + return __float2half(float_val); + } else { +#pragma unroll + for (int offset = logical_width/2; offset > 0; offset >>= 1) { + half shuffled = __shfl_xor_sync(0xffffffff, value, offset, logical_width); + float val_f = __half2float(value); + float shuffled_f = __half2float(shuffled); + value = __float2half(fmaxf(val_f, shuffled_f)); + } + return value; + } +} + +#endif +//-----------------------------------------------------------------------------------------------------------------// template static __device__ __forceinline__ int warp_reduce_sum(int x) { #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE @@ -397,16 +490,21 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) { x += __shfl_xor_sync(0xffffffff, x, offset, width); } return x; -#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE +#endif } template static __device__ __forceinline__ float warp_reduce_sum(float x) { +#ifdef GGML_USE_HIP + return amd_reduce_sum(x); +#else + // Standard CUDA implementation #pragma unroll for (int offset = width/2; offset > 0; offset >>= 1) { x += __shfl_xor_sync(0xffffffff, x, offset, width); } return x; +#endif // GGML_USE_HIP } template @@ -462,11 +560,30 @@ static __device__ __forceinline__ int warp_reduce_any(int x) { template static __device__ __forceinline__ float warp_reduce_max(float x) { +#ifdef GGML_USE_HIP + return amd_reduce_max(x); +#else + // Standard CUDA implementation #pragma unroll for (int offset = width/2; offset > 0; offset >>= 1) { x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, width)); } return x; +#endif // GGML_USE_HIP +} + +// Overload for half precision with DS_SWIZZLE optimization +template +static __device__ __forceinline__ half warp_reduce_max(half x) { +#ifdef GGML_USE_HIP + return amd_reduce_max_half(x); +#else +#pragma unroll + for (int offset = width/2; offset > 0; offset >>= 1) { + x = ggml_cuda_hmax(x, __shfl_xor_sync(0xffffffff, x, offset, width)); + } + return x; +#endif // GGML_USE_HIP } static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) { diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh index 59c62553b01a2..3ec27a7f72ac3 100644 --- a/ggml/src/ggml-cuda/fattn-vec.cuh +++ b/ggml/src/ggml-cuda/fattn-vec.cuh @@ -66,9 +66,9 @@ static __global__ void flash_attn_ext_vec( #ifdef RDNA constexpr int nthreads_KQ_q = 2; #else - constexpr int nthreads_KQ_q = 4; + constexpr int nthreads_KQ_q = 2; #endif // RDNA - constexpr int nthreads_V_q = (D/4 < 32 ? D/4 : 32); + constexpr int nthreads_V_q = 4; #else constexpr int nthreads_KQ_q = (D/4 < 32 ? D/4 : 32); constexpr int nthreads_V_q = (D/4 < 32 ? D/4 : 32);
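
---

**Appendix: the swizzle reduction in isolation.** The core change above swaps the `__shfl_xor_sync` butterfly (which ROCm typically lowers to `ds_bpermute_b32`) for `__builtin_amdgcn_ds_swizzle`, whose lane pattern is a compile-time immediate. In the bitmasked swizzle mode used by the patch, the offsets `0x041F`, `0x081F`, `0x101F`, `0x201F` and `0x401F` encode `lane XOR 1/2/4/8/16` within each 32-lane group. The sketch below is illustrative only and is not part of the patch: the file, kernel and buffer names are invented, and it simply reproduces the same reduction pattern so it can be inspected and profiled on its own.

```cpp
// swizzle_demo.hip.cpp -- minimal sketch of the XOR-mode ds_swizzle reduction (illustrative only).
// Build (assumed toolchain, adjust to your setup): hipcc --offload-arch=gfx906 swizzle_demo.hip.cpp -o swizzle_demo
#include <hip/hip_runtime.h>
#include <cstdio>

// Butterfly sum over a 32-lane group. Bitmasked swizzle offsets: bits [4:0] = AND mask (0x1F keeps
// the lane id), bits [14:10] = XOR mask, so each call reads the value held by lane^1, ^2, ^4, ^8, ^16.
__device__ float group32_sum(float v) {
    v += __int_as_float(__builtin_amdgcn_ds_swizzle(__float_as_int(v), 0x041F)); // partner 1 lane away
    v += __int_as_float(__builtin_amdgcn_ds_swizzle(__float_as_int(v), 0x081F)); // partner 2 lanes away
    v += __int_as_float(__builtin_amdgcn_ds_swizzle(__float_as_int(v), 0x101F)); // partner 4 lanes away
    v += __int_as_float(__builtin_amdgcn_ds_swizzle(__float_as_int(v), 0x201F)); // partner 8 lanes away
    v += __int_as_float(__builtin_amdgcn_ds_swizzle(__float_as_int(v), 0x401F)); // partner 16 lanes away
    return v; // every lane of the 32-lane group now holds the group sum
}

__global__ void reduce_demo(const float * in, float * out) {
    const int i = threadIdx.x;
    const float sum = group32_sum(in[i]);
    if (i % 32 == 0) {
        out[i / 32] = sum; // one result per 32-lane group
    }
}

int main() {
    float h_in[64], h_out[2] = {0.0f, 0.0f};
    for (int i = 0; i < 64; ++i) {
        h_in[i] = 1.0f; // each 32-lane group should sum to 32
    }
    float * d_in  = nullptr;
    float * d_out = nullptr;
    hipMalloc((void **) &d_in,  sizeof(h_in));
    hipMalloc((void **) &d_out, sizeof(h_out));
    hipMemcpy(d_in, h_in, sizeof(h_in), hipMemcpyHostToDevice);
    reduce_demo<<<1, 64>>>(d_in, d_out); // one wave64 on gfx906 = two 32-lane swizzle groups
    hipMemcpy(h_out, d_out, sizeof(h_out), hipMemcpyDeviceToHost);
    printf("group sums: %.1f %.1f (expected 32.0 32.0)\n", h_out[0], h_out[1]);
    hipFree(d_in);
    hipFree(d_out);
    return 0;
}
```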