diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 03cde0a48436f..7db85528659d3 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -676,6 +676,35 @@ jobs: -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + macOS-latest-cmake-visionos: + runs-on: macos-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + + - name: Build + id: cmake_build + run: | + sysctl -a + cmake -B build -G Xcode \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + -DCMAKE_SYSTEM_NAME=visionOS \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \ + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + macOS-latest-swift: runs-on: macos-latest diff --git a/build-xcframework.sh b/build-xcframework.sh index 37833dc4eabcb..2ce3939c43d6c 100755 --- a/build-xcframework.sh +++ b/build-xcframework.sh @@ -432,8 +432,8 @@ cmake -B build-visionos -G Xcode \ -DCMAKE_SYSTEM_NAME=visionOS \ -DCMAKE_OSX_SYSROOT=xros \ -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \ - -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \ + -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ -S . cmake --build build-visionos --config Release -- -quiet @@ -445,8 +445,8 @@ cmake -B build-visionos-sim -G Xcode \ -DCMAKE_SYSTEM_NAME=visionOS \ -DCMAKE_OSX_SYSROOT=xrsimulator \ -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \ - -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \ + -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ -S . cmake --build build-visionos-sim --config Release -- -quiet diff --git a/ci/README.md b/ci/README.md index 8245c9df65db8..db4d9066816e8 100644 --- a/ci/README.md +++ b/ci/README.md @@ -26,4 +26,43 @@ GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # with SYCL support source /opt/intel/oneapi/setvars.sh GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt + +# with MUSA support +GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +``` + +## Running MUSA CI in a Docker Container + +Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container: + +### 1. Create a local directory to store cached models, configuration files and venv: + +```bash +mkdir -p $HOME/llama.cpp/ci-cache +``` + +### 2. Create a local directory to store CI run results: + +```bash +mkdir -p $HOME/llama.cpp/ci-results +``` + +### 3. 
Start a Docker container and run the CI: + +```bash +docker run --privileged -it \ + -v $HOME/llama.cpp/ci-cache:/ci-cache \ + -v $HOME/llama.cpp/ci-results:/ci-results \ + -v $PWD:/ws -w /ws \ + mthreads/musa:rc3.1.1-devel-ubuntu22.04 ``` + +Inside the container, execute the following commands: + +```bash +apt update -y && apt install -y cmake git python3.10-venv wget +git config --global --add safe.directory /ws +GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache +``` + +This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs. diff --git a/ci/run.sh b/ci/run.sh index 9fc19c89d80d2..efc24391d2e7e 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -16,6 +16,9 @@ # # with VULKAN support # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # +# # with MUSA support +# GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +# if [ -z "$2" ]; then echo "usage: $0 " @@ -52,13 +55,22 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then echo "source /opt/intel/oneapi/setvars.sh" exit 1 fi - + # Use only main GPU + export ONEAPI_DEVICE_SELECTOR="level_zero:0" + # Enable sysman for correct memory reporting + export ZES_ENABLE_SYSMAN=1 CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON" fi if [ ! -z ${GG_BUILD_VULKAN} ]; then CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1" fi + +if [ ! -z ${GG_BUILD_MUSA} ]; then + # Use qy1 by default (MTT S80) + MUSA_ARCH=${MUSA_ARCH:-21} + CMAKE_EXTRA="-DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}" +fi ## helpers # download a file if it does not exist or if it is outdated @@ -808,7 +820,7 @@ export LLAMA_LOG_PREFIX=1 export LLAMA_LOG_TIMESTAMPS=1 if [ -z ${GG_BUILD_LOW_PERF} ]; then - # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt + # Create symlink: ./llama.cpp/models-mnt -> $MNT/models rm -rf ${SRC}/models-mnt mnt_models=${MNT}/models mkdir -p ${mnt_models} @@ -826,8 +838,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then fi ret=0 - -test $ret -eq 0 && gg_run ctest_debug +if [ -z ${GG_BUILD_SYCL} ]; then + # SYCL build breaks with debug build flags + test $ret -eq 0 && gg_run ctest_debug +fi test $ret -eq 0 && gg_run ctest_release if [ -z ${GG_BUILD_LOW_PERF} ]; then @@ -835,7 +849,9 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then test $ret -eq 0 && gg_run rerank_tiny if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then - test $ret -eq 0 && gg_run test_scripts_debug + if [ -z ${GG_BUILD_SYCL} ]; then + test $ret -eq 0 && gg_run test_scripts_debug + fi test $ret -eq 0 && gg_run test_scripts_release fi @@ -846,7 +862,9 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then test $ret -eq 0 && gg_run pythia_2_8b #test $ret -eq 0 && gg_run open_llama_7b_v2 fi - test $ret -eq 0 && gg_run ctest_with_model_debug + if [ -z ${GG_BUILD_SYCL} ]; then + test $ret -eq 0 && gg_run ctest_with_model_debug + fi test $ret -eq 0 && gg_run ctest_with_model_release fi fi diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d13d57c54154a..76ab4233ef2c1 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -180,7 +180,8 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: extra = sorted(tensor_names_from_parts.difference(self.tensor_names)) missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map)) if len(extra) == 0 and len(missing_files) > 0: - raise ValueError(f"Missing or incomplete model files: {missing_files}") + raise ValueError(f"Missing or incomplete model files: {missing_files}\n" + f"Missing tensors: 
{missing}") else: raise ValueError("Mismatch between weight map and model parts for tensor names:\n" f"Missing tensors: {missing}\n" @@ -528,6 +529,8 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} added_vocab = tokenizer.get_added_vocab() + added_tokens_decoder = tokenizer.added_tokens_decoder + for i in range(vocab_size): if i not in reverse_vocab: tokens.append(f"[PAD{i}]") @@ -537,13 +540,13 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: if token in added_vocab: # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. # To avoid unexpected issues - we make sure to normalize non-normalized tokens - if not tokenizer.added_tokens_decoder[i].normalized: + if not added_tokens_decoder[i].normalized: previous_token = token token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) if previous_token != token: logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") - if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token): + if added_tokens_decoder[i].special or self.does_token_look_special(token): toktypes.append(gguf.TokenType.CONTROL) else: # NOTE: this was added for Gemma. @@ -702,6 +705,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e": # ref: https://huggingface.co/Xenova/gpt-4o res = "gpt-4o" + if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f": + # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k + res = "superbpe" if res is None: logger.warning("\n") @@ -1099,13 +1105,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter tensors.append((self.map_tensor_name(name), data_torch)) - if name == "word_embeddings.weight": - assert self.tensor_names is not None - - # TODO: tie them at runtime, don't duplicate in the model file - if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - return tensors @@ -1747,6 +1746,25 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("Mistral3ForConditionalGeneration") +class Mistral3Model(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA + + # we need to merge the text_config into the root level of hparams + def __init__(self, *args, **kwargs): + hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0]) + if "text_config" in hparams: + hparams = {**hparams, **hparams["text_config"]} + kwargs["hparams"] = hparams + super().__init__(*args, **kwargs) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + name = name.replace("language_model.", "") + if "multi_modal_projector" in name or "vision_tower" in name: + return [] + return super().modify_tensors(data_torch, name, bid) + + @Model.register("DeciLMForCausalLM") class DeciModel(Model): model_arch = gguf.MODEL_ARCH.DECI @@ -2404,10 +2422,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter tensors.append((new_name, data_torch)) - # note: GPT2 output is tied to (same as) wte in original model - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - return tensors @@ -2737,21 +2751,26 @@ 
def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(1.0) + _has_tok_embd = False + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - new_name = self.map_tensor_name(name) - - tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)] + output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) + tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - assert self.tensor_names is not None + new_name = self.map_tensor_name(name) - if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): - # copy tok_embd.weight to output.weight - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) + # assuming token_embd.weight is seen before output.weight + if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): + # even though the tensor file(s) does not contain the word embeddings they are still in the weight map + if self.tensor_names and "transformer.wte.weight" in self.tensor_names: + logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied") + self.tensor_names.remove("transformer.wte.weight") + elif new_name == tok_embd_name: + self._has_tok_embd = True - return tensors + return [(new_name, data_torch)] @Model.register("InternLM2ForCausalLM") @@ -3366,7 +3385,7 @@ class Gemma3Model(Model): # we need to merge the text_config into the root level of hparams def __init__(self, *args, **kwargs): - hparams = Model.load_hparams(kwargs["dir_model"]) + hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0]) if "text_config" in hparams: hparams = {**hparams, **hparams["text_config"]} kwargs["hparams"] = hparams @@ -5339,7 +5358,7 @@ def main() -> None: logger.error(f"Model {model_architecture} is not supported") sys.exit(1) - model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out, + model_instance = model_class(dir_model, output_type, fname_out, is_big_endian=args.bigendian, use_temp_file=args.use_temp_file, eager=args.no_lazy, metadata_override=args.metadata, model_name=args.model_name, diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 07d3ce0e4eb78..ca90cf592932b 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -110,6 +110,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"}, {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"}, {"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", }, + {"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", }, ] diff --git a/docs/cuda-fedora.md b/docs/backend/CUDA-FEDORA.md similarity index 78% rename from docs/cuda-fedora.md rename to docs/backend/CUDA-FEDORA.md index 75cd2b499d086..1508faf776d28 100644 --- a/docs/cuda-fedora.md +++ b/docs/backend/CUDA-FEDORA.md @@ -14,9 +14,7 @@ In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox - [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment) - [Installing Essential Development Tools](#installing-essential-development-tools) 
- [Adding the CUDA Repository](#adding-the-cuda-repository)
-- [Installing `nvidia-driver-libs`](#installing-nvidia-driver-libs)
-- [Manually Resolving Package Conflicts](#manually-resolving-package-conflicts)
-- [Finalizing the Installation of `nvidia-driver-libs`](#finalizing-the-installation-of-nvidia-driver-libs)
+- [Installing Nvidia Driver Libraries](#installing-nvidia-driver-libraries)
- [Installing the CUDA Meta-Package](#installing-the-cuda-meta-package)
- [Configuring the Environment](#configuring-the-environment)
- [Verifying the Installation](#verifying-the-installation)
@@ -67,7 +65,7 @@ This guide focuses on Fedora hosts, but with small adjustments, it can work for
sudo dnf distro-sync
```

-2. **Install the Default Text Editor (Optional):**
+2. **Install Vim, the default text editor (Optional):**

```bash
sudo dnf install vim-default-editor --allowerasing
```
@@ -97,36 +95,48 @@ After adding the repository, synchronize the package manager again:
sudo dnf distro-sync
```

-## Installing `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
+## Installing Nvidia Driver Libraries

-We need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go).
+First, we need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go):

```bash
ls -la /usr/lib64/libcuda.so.1
```

-**Explanation:**
+### If *`libcuda.so.1`* is missing:
+
+```
+ls: cannot access '/usr/lib64/libcuda.so.1': No such file or directory
+```

-- `nvidia-driver-libs` and `nvidia-driver-cuda-libs` contains necessary NVIDIA driver libraries required by CUDA,
-  on hosts with NVIDIA drivers installed the Fedora Container will supply the host libraries.
+**Explanation:**
+The host does not supply the CUDA drivers, **install them now:**

-### Install Nvidia Driver Libraries on Guest (if `libcuda.so.1` was NOT found).
+#### Install the Nvidia Driver Libraries on Guest:

```bash
-sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
+sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
```

-### Manually Updating the RPM database for host-supplied NVIDIA drivers (if `libcuda.so.1` was found).
+### If *`libcuda.so.1`* exists:
+```
+lrwxrwxrwx. 1 root root 21 Mar 24 11:26 /usr/lib64/libcuda.so.1 -> libcuda.so.570.133.07
+```
+
+**Explanation:**
+The host is supplying the CUDA drivers, **so we need to update the guest RPM Database accordingly:**

-If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files.
+#### Update the Toolbox RPM Database to include the Host-Supplied Libraries:

-#### 1. Download `nvidia-driver-libs` and `nvidia-driver-cuda-libs` RPM's (with dependencies)
+Note: we do not actually install the libraries, we just update the DB so that the guest system knows they are supplied by the host.
+
+##### 1. Download the host-supplied `nvidia-` driver RPMs (with dependencies)

```bash
-sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-libs nvidia-driver-cuda-libs
+sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
```

-#### 2. Update the RPM database to assume the installation of these packages.
+##### 2. Update the RPM database to assume the installation of these packages.

```bash
sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*
```

**Note:**

-- The `--justdb` option only updates the RPM database, without touching the filesystem.
+- The `--justdb` option only updates the RPM database, without touching the filesystem elsewhere.
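For a quicker confirmation that the entries landed in the database, querying it directly also works; a minimal check, assuming the same package set used above:

```bash
rpm -q nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
```

Each package should be reported with the downloaded version, even though no files were copied onto the guest filesystem.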
+
+##### Check that the RPM Database has been correctly updated:

-#### Finalizing the Installation of `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
+**Note:** This is the same command as in the *"Install the Nvidia Driver Libraries on Guest"* section above, for the case where *`libcuda.so.1`* was missing.

-After manually installing the dependencies, run:
```bash
-sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
+sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
```

-You should receive a message indicating the package is already installed:
+*(this time it will not install anything, as the database thinks that these packages are already installed)*

```
Updating and loading repositories:
Repositories loaded.
-Package "nvidia-driver-libs-3:570.86.10-1.fc41.x86_64" is already installed.
-Package "nvidia-driver-cuda-libs-3:570.86.10-1.fc41.x86_64" is already installed.
+Package "nvidia-driver-cuda-3:570.124.06-1.fc41.x86_64" is already installed.
+Package "nvidia-driver-libs-3:570.124.06-1.fc41.x86_64" is already installed.
+Package "nvidia-driver-cuda-libs-3:570.124.06-1.fc41.x86_64" is already installed.
+Package "nvidia-persistenced-3:570.124.06-1.fc41.x86_64" is already installed.
Nothing to do.
```
@@ -207,9 +220,9 @@ You should see output similar to:
```
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
-Built on Wed_Jan_15_19:20:09_PST_2025
-Cuda compilation tools, release 12.8, V12.8.61
-Build cuda_12.8.r12.8/compiler.35404655_0
+Built on Fri_Feb_21_20:23:50_PST_2025
+Cuda compilation tools, release 12.8, V12.8.93
+Build cuda_12.8.r12.8/compiler.35583870_0
```

This output confirms that the CUDA compiler is accessible and indicates the installed version.
diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 184cd419554f8..f97e696aaaac8 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -237,6 +237,15 @@ cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENAB
cmake --build buildWithCublas --config Release
```

+**oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:
+
+```sh
+git clone https://github.com/oneapi-src/oneDNN.git
+cd oneDNN
+cmake -GNinja -Bbuild-nvidia -DDNNL_CPU_RUNTIME=DPCPP -DDNNL_GPU_RUNTIME=DPCPP -DDNNL_GPU_VENDOR=NVIDIA -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake --build build-nvidia --config Release
+```
+
- **Adding support to AMD GPUs**

**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
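The `-DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl` value used in the next hunk assumes that the oneDNN build above was also installed into an `install` prefix inside its build tree; a sketch of that extra step (the prefix location is an assumption, point it wherever you want the CMake package files to land):

```sh
cmake --install build-nvidia --config Release --prefix "$(pwd)/build-nvidia/install"
```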
@@ -327,10 +336,10 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture

# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl

# Option 2: Use FP16
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl

# build all binary
cmake --build build --config Release -j -v
diff --git a/docs/build.md b/docs/build.md
index 2e3975c145360..fbf12c7664d50 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -132,12 +132,14 @@ You may find the official downloads here: [NVIDIA developer site](https://develo

#### Compile and run inside a Fedora Toolbox Container

-We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
+We also have a [guide](./backend/CUDA-FEDORA.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).

**Recommended for:**
-
-- ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
-- Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
+- ***Necessary*** for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
+  - (there are no supported CUDA packages for these systems)
+- ***Necessary*** for users whose host is not a [Supported Nvidia CUDA Release Platform](https://developer.nvidia.com/cuda-downloads).
+  - (for example, you may have [Fedora 42 Beta](https://fedoramagazine.org/announcing-fedora-linux-42-beta/) as your host operating system)
+- ***Convenient*** for those running [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde) who want to keep their host system clean.
- *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)
@@ -433,6 +435,26 @@ llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).

+## ArmĀ® KleidiAIā„¢
+KleidiAI is a library of optimized microkernels for AI workloads, specifically designed for Arm CPUs. These microkernels enhance performance and can be enabled for use by the CPU backend.
+
+To enable KleidiAI, go to the llama.cpp directory and build using CMake:
+```bash
+cmake -B build -DGGML_CPU_KLEIDIAI=ON
+cmake --build build --config Release
+```
+You can verify that KleidiAI is being used by running
+```bash
+./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
+```
+If KleidiAI is enabled, the output will contain a line similar to:
+```
+load_tensors: CPU_KLEIDIAI model buffer size = 3474.00 MiB
+```
+KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm and SME. llama.cpp selects the most efficient kernel based on runtime CPU feature detection. However, on platforms that support SME, you must manually enable SME microkernels by setting the environment variable `GGML_KLEIDIAI_SME=1`.
+
+Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. -DGGML_METAL=OFF, or during run-time using the command line option `--device none`.
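Putting the two runtime knobs from this section together, a run that forces the CPU backend and enables the SME microkernels might look like this (`PATH_TO_MODEL` is a placeholder, and `--device none` is only needed when a higher-priority backend was compiled in):

```bash
GGML_KLEIDIAI_SME=1 ./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?" --device none
```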
+
## Android

To read documentation for how to build on Android, [click here](./android.md)
diff --git a/docs/install.md b/docs/install.md
index 0e23a2c9e7ae1..4971c18281cc9 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -9,6 +9,13 @@ brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668
+## MacPorts
+
+```sh
+sudo port install llama.cpp
+```
+See also: https://ports.macports.org/port/llama.cpp/details/
+
## Nix

On Mac and Linux, the Nix package manager can be used via
diff --git a/examples/run/run.cpp b/examples/run/run.cpp
index 462a6d151933e..68e94b0b3c3f8 100644
--- a/examples/run/run.cpp
+++ b/examples/run/run.cpp
@@ -38,24 +38,6 @@
}
#endif

-GGML_ATTRIBUTE_FORMAT(1, 2)
-static std::string fmt(const char * fmt, ...) {
-    va_list ap;
-    va_list ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    const int size = vsnprintf(NULL, 0, fmt, ap);
-    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
-    std::string buf;
-    buf.resize(size);
-    const int size2 = vsnprintf(const_cast<char *>(buf.data()), buf.size() + 1, fmt, ap2);
-    GGML_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-
-    return buf;
-}
-
GGML_ATTRIBUTE_FORMAT(1, 2)
static int printe(const char * fmt, ...)
{ va_list args; @@ -525,11 +507,11 @@ class HttpClient { int secs = static_cast(seconds) % 60; if (hrs > 0) { - return fmt("%dh %02dm %02ds", hrs, mins, secs); + return string_format("%dh %02dm %02ds", hrs, mins, secs); } else if (mins > 0) { - return fmt("%dm %02ds", mins, secs); + return string_format("%dm %02ds", mins, secs); } else { - return fmt("%ds", secs); + return string_format("%ds", secs); } } @@ -544,7 +526,7 @@ class HttpClient { } } - return fmt("%.2f %s", dbl_size, suffix[i]); + return string_format("%.2f %s", dbl_size, suffix[i]); } static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t, @@ -578,7 +560,9 @@ class HttpClient { return (now_downloaded_plus_file_size * 100) / total_to_download; } - static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", static_cast(percentage)); } + static std::string generate_progress_prefix(curl_off_t percentage) { + return string_format("%3ld%% |", static_cast(percentage)); + } static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) { const auto now = std::chrono::steady_clock::now(); @@ -589,9 +573,9 @@ class HttpClient { static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download, double speed, double estimated_time) { const int width = 10; - return fmt("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), width, - human_readable_size(total_to_download).c_str(), width, human_readable_size(speed).c_str(), width, - human_readable_time(estimated_time).c_str()); + return string_format("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), + width, human_readable_size(total_to_download).c_str(), width, + human_readable_size(speed).c_str(), width, human_readable_time(estimated_time).c_str()); } static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) { diff --git a/examples/server/public/index.html.gz b/examples/server/public/index.html.gz index c7a3c426b623c..d0e6da8e4a1e0 100644 Binary files a/examples/server/public/index.html.gz and b/examples/server/public/index.html.gz differ diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c2f1afeca450d..18caa9127662d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -830,6 +830,11 @@ struct server_task_result_cmpl_final : server_task_result { ret.push_back({"timings", timings.to_json()}); } + // extra fields for debugging purposes + if (verbose) { + ret["__verbose"] = to_json_non_oaicompat(); + } + return ret; } }; diff --git a/examples/server/webui/src/components/ChatScreen.tsx b/examples/server/webui/src/components/ChatScreen.tsx index 79de305326241..d12b06e125e5a 100644 --- a/examples/server/webui/src/components/ChatScreen.tsx +++ b/examples/server/webui/src/components/ChatScreen.tsx @@ -99,13 +99,9 @@ export default function ChatScreen() { canvasData, replaceMessageAndGenerate, } = useAppContext(); - const [inputMsg, setInputMsg] = useState(prefilledMsg.content()); - const inputRef = useRef(null); + const textarea = useOptimizedTextarea(prefilledMsg.content()); - const { extraContext, clearExtraContext } = useVSCodeContext( - inputRef, - setInputMsg - ); + const { extraContext, clearExtraContext } = useVSCodeContext(textarea); // TODO: improve this when we have "upload file" feature const currExtra: Message['extra'] = extraContext ? 
[extraContext] : undefined; @@ -135,9 +131,10 @@ export default function ChatScreen() { }; const sendNewMessage = async () => { - if (inputMsg.trim().length === 0 || isGenerating(currConvId ?? '')) return; - const lastInpMsg = inputMsg; - setInputMsg(''); + const lastInpMsg = textarea.value(); + if (lastInpMsg.trim().length === 0 || isGenerating(currConvId ?? '')) + return; + textarea.setValue(''); scrollToBottom(false); setCurrNodeId(-1); // get the last message node @@ -146,13 +143,13 @@ export default function ChatScreen() { !(await sendMessage( currConvId, lastMsgNodeId, - inputMsg, + lastInpMsg, currExtra, onChunk )) ) { // restore the input message if failed - setInputMsg(lastInpMsg); + textarea.setValue(lastInpMsg); } // OK clearExtraContext(); @@ -195,16 +192,13 @@ export default function ChatScreen() { // send the prefilled message if needed sendNewMessage(); } else { - // otherwise, focus on the input and move the cursor to the end - if (inputRef.current) { - inputRef.current.focus(); - inputRef.current.selectionStart = inputRef.current.value.length; - } + // otherwise, focus on the input + textarea.focus(); } prefilledMsg.clear(); // no need to keep track of sendNewMessage // eslint-disable-next-line react-hooks/exhaustive-deps - }, [inputRef]); + }, [textarea.ref]); // due to some timing issues of StorageUtils.appendMsg(), we need to make sure the pendingMsg is not duplicated upon rendering (i.e. appears once in the saved conversation and once in the pendingMsg) const pendingMsgDisplay: MessageDisplay[] = @@ -258,9 +252,7 @@ export default function ChatScreen() {