diff --git a/ci/run.sh b/ci/run.sh index 9fc19c89d80d2..edf5803531466 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -826,8 +826,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then fi ret=0 - -test $ret -eq 0 && gg_run ctest_debug +if [ -z ${GG_BUILD_SYCL} ]; then + # SYCL build breaks with debug build flags + test $ret -eq 0 && gg_run ctest_debug +fi test $ret -eq 0 && gg_run ctest_release if [ -z ${GG_BUILD_LOW_PERF} ]; then @@ -835,7 +837,9 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then test $ret -eq 0 && gg_run rerank_tiny if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then - test $ret -eq 0 && gg_run test_scripts_debug + if [ -z ${GG_BUILD_SYCL} ]; then + test $ret -eq 0 && gg_run test_scripts_debug + fi test $ret -eq 0 && gg_run test_scripts_release fi @@ -846,7 +850,9 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then test $ret -eq 0 && gg_run pythia_2_8b #test $ret -eq 0 && gg_run open_llama_7b_v2 fi - test $ret -eq 0 && gg_run ctest_with_model_debug + if [ -z ${GG_BUILD_SYCL} ]; then + test $ret -eq 0 && gg_run ctest_with_model_debug + fi test $ret -eq 0 && gg_run ctest_with_model_release fi fi diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d21edce16b71e..d9fa57027b771 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -705,6 +705,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e": # ref: https://huggingface.co/Xenova/gpt-4o res = "gpt-4o" + if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f": + # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k + res = "superbpe" if res is None: logger.warning("\n") diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 07d3ce0e4eb78..ca90cf592932b 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -110,6 +110,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"}, {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"}, {"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", }, + {"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", }, ] diff --git a/docs/cuda-fedora.md b/docs/backend/CUDA-FEDORA.md similarity index 78% rename from docs/cuda-fedora.md rename to docs/backend/CUDA-FEDORA.md index 75cd2b499d086..1508faf776d28 100644 --- a/docs/cuda-fedora.md +++ b/docs/backend/CUDA-FEDORA.md @@ -14,9 +14,7 @@ In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox - [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment) - [Installing Essential Development Tools](#installing-essential-development-tools) - [Adding the CUDA Repository](#adding-the-cuda-repository) -- [Installing `nvidia-driver-libs`](#installing-nvidia-driver-libs) -- [Manually Resolving Package Conflicts](#manually-resolving-package-conflicts) -- [Finalizing the Installation of `nvidia-driver-libs`](#finalizing-the-installation-of-nvidia-driver-libs) +- [Installing Nvidia Driver Libraries](#installing-nvidia-driver-libraries) - [Installing the CUDA Meta-Package](#installing-the-cuda-meta-package) - [Configuring the Environment](#configuring-the-environment) - [Verifying the Installation](#verifying-the-installation) @@ -67,7 +65,7 @@ This guide focuses on Fedora hosts, but with small 
adjustments, it can work for

    sudo dnf distro-sync
    ```

-2. **Install the Default Text Editor (Optional):**
+2. **Install Vim, the default text editor (Optional):**

    ```bash
    sudo dnf install vim-default-editor --allowerasing
@@ -97,36 +95,48 @@ After adding the repository, synchronize the package manager again:
 sudo dnf distro-sync
 ```

-## Installing `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
+## Installing Nvidia Driver Libraries

-We need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go).
+First, we need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go):

 ```bash
 ls -la /usr/lib64/libcuda.so.1
 ```

-**Explanation:**
+### If *`libcuda.so.1`* is missing:
+
+```
+ls: cannot access '/usr/lib64/libcuda.so.1': No such file or directory
+```

-- `nvidia-driver-libs` and `nvidia-driver-cuda-libs` contains necessary NVIDIA driver libraries required by CUDA,
-  on hosts with NVIDIA drivers installed the Fedora Container will supply the host libraries.
+**Explanation:**
+The host does not supply the CUDA drivers, **install them now:**

-### Install Nvidia Driver Libraries on Guest (if `libcuda.so.1` was NOT found).
+#### Install the Nvidia Driver Libraries on Guest:

 ```bash
-sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
+sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
 ```

-### Manually Updating the RPM database for host-supplied NVIDIA drivers (if `libcuda.so.1` was found).
+### If *`libcuda.so.1`* exists:
+```
+lrwxrwxrwx. 1 root root 21 Mar 24 11:26 /usr/lib64/libcuda.so.1 -> libcuda.so.570.133.07
+```
+
+**Explanation:**
+The host is supplying the CUDA drivers, **we need to update the guest RPM Database accordingly:**

-If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files.
+#### Update the Toolbox RPM Database to include the Host-Supplied Libraries:

-#### 1. Download `nvidia-driver-libs` and `nvidia-driver-cuda-libs` RPM's (with dependencies)
+Note: we do not actually install the libraries; we just update the RPM database so that the guest system knows they are supplied by the host.
+
+##### 1. Download the host-supplied `nvidia-` packages as RPMs (with dependencies)

 ```bash
-sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-libs nvidia-driver-cuda-libs
+sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
 ```

-#### 2. Update the RPM database to assume the installation of these packages.
+##### 2. Update the RPM database to assume the installation of these packages.

 ```bash
 sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*
@@ -134,23 +144,26 @@ sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*

 **Note:**

-- The `--justdb` option only updates the RPM database, without touching the filesystem.
+- The `--justdb` option only updates the RPM database, without touching the rest of the filesystem.
+
+##### Check that the RPM Database has been correctly updated:

-#### Finalizing the Installation of `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
+**Note:** This is the same command as in the *"Install the Nvidia Driver Libraries on Guest"* section above, used when *`libcuda.so.1`* was missing. 
-After manually installing the dependencies, run:

 ```bash
-sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
+sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
 ```

-You should receive a message indicating the package is already installed:
+*(this time it will not install anything, as the database thinks that these packages are already installed)*

 ```
 Updating and loading repositories:
 Repositories loaded.
-Package "nvidia-driver-libs-3:570.86.10-1.fc41.x86_64" is already installed.
-Package "nvidia-driver-cuda-libs-3:570.86.10-1.fc41.x86_64" is already installed.
+Package "nvidia-driver-cuda-3:570.124.06-1.fc41.x86_64" is already installed.
+Package "nvidia-driver-libs-3:570.124.06-1.fc41.x86_64" is already installed.
+Package "nvidia-driver-cuda-libs-3:570.124.06-1.fc41.x86_64" is already installed.
+Package "nvidia-persistenced-3:570.124.06-1.fc41.x86_64" is already installed.
 Nothing to do.
 ```
@@ -207,9 +220,9 @@ You should see output similar to:
 ```
 nvcc: NVIDIA (R) Cuda compiler driver
 Copyright (c) 2005-2025 NVIDIA Corporation
-Built on Wed_Jan_15_19:20:09_PST_2025
-Cuda compilation tools, release 12.8, V12.8.61
-Build cuda_12.8.r12.8/compiler.35404655_0
+Built on Fri_Feb_21_20:23:50_PST_2025
+Cuda compilation tools, release 12.8, V12.8.93
+Build cuda_12.8.r12.8/compiler.35583870_0
 ```

 This output confirms that the CUDA compiler is accessible and indicates the installed version.
diff --git a/docs/build.md b/docs/build.md
index 2e3975c145360..7b5503a1fee53 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -132,12 +132,14 @@ You may find the official downloads here: [NVIDIA developer site](https://develo

 #### Compile and run inside a Fedora Toolbox Container

-We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
+We also have a [guide](./backend/CUDA-FEDORA.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).

 **Recommended for:**
-
-- ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
-- Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
+- ***Necessary*** for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
+  - (there are no supported CUDA packages for these systems)
+- ***Necessary*** for users whose host is not a [Supported Nvidia CUDA Release Platform](https://developer.nvidia.com/cuda-downloads).
+  - (for example, you may have [Fedora 42 Beta](https://fedoramagazine.org/announcing-fedora-linux-42-beta/) as your host operating system)
+- ***Convenient*** for those running [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde) who want to keep their host system clean. 
- *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 6b5cd32a4a541..954ff5f16924b 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -243,14 +243,14 @@ static bool fp16_mma_available(const int cc) { #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) return false; #else - return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA || + return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc); #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) } // To be used for feature selection of external libraries, e.g. cuBLAS. static bool fp16_mma_hardware_available(const int cc) { - return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA || + return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc); } diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 10d461b773a6a..6dd5dcb85e15c 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1192,7 +1192,7 @@ static void ggml_cuda_op_mul_mat_cublas( const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT; - if (((cc >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc)) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) { + if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) { // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 ggml_cuda_pool_alloc src0_as_f16(ctx.pool(id)); if (src0->type != GGML_TYPE_F16) { diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 510c1e9b2aa9e..2c19485d51a92 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -27,8 +27,8 @@ void ggml_cuda_op_mul_mat_q( // The stream-k decomposition is only faster for recent NVIDIA GPUs. // Also its fixup needs to allocate a temporary buffer in the memory pool. // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. - const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && - GGML_CUDA_CC_IS_NVIDIA(cc) && src1_ncols == ne11; + const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && + ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11; const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k}; switch (src0->type) { diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 4ea8b8d4b1290..ee01154254e3d 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -90,7 +90,7 @@ struct tile_x_sizes { static int get_mmq_x_max_host(const int cc) { return new_mma_available(cc) ? 128 : - ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc) ? + GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ? 
#ifdef GGML_CUDA_FORCE_MMQ 128 : 64; #else @@ -124,7 +124,7 @@ static constexpr __device__ int get_mmq_x_max_device() { static int get_mmq_y_host(const int cc) { return GGML_CUDA_CC_IS_AMD(cc) ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) : - ((ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc)) ? 128 : 64); + ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ? 128 : 64); } static constexpr __device__ int get_mmq_y_device() { @@ -2832,7 +2832,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda const int mmq_x_max = get_mmq_x_max_host(cc); const int mmq_y = get_mmq_y_host(cc); const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y; - const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc); + const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA; int mmq_x_best = 0; int nparts_best = INT_MAX; diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index 59a208fe9c7e4..7efb51c8ec30a 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -25,124 +25,46 @@ endif () if (GGML_OPENCL_EMBED_KERNELS) add_compile_definitions(GGML_OPENCL_EMBED_KERNELS) - set(OPENCL_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl.cl.h") - set(OPENCL_MM_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mm.cl.h") - set(OPENCL_CVT_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_cvt.cl.h") + set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py") + file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/autogenerated") - set(OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle.cl.h") - set(OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle_general.cl.h") - set(OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h") - set(OPENCL_TRANSPOSE_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_16.cl.h") - set(OPENCL_TRANSPOSE_32_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32.cl.h") - set(OPENCL_TRANSPOSE_32_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32_16.cl.h") - - set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py") - file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated") - - include_directories("${CMAKE_BINARY_DIR}/autogenerated") - - # Python must be accessible from command line - add_custom_command( - OUTPUT ${OPENCL_CL_SOURCE_EMBED} - COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} - ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl.cl - ${OPENCL_CL_SOURCE_EMBED} - DEPENDS kernels/ggml-opencl.cl ${EMBED_KERNEL_SCRIPT} - COMMENT "Generate ggml-opencl.cl.h" - ) - - add_custom_command( - OUTPUT ${OPENCL_MM_CL_SOURCE_EMBED} - COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} - ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mm.cl - ${OPENCL_MM_CL_SOURCE_EMBED} - DEPENDS kernels/ggml-opencl_mm.cl ${EMBED_KERNEL_SCRIPT} - COMMENT "Generate ggml-opencl_mm.cl.h" - ) - - add_custom_command( - OUTPUT ${OPENCL_CVT_CL_SOURCE_EMBED} - COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} - ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_cvt.cl - ${OPENCL_CVT_CL_SOURCE_EMBED} - DEPENDS kernels/ggml-opencl_cvt.cl 
${EMBED_KERNEL_SCRIPT} - COMMENT "Generate ggml-opencl_cvt.cl.h" - ) - - add_custom_command( - OUTPUT ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED} - COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} - ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle.cl - ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED} - DEPENDS kernels/ggml-opencl_gemv_noshuffle.cl ${EMBED_KERNEL_SCRIPT} - COMMENT "Generate ggml-opencl_gemv_noshuffle.cl.h" - ) - - add_custom_command( - OUTPUT ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED} - COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} - ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle_general.cl - ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED} - DEPENDS kernels/ggml-opencl_gemv_noshuffle_general.cl ${EMBED_KERNEL_SCRIPT} - COMMENT "Generate ggml-opencl_gemv_noshuffle_general.cl.h" - ) - - add_custom_command( - OUTPUT ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED} - COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} - ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl - ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED} - DEPENDS kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${EMBED_KERNEL_SCRIPT} - COMMENT "Generate ggml-opencl_mul_mat_Ab_Bi_8x4.cl.cl.h" - ) - - add_custom_command( - OUTPUT ${OPENCL_TRANSPOSE_16_SOURCE_EMBED} - COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} - ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_16.cl - ${OPENCL_TRANSPOSE_16_SOURCE_EMBED} - DEPENDS kernels/ggml-opencl_transpose_16.cl ${EMBED_KERNEL_SCRIPT} - COMMENT "Generate ggml-opencl_transpose_16.cl.h" - ) - - add_custom_command( - OUTPUT ${OPENCL_TRANSPOSE_32_SOURCE_EMBED} - COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} - ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32.cl - ${OPENCL_TRANSPOSE_32_SOURCE_EMBED} - DEPENDS kernels/ggml-opencl_transpose_32.cl ${EMBED_KERNEL_SCRIPT} - COMMENT "Generate ggml-opencl_transpose_32.cl.h" - ) - - add_custom_command( - OUTPUT ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED} - COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} - ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32_16.cl - ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED} - DEPENDS kernels/ggml-opencl_transpose_32_16.cl ${EMBED_KERNEL_SCRIPT} - COMMENT "Generate ggml-opencl_transpose_32_16.cl.h" - ) - - target_sources(${TARGET_NAME} PRIVATE - ${OPENCL_CL_SOURCE_EMBED} - ${OPENCL_MM_CL_SOURCE_EMBED} - ${OPENCL_CVT_CL_SOURCE_EMBED} - ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED} - ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED} - ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED} - ${OPENCL_TRANSPOSE_16_SOURCE_EMBED} - ${OPENCL_TRANSPOSE_32_SOURCE_EMBED} - ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}) -else () - # copy ggml-opencl.cl to bin directory - configure_file(kernels/ggml-opencl.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl.cl COPYONLY) - configure_file(kernels/ggml-opencl_mm.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mm.cl COPYONLY) - configure_file(kernels/ggml-opencl_cvt.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_cvt.cl COPYONLY) - - configure_file(kernels/ggml-opencl_gemv_noshuffle.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle.cl COPYONLY) - configure_file(kernels/ggml-opencl_gemv_noshuffle_general.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle_general.cl COPYONLY) - configure_file(kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mul_mat_Ab_Bi_8x4.cl COPYONLY) - configure_file(kernels/ggml-opencl_transpose_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_16.cl 
COPYONLY) - configure_file(kernels/ggml-opencl_transpose_32.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32.cl COPYONLY) - configure_file(kernels/ggml-opencl_transpose_32_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32_16.cl COPYONLY) + target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/autogenerated") endif () + +function(ggml_opencl_add_kernel KNAME) + set(KERN_HDR ${CMAKE_CURRENT_BINARY_DIR}/autogenerated/${KNAME}.cl.h) + set(KERN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/kernels/${KNAME}.cl) + + if (GGML_OPENCL_EMBED_KERNELS) + message(STATUS "opencl: embedding kernel ${KNAME}") + + # Python must be accessible from command line + add_custom_command( + OUTPUT ${KERN_HDR} + COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} ${KERN_SRC} ${KERN_HDR} + DEPENDS ${KERN_SRC} ${EMBED_KERNEL_SCRIPT} + COMMENT "Generate ${KERN_HDR}" + ) + + target_sources(${TARGET_NAME} PRIVATE ${KERN_HDR}) + else () + message(STATUS "opencl: adding kernel ${KNAME}") + configure_file(${KERN_SRC} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${KNAME}.cl COPYONLY) + endif () +endfunction() + +set(GGML_OPENCL_KERNELS + ggml-opencl + ggml-opencl_mm + ggml-opencl_cvt + ggml-opencl_gemv_noshuffle + ggml-opencl_gemv_noshuffle_general + ggml-opencl_mul_mat_Ab_Bi_8x4 + ggml-opencl_transpose_16 + ggml-opencl_transpose_32 + ggml-opencl_transpose_32_16 +) + +foreach (K ${GGML_OPENCL_KERNELS}) + ggml_opencl_add_kernel(${K}) +endforeach() diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp index 31ecd9f81a85f..775b48cd05e1a 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp @@ -105,6 +105,16 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { int unroll_count = 4; uint unrolled_iters = num_iters & ~(unroll_count - 1); +#if K_PER_ITER == 2 + // If the K dimension is odd, we need lastiter==true on the last iteration + // so OOB is computed correctly. Skip some unrolling to make that happen. 
+ if ((p.ncols & 1) != 0 && + unrolled_iters == num_iters && + unrolled_iters > 0) { + unrolled_iters -= unroll_count; + } +#endif + uint i = 0; while (i < unrolled_iters) { // Manually partially unroll the loop @@ -113,8 +123,18 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { i++; } } + unroll_count = 2; unrolled_iters = num_iters & ~(unroll_count - 1); + +#if K_PER_ITER == 2 + if ((p.ncols & 1) != 0 && + unrolled_iters == num_iters && + unrolled_iters > 0) { + unrolled_iters -= unroll_count; + } +#endif + while (i < unrolled_iters) { // Manually partially unroll the loop [[unroll]] for (uint k = 0; k < unroll_count; ++k) { diff --git a/include/llama.h b/include/llama.h index 6a44be404d914..25a9f82784df2 100644 --- a/include/llama.h +++ b/include/llama.h @@ -107,6 +107,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_MINERVA = 27, LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28, LLAMA_VOCAB_PRE_TYPE_GPT4O = 29, + LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30, }; enum llama_rope_type { diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 3970b7485fe9d..9da97f1bc5057 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -476,7 +476,7 @@ struct llama_mlock::impl { char* errmsg = std::strerror(errno); bool suggest = (errno == ENOMEM); -#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) +#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX) // visionOS/tvOS dont't support RLIMIT_MEMLOCK // Skip resource limit checks on visionOS/tvOS suggest = false; diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index a708d8b88c8c7..2ddc8108f4cb4 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -400,6 +400,12 @@ struct llm_tokenizer_bpe : llm_tokenizer { "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }; break; + case LLAMA_VOCAB_PRE_TYPE_SUPERBPE: + regex_exprs = { + "\\p{N}+", + "(?=(\\d{3})+(?!\\d))", + }; + break; default: // default regex for BPE tokenization pre-processing regex_exprs = { @@ -1604,6 +1610,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "gpt-4o") { pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O; clean_spaces = false; + } else if ( + tokenizer_pre == "superbpe") { + pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE; + clean_spaces = false; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index ebc32d7919fe4..28f860a7f2969 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4204,6 +4204,8 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 64, { 8, 1}, {4, 1})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3})); for (auto bs : {1,2,4,8}) { for (auto nr : {1,4}) {