14 changes: 10 additions & 4 deletions ci/run.sh
@@ -826,16 +826,20 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
fi

ret=0

test $ret -eq 0 && gg_run ctest_debug
if [ -z ${GG_BUILD_SYCL} ]; then
# SYCL build breaks with debug build flags
test $ret -eq 0 && gg_run ctest_debug
fi
test $ret -eq 0 && gg_run ctest_release

if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run embd_bge_small
test $ret -eq 0 && gg_run rerank_tiny

if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
test $ret -eq 0 && gg_run test_scripts_debug
if [ -z ${GG_BUILD_SYCL} ]; then
test $ret -eq 0 && gg_run test_scripts_debug
fi
test $ret -eq 0 && gg_run test_scripts_release
fi

@@ -846,7 +850,9 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run pythia_2_8b
#test $ret -eq 0 && gg_run open_llama_7b_v2
fi
test $ret -eq 0 && gg_run ctest_with_model_debug
if [ -z ${GG_BUILD_SYCL} ]; then
test $ret -eq 0 && gg_run ctest_with_model_debug
fi
test $ret -eq 0 && gg_run ctest_with_model_release
fi
fi
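
For context, these guards change what a CI invocation runs: with `GG_BUILD_SYCL` set, every debug-flag run is now skipped, since the SYCL build breaks with debug build flags. A minimal usage sketch follows; the two-argument invocation matches the repository's CI convention, and the paths are placeholders:

```bash
# Sketch: GG_BUILD_SYCL=1 now skips ctest_debug, test_scripts_debug and
# ctest_with_model_debug; the release variants still run.
mkdir -p tmp/results tmp/mnt
GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt   # release-only runs
bash ./ci/run.sh ./tmp/results ./tmp/mnt                   # debug + release runs
```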
3 changes: 3 additions & 0 deletions convert_hf_to_gguf.py
@@ -705,6 +705,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
# ref: https://huggingface.co/Xenova/gpt-4o
res = "gpt-4o"
if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f":
# ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k
res = "superbpe"

if res is None:
logger.warning("\n")
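For reference, `chkhsh` here is a SHA-256 fingerprint of how a tokenizer encodes a fixed probe string; the new branch matches the hash computed for the OLMo2 SuperBPE tokenizer. A minimal sketch of the mechanism, where `chktxt` stands in for the long probe constant defined in the conversion scripts and `transformers` is assumed to be installed:

```python
# Sketch: fingerprinting a tokenizer for get_vocab_base_pre().
from hashlib import sha256

from transformers import AutoTokenizer

chktxt = "..."  # placeholder for the real probe text used by the scripts

tokenizer = AutoTokenizer.from_pretrained("UW/OLMo2-8B-SuperBPE-t180k")
chkhsh = sha256(str(tokenizer.encode(chktxt)).encode()).hexdigest()

# With the real probe text, this should print the hash registered above:
# 7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f
print(chkhsh)
```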
1 change: 1 addition & 0 deletions convert_hf_to_gguf_update.py
@@ -110,6 +110,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
{"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
{"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
{"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
]


65 changes: 39 additions & 26 deletions docs/cuda-fedora.md → docs/backend/CUDA-FEDORA.md
@@ -14,9 +14,7 @@ In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox
- [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment)
- [Installing Essential Development Tools](#installing-essential-development-tools)
- [Adding the CUDA Repository](#adding-the-cuda-repository)
- [Installing `nvidia-driver-libs`](#installing-nvidia-driver-libs)
- [Manually Resolving Package Conflicts](#manually-resolving-package-conflicts)
- [Finalizing the Installation of `nvidia-driver-libs`](#finalizing-the-installation-of-nvidia-driver-libs)
- [Installing Nvidia Driver Libraries](#installing-nvidia-driver-libraries)
- [Installing the CUDA Meta-Package](#installing-the-cuda-meta-package)
- [Configuring the Environment](#configuring-the-environment)
- [Verifying the Installation](#verifying-the-installation)
@@ -67,7 +65,7 @@ This guide focuses on Fedora hosts, but with small adjustments, it can work for
sudo dnf distro-sync
```

2. **Install the Default Text Editor (Optional):**
2. **Install Vim, the Default Text Editor (Optional):**

```bash
sudo dnf install vim-default-editor --allowerasing
@@ -97,60 +95,75 @@ After adding the repository, synchronize the package manager again:
sudo dnf distro-sync
```

## Installing `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
## Installing Nvidia Driver Libraries

We need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go).
First, we need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go):

```bash
ls -la /usr/lib64/libcuda.so.1
```

**Explanation:**
### If *`libcuda.so.1`* is missing:

```
ls: cannot access '/usr/lib64/libcuda.so.1': No such file or directory
```

- `nvidia-driver-libs` and `nvidia-driver-cuda-libs` contains necessary NVIDIA driver libraries required by CUDA,
on hosts with NVIDIA drivers installed the Fedora Container will supply the host libraries.
**Explanation:**
The host does not supply the CUDA drivers; **install them now:**

### Install Nvidia Driver Libraries on Guest (if `libcuda.so.1` was NOT found).
#### Install the Nvidia Driver Libraries on Guest:

```bash
sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
```

### Manually Updating the RPM database for host-supplied NVIDIA drivers (if `libcuda.so.1` was found).
### If *`libcuda.so.1`* exists:
```
lrwxrwxrwx. 1 root root 21 Mar 24 11:26 /usr/lib64/libcuda.so.1 -> libcuda.so.570.133.07
```

**Explanation:**
The host is supplying the CUDA drivers; **we need to update the guest RPM database accordingly:**

If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files.
#### Update the Toolbox RPM Database to include the Host-Supplied Libraries:

#### 1. Download `nvidia-driver-libs` and `nvidia-driver-cuda-libs` RPM's (with dependencies)
Note: we do not actually install the libraries; we only update the RPM database so that the guest system knows the host supplies them.

##### 1. Download the host-supplied `nvidia-` RPMs (with dependencies)

```bash
sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-libs nvidia-driver-cuda-libs
sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
```

#### 2. Update the RPM database to assume the installation of these packages.
##### 2. Update the RPM database to assume the installation of these packages.

```bash
sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*
```

**Note:**

- The `--justdb` option only updates the RPM database, without touching the filesystem.
- The `--justdb` option only updates the RPM database, without touching the filesystem elsewhere.

##### Check that the RPM Database has been correctly updated:

#### Finalizing the Installation of `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
**Note:** This is the same command as in *"Install the Nvidia Driver Libraries on Guest"* above, used when *`libcuda.so.1`* is missing.

After manually installing the dependencies, run:

```bash
sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
```

You should receive a message indicating the package is already installed:
*(this time it will not install anything, as the database thinks these packages are already installed)*

```
Updating and loading repositories:
Repositories loaded.
Package "nvidia-driver-libs-3:570.86.10-1.fc41.x86_64" is already installed.
Package "nvidia-driver-cuda-libs-3:570.86.10-1.fc41.x86_64" is already installed.
Package "nvidia-driver-cuda-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-driver-libs-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-driver-cuda-libs-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-persistenced-3:570.124.06-1.fc41.x86_64" is already installed.

Nothing to do.
```
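
As an extra check that the database entries landed, you can also query RPM directly (same package names as above):

```bash
# Sketch: confirm the host-supplied driver packages are now in the RPM database.
rpm -q nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
```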
@@ -207,9 +220,9 @@ You should see output similar to:
```
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Wed_Jan_15_19:20:09_PST_2025
Cuda compilation tools, release 12.8, V12.8.61
Build cuda_12.8.r12.8/compiler.35404655_0
Built on Fri_Feb_21_20:23:50_PST_2025
Cuda compilation tools, release 12.8, V12.8.93
Build cuda_12.8.r12.8/compiler.35583870_0
```

This output confirms that the CUDA compiler is accessible and indicates the installed version.
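
As a further smoke test, you can compile and run a trivial CUDA program; a minimal sketch, assuming a working driver (the file name is arbitrary):

```cpp
// hello.cu: build and run with `nvcc hello.cu -o hello && ./hello`
#include <cstdio>

__global__ void hello_kernel() {
    printf("Hello from CUDA thread %d\n", threadIdx.x);
}

int main() {
    hello_kernel<<<1, 4>>>();   // one block of four threads
    cudaDeviceSynchronize();    // wait for the kernel (and its printf) to flush
    return 0;
}
```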
10 changes: 6 additions & 4 deletions docs/build.md
@@ -132,12 +132,14 @@ You may find the official downloads here: [NVIDIA developer site](https://develo


#### Compile and run inside a Fedora Toolbox Container
We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
We also have a [guide](./backend/CUDA-FEDORA.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).

**Recommended for:**

- ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
- Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
- ***Necessary*** for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/), such as [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
- (there are no supported CUDA packages for these systems)
- ***Necessary*** for users whose host is not a [supported Nvidia CUDA release platform](https://developer.nvidia.com/cuda-downloads).
- (for example, you may have [Fedora 42 Beta](https://fedoramagazine.org/announcing-fedora-linux-42-beta/) as your host operating system)
- ***Convenient*** for those running [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde) who want to keep their host system clean.
- *Optional:* toolbox packages are also available for [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), and [Ubuntu](https://ubuntu.com/download)
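
A minimal end-to-end sketch of the toolbox flow (the distro/release values and build flags are illustrative; follow the linked guide for the actual CUDA setup inside the container):

```bash
# Sketch: create and enter a Fedora toolbox, then build llama.cpp with CUDA.
toolbox create --distro fedora --release f41 llama-cuda
toolbox enter llama-cuda
# ...inside the toolbox, after installing the CUDA toolkit per the guide:
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release
```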


4 changes: 2 additions & 2 deletions ggml/src/ggml-cuda/common.cuh
@@ -243,14 +243,14 @@ static bool fp16_mma_available(const int cc) {
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
return false;
#else
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ||
return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
}

// To be used for feature selection of external libraries, e.g. cuBLAS.
static bool fp16_mma_hardware_available(const int cc) {
return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA ||
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
}
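
The added parentheses do not change evaluation, since `&&` already binds tighter than `||` in C and C++; they only make the grouping explicit (and silence `-Wparentheses`-style warnings). A standalone illustration:

```cpp
#include <cassert>

int main() {
    // && binds tighter than ||, so both groupings always agree:
    for (int a = 0; a <= 1; ++a)
        for (int b = 0; b <= 1; ++b)
            for (int c = 0; c <= 1; ++c)
                assert((a && b || c) == ((a && b) || c));
    return 0;
}
```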

2 changes: 1 addition & 1 deletion ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1192,7 +1192,7 @@ static void ggml_cuda_op_mul_mat_cublas(

const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;

if (((cc >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc)) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) {
if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) {
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
if (src0->type != GGML_TYPE_F16) {
4 changes: 2 additions & 2 deletions ggml/src/ggml-cuda/mmq.cu
@@ -27,8 +27,8 @@ void ggml_cuda_op_mul_mat_q(
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
// Also its fixup needs to allocate a temporary buffer in the memory pool.
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA &&
GGML_CUDA_CC_IS_NVIDIA(cc) && src1_ncols == ne11;
const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};

switch (src0->type) {
6 changes: 3 additions & 3 deletions ggml/src/ggml-cuda/mmq.cuh
@@ -90,7 +90,7 @@ struct tile_x_sizes {

static int get_mmq_x_max_host(const int cc) {
return new_mma_available(cc) ? 128 :
ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc) ?
GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ?
#ifdef GGML_CUDA_FORCE_MMQ
128 : 64;
#else
@@ -124,7 +124,7 @@ static constexpr __device__ int get_mmq_x_max_device() {

static int get_mmq_y_host(const int cc) {
return GGML_CUDA_CC_IS_AMD(cc) ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) :
((ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc)) ? 128 : 64);
((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ? 128 : 64);
}

static constexpr __device__ int get_mmq_y_device() {
@@ -2832,7 +2832,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
const int mmq_x_max = get_mmq_x_max_host(cc);
const int mmq_y = get_mmq_y_host(cc);
const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc);
const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;

int mmq_x_best = 0;
int nparts_best = INT_MAX;