diff --git a/.devops/cloud-v-pipeline b/.devops/cloud-v-pipeline index af8c0cea6155c..8ba5f117e5f08 100644 --- a/.devops/cloud-v-pipeline +++ b/.devops/cloud-v-pipeline @@ -7,16 +7,16 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto checkout scm // Clone the repo on Runner } } - stage('Compiling llama.cpp'){ + stage('Compiling jarvis.cpp'){ sh'''#!/bin/bash - make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V + make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling jarvis for RISC-V ''' } - stage('Running llama.cpp'){ + stage('Running jarvis.cpp'){ sh'''#!/bin/bash module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc - qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64 - cat llama_log.txt # Printing results + qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./jarvis-cli -m /home/alitariq/codejarvis-7b.Q4_K_M.gguf -p "Anything" -n 9 > jarvis_log.txt # Running jarvis.cpp on vector qemu-riscv64 + cat jarvis_log.txt # Printing results ''' } } diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile index d5acd35e204d3..16d3d6b947eeb 100644 --- a/.devops/full-cuda.Dockerfile +++ b/.devops/full-cuda.Dockerfile @@ -26,7 +26,7 @@ COPY . . RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ fi && \ - cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake -B build -DGGML_CUDA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release -j$(nproc) && \ cp build/bin/* . diff --git a/.devops/full-musa.Dockerfile b/.devops/full-musa.Dockerfile index 34ba856d3d1ca..51b6061020b5e 100644 --- a/.devops/full-musa.Dockerfile +++ b/.devops/full-musa.Dockerfile @@ -19,7 +19,7 @@ WORKDIR /app COPY . . -RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ +RUN cmake -B build -DGGML_MUSA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release -j$(nproc) && \ cp build/bin/* . diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile index df496bcd2b7ee..620d7d89cf40d 100644 --- a/.devops/full-rocm.Dockerfile +++ b/.devops/full-rocm.Dockerfile @@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. -# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878 # This is mostly tied to rocBLAS supported archs. ARG ROCM_DOCKER_ARCH="\ gfx803 \ @@ -41,7 +41,7 @@ ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ # Enable cURL -ENV LLAMA_CURL=1 +ENV JARVIS_CURL=1 RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile index 2a06f82b738ae..62ee6f5069f00 100644 --- a/.devops/full.Dockerfile +++ b/.devops/full.Dockerfile @@ -15,7 +15,7 @@ WORKDIR /app COPY . . 
-ENV LLAMA_CURL=1 +ENV JARVIS_CURL=1 RUN make -j$(nproc) diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/jarvis-cli-cann.Dockerfile similarity index 93% rename from .devops/llama-cli-cann.Dockerfile rename to .devops/jarvis-cli-cann.Dockerfile index db5ba2f25ea67..99c83c0b15dfd 100644 --- a/.devops/llama-cli-cann.Dockerfile +++ b/.devops/jarvis-cli-cann.Dockerfile @@ -23,11 +23,11 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH RUN echo "Building with static libs" && \ source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \ cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \ - cmake --build build --config Release --target llama-cli + cmake --build build --config Release --target jarvis-cli # TODO: use image with NNRT FROM cosdt/cann:$ASCEND_VERSION AS runtime -COPY --from=build /app/build/bin/llama-cli /llama-cli +COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli ENV LC_ALL=C.utf8 @@ -41,4 +41,4 @@ ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME} -ENTRYPOINT ["/llama-cli" ] +ENTRYPOINT ["/jarvis-cli" ] diff --git a/.devops/llama-cli-cuda.Dockerfile b/.devops/jarvis-cli-cuda.Dockerfile similarity index 82% rename from .devops/llama-cli-cuda.Dockerfile rename to .devops/jarvis-cli-cuda.Dockerfile index b75163b94435a..43f8b2cb9a471 100644 --- a/.devops/llama-cli-cuda.Dockerfile +++ b/.devops/jarvis-cli-cuda.Dockerfile @@ -23,7 +23,7 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ fi && \ cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-cli -j$(nproc) + cmake --build build --config Release --target jarvis-cli -j$(nproc) FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime @@ -31,7 +31,7 @@ RUN apt-get update && \ apt-get install -y libgomp1 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so -COPY --from=build /app/build/src/libllama.so /libllama.so -COPY --from=build /app/build/bin/llama-cli /llama-cli +COPY --from=build /app/build/src/libjarvis.so /libjarvis.so +COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli -ENTRYPOINT [ "/llama-cli" ] +ENTRYPOINT [ "/jarvis-cli" ] diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/jarvis-cli-intel.Dockerfile similarity index 80% rename from .devops/llama-cli-intel.Dockerfile rename to .devops/jarvis-cli-intel.Dockerfile index 79dba06a77d6e..cc3d64afef9df 100644 --- a/.devops/llama-cli-intel.Dockerfile +++ b/.devops/jarvis-cli-intel.Dockerfile @@ -17,12 +17,12 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ echo "Building with static libs" && \ cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \ ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \ - cmake --build build --config Release --target llama-cli + cmake --build build --config Release --target jarvis-cli FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime -COPY --from=build /app/build/bin/llama-cli /llama-cli +COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli ENV LC_ALL=C.utf8 -ENTRYPOINT [ "/llama-cli" ] +ENTRYPOINT [ "/jarvis-cli" ] diff --git a/.devops/llama-cli-musa.Dockerfile b/.devops/jarvis-cli-musa.Dockerfile similarity index 77% rename from .devops/llama-cli-musa.Dockerfile rename to .devops/jarvis-cli-musa.Dockerfile index b5696794f1a56..69d13cc79cada 100644 --- a/.devops/llama-cli-musa.Dockerfile +++ 
b/.devops/jarvis-cli-musa.Dockerfile @@ -16,7 +16,7 @@ WORKDIR /app COPY . . RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-cli -j$(nproc) + cmake --build build --config Release --target jarvis-cli -j$(nproc) FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime @@ -24,7 +24,7 @@ RUN apt-get update && \ apt-get install -y libgomp1 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so -COPY --from=build /app/build/src/libllama.so /libllama.so -COPY --from=build /app/build/bin/llama-cli /llama-cli +COPY --from=build /app/build/src/libjarvis.so /libjarvis.so +COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli -ENTRYPOINT [ "/llama-cli" ] +ENTRYPOINT [ "/jarvis-cli" ] diff --git a/.devops/llama-cli-rocm.Dockerfile b/.devops/jarvis-cli-rocm.Dockerfile similarity index 85% rename from .devops/llama-cli-rocm.Dockerfile rename to .devops/jarvis-cli-rocm.Dockerfile index e60c747bdbf11..2eeb794358221 100644 --- a/.devops/llama-cli-rocm.Dockerfile +++ b/.devops/jarvis-cli-rocm.Dockerfile @@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. -# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878 # This is mostly tied to rocBLAS supported archs. ARG ROCM_DOCKER_ARCH="\ gfx803 \ @@ -40,6 +40,6 @@ ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ -RUN make -j$(nproc) llama-cli +RUN make -j$(nproc) jarvis-cli -ENTRYPOINT [ "/app/llama-cli" ] +ENTRYPOINT [ "/app/jarvis-cli" ] diff --git a/.devops/llama-cli-vulkan.Dockerfile b/.devops/jarvis-cli-vulkan.Dockerfile similarity index 80% rename from .devops/llama-cli-vulkan.Dockerfile rename to .devops/jarvis-cli-vulkan.Dockerfile index 9b0dad8bf7a13..57ebafa9bed2f 100644 --- a/.devops/llama-cli-vulkan.Dockerfile +++ b/.devops/jarvis-cli-vulkan.Dockerfile @@ -15,13 +15,13 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key WORKDIR /app COPY . . RUN cmake -B build -DGGML_VULKAN=1 && \ - cmake --build build --config Release --target llama-cli + cmake --build build --config Release --target jarvis-cli # Clean up WORKDIR / -RUN cp /app/build/bin/llama-cli /llama-cli && \ +RUN cp /app/build/bin/jarvis-cli /jarvis-cli && \ rm -rf /app ENV LC_ALL=C.utf8 -ENTRYPOINT [ "/llama-cli" ] +ENTRYPOINT [ "/jarvis-cli" ] diff --git a/.devops/llama-cli.Dockerfile b/.devops/jarvis-cli.Dockerfile similarity index 72% rename from .devops/llama-cli.Dockerfile rename to .devops/jarvis-cli.Dockerfile index 7f741aa46ecf0..6a3137f281679 100644 --- a/.devops/llama-cli.Dockerfile +++ b/.devops/jarvis-cli.Dockerfile @@ -9,15 +9,15 @@ WORKDIR /app COPY . . 
-RUN make -j$(nproc) llama-cli +RUN make -j$(nproc) jarvis-cli FROM ubuntu:$UBUNTU_VERSION AS runtime RUN apt-get update && \ apt-get install -y libgomp1 -COPY --from=build /app/llama-cli /llama-cli +COPY --from=build /app/jarvis-cli /jarvis-cli ENV LC_ALL=C.utf8 -ENTRYPOINT [ "/llama-cli" ] +ENTRYPOINT [ "/jarvis-cli" ] diff --git a/.devops/llama-cpp-cuda.srpm.spec b/.devops/jarvis-cpp-cuda.srpm.spec similarity index 60% rename from .devops/llama-cpp-cuda.srpm.spec rename to .devops/jarvis-cpp-cuda.srpm.spec index 7425d3a9d7a40..c806963f9eb7a 100644 --- a/.devops/llama-cpp-cuda.srpm.spec +++ b/.devops/jarvis-cpp-cuda.srpm.spec @@ -3,7 +3,7 @@ # Built and maintained by John Boero - boeroboy@gmail.com # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal -# Notes for llama.cpp: +# Notes for jarvis.cpp: # 1. Tags are currently based on hash - which will not sort asciibetically. # We need to declare standard versioning if people want to sort latest releases. # 2. Builds for CUDA/OpenCL support are separate, with different depenedencies. @@ -12,44 +12,44 @@ # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries. # It is up to the user to install the correct vendor-specific support. -Name: llama.cpp-cuda +Name: jarvis.cpp-cuda Version: %( date "+%%Y%%m%%d" ) Release: 1%{?dist} -Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL) +Summary: CPU Inference of JARVIS model in pure C/C++ (no CUDA/OpenCL) License: MIT -Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz +Source0: https://github.com/ggerganov/jarvis.cpp/archive/refs/heads/master.tar.gz BuildRequires: coreutils make gcc-c++ git cuda-toolkit Requires: cuda-toolkit -URL: https://github.com/ggerganov/llama.cpp +URL: https://github.com/ggerganov/jarvis.cpp %define debug_package %{nil} %define source_date_epoch_from_changelog 0 %description -CPU inference for Meta's Lllama2 models using default options. +CPU inference for Meta's Ljarvis2 models using default options. %prep -%setup -n llama.cpp-master +%setup -n jarvis.cpp-master %build make -j GGML_CUDA=1 %install mkdir -p %{buildroot}%{_bindir}/ -cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli -cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server -cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple +cp -p jarvis-cli %{buildroot}%{_bindir}/jarvis-cuda-cli +cp -p jarvis-server %{buildroot}%{_bindir}/jarvis-cuda-server +cp -p jarvis-simple %{buildroot}%{_bindir}/jarvis-cuda-simple mkdir -p %{buildroot}/usr/lib/systemd/system -%{__cat} < %{buildroot}/usr/lib/systemd/system/llamacuda.service +%{__cat} < %{buildroot}/usr/lib/systemd/system/jarviscuda.service [Unit] -Description=Llama.cpp server, CPU only (no GPU support in this build). +Description=Jarvis.cpp server, CPU only (no GPU support in this build). 
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target [Service] Type=simple -EnvironmentFile=/etc/sysconfig/llama -ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS +EnvironmentFile=/etc/sysconfig/jarvis +ExecStart=/usr/bin/jarvis-cuda-server $JARVIS_ARGS ExecReload=/bin/kill -s HUP $MAINPID Restart=never @@ -58,8 +58,8 @@ WantedBy=default.target EOF mkdir -p %{buildroot}/etc/sysconfig -%{__cat} < %{buildroot}/etc/sysconfig/llama -LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin" +%{__cat} < %{buildroot}/etc/sysconfig/jarvis +JARVIS_ARGS="-m /opt/jarvis2/ggml-model-f32.bin" EOF %clean @@ -67,11 +67,11 @@ rm -rf %{buildroot} rm -rf %{_builddir}/* %files -%{_bindir}/llama-cuda-cli -%{_bindir}/llama-cuda-server -%{_bindir}/llama-cuda-simple -/usr/lib/systemd/system/llamacuda.service -%config /etc/sysconfig/llama +%{_bindir}/jarvis-cuda-cli +%{_bindir}/jarvis-cuda-server +%{_bindir}/jarvis-cuda-simple +/usr/lib/systemd/system/jarviscuda.service +%config /etc/sysconfig/jarvis %pre diff --git a/.devops/llama-cpp.srpm.spec b/.devops/jarvis-cpp.srpm.spec similarity index 63% rename from .devops/llama-cpp.srpm.spec rename to .devops/jarvis-cpp.srpm.spec index 4d5560089816c..243e932556298 100644 --- a/.devops/llama-cpp.srpm.spec +++ b/.devops/jarvis-cpp.srpm.spec @@ -3,7 +3,7 @@ # Built and maintained by John Boero - boeroboy@gmail.com # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal -# Notes for llama.cpp: +# Notes for jarvis.cpp: # 1. Tags are currently based on hash - which will not sort asciibetically. # We need to declare standard versioning if people want to sort latest releases. # In the meantime, YYYYMMDD format will be used. @@ -13,45 +13,45 @@ # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries. # It is up to the user to install the correct vendor-specific support. -Name: llama.cpp +Name: jarvis.cpp Version: %( date "+%%Y%%m%%d" ) Release: 1%{?dist} -Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL) +Summary: CPU Inference of JARVIS model in pure C/C++ (no CUDA/OpenCL) License: MIT -Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz +Source0: https://github.com/ggerganov/jarvis.cpp/archive/refs/heads/master.tar.gz BuildRequires: coreutils make gcc-c++ git libstdc++-devel Requires: libstdc++ -URL: https://github.com/ggerganov/llama.cpp +URL: https://github.com/ggerganov/jarvis.cpp %define debug_package %{nil} %define source_date_epoch_from_changelog 0 %description -CPU inference for Meta's Lllama2 models using default options. +CPU inference for Meta's Ljarvis2 models using default options. Models are not included in this package and must be downloaded separately. %prep -%setup -n llama.cpp-master +%setup -n jarvis.cpp-master %build make -j %install mkdir -p %{buildroot}%{_bindir}/ -cp -p llama-cli %{buildroot}%{_bindir}/llama-cli -cp -p llama-server %{buildroot}%{_bindir}/llama-server -cp -p llama-simple %{buildroot}%{_bindir}/llama-simple +cp -p jarvis-cli %{buildroot}%{_bindir}/jarvis-cli +cp -p jarvis-server %{buildroot}%{_bindir}/jarvis-server +cp -p jarvis-simple %{buildroot}%{_bindir}/jarvis-simple mkdir -p %{buildroot}/usr/lib/systemd/system -%{__cat} < %{buildroot}/usr/lib/systemd/system/llama.service +%{__cat} < %{buildroot}/usr/lib/systemd/system/jarvis.service [Unit] -Description=Llama.cpp server, CPU only (no GPU support in this build). +Description=Jarvis.cpp server, CPU only (no GPU support in this build). 
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target [Service] Type=simple -EnvironmentFile=/etc/sysconfig/llama -ExecStart=/usr/bin/llama-server $LLAMA_ARGS +EnvironmentFile=/etc/sysconfig/jarvis +ExecStart=/usr/bin/jarvis-server $JARVIS_ARGS ExecReload=/bin/kill -s HUP $MAINPID Restart=never @@ -60,8 +60,8 @@ WantedBy=default.target EOF mkdir -p %{buildroot}/etc/sysconfig -%{__cat} < %{buildroot}/etc/sysconfig/llama -LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin" +%{__cat} < %{buildroot}/etc/sysconfig/jarvis +JARVIS_ARGS="-m /opt/jarvis2/ggml-model-f32.bin" EOF %clean @@ -69,11 +69,11 @@ rm -rf %{buildroot} rm -rf %{_builddir}/* %files -%{_bindir}/llama-cli -%{_bindir}/llama-server -%{_bindir}/llama-simple -/usr/lib/systemd/system/llama.service -%config /etc/sysconfig/llama +%{_bindir}/jarvis-cli +%{_bindir}/jarvis-server +%{_bindir}/jarvis-simple +/usr/lib/systemd/system/jarvis.service +%config /etc/sysconfig/jarvis %pre diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/jarvis-server-cuda.Dockerfile similarity index 74% rename from .devops/llama-server-cuda.Dockerfile rename to .devops/jarvis-server-cuda.Dockerfile index a40e24205707f..435fe9e8d9bf9 100644 --- a/.devops/llama-server-cuda.Dockerfile +++ b/.devops/jarvis-server-cuda.Dockerfile @@ -22,8 +22,8 @@ COPY . . RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ fi && \ - cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-server -j$(nproc) + cmake -B build -DGGML_CUDA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . 
&& \ + cmake --build build --config Release --target jarvis-server -j$(nproc) FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime @@ -31,12 +31,12 @@ RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 curl COPY --from=build /app/build/ggml/src/libggml.so /libggml.so -COPY --from=build /app/build/src/libllama.so /libllama.so -COPY --from=build /app/build/bin/llama-server /llama-server +COPY --from=build /app/build/src/libjarvis.so /libjarvis.so +COPY --from=build /app/build/bin/jarvis-server /jarvis-server # Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 +ENV JARVIS_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] -ENTRYPOINT [ "/llama-server" ] +ENTRYPOINT [ "/jarvis-server" ] diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/jarvis-server-intel.Dockerfile similarity index 75% rename from .devops/llama-server-intel.Dockerfile rename to .devops/jarvis-server-intel.Dockerfile index 9c355b664f15e..1d3cc936fe00f 100644 --- a/.devops/llama-server-intel.Dockerfile +++ b/.devops/jarvis-server-intel.Dockerfile @@ -15,20 +15,20 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ fi && \ echo "Building with dynamic libs" && \ - cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ - cmake --build build --config Release --target llama-server + cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DJARVIS_CURL=ON ${OPT_SYCL_F16} && \ + cmake --build build --config Release --target jarvis-server FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev curl -COPY --from=build /app/build/bin/llama-server /llama-server +COPY --from=build /app/build/bin/jarvis-server /jarvis-server ENV LC_ALL=C.utf8 # Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 +ENV JARVIS_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] -ENTRYPOINT [ "/llama-server" ] +ENTRYPOINT [ "/jarvis-server" ] diff --git a/.devops/llama-server-musa.Dockerfile b/.devops/jarvis-server-musa.Dockerfile similarity index 68% rename from .devops/llama-server-musa.Dockerfile rename to .devops/jarvis-server-musa.Dockerfile index 193a6d77cb9ed..1c8e8938bde96 100644 --- a/.devops/llama-server-musa.Dockerfile +++ b/.devops/jarvis-server-musa.Dockerfile @@ -15,8 +15,8 @@ WORKDIR /app COPY . . -RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-server -j$(nproc) +RUN cmake -B build -DGGML_MUSA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . 
&& \ + cmake --build build --config Release --target jarvis-server -j$(nproc) FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime @@ -24,12 +24,12 @@ RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 curl COPY --from=build /app/build/ggml/src/libggml.so /libggml.so -COPY --from=build /app/build/src/libllama.so /libllama.so -COPY --from=build /app/build/bin/llama-server /llama-server +COPY --from=build /app/build/src/libjarvis.so /libjarvis.so +COPY --from=build /app/build/bin/jarvis-server /jarvis-server # Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 +ENV JARVIS_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] -ENTRYPOINT [ "/llama-server" ] +ENTRYPOINT [ "/jarvis-server" ] diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/jarvis-server-rocm.Dockerfile similarity index 84% rename from .devops/llama-server-rocm.Dockerfile rename to .devops/jarvis-server-rocm.Dockerfile index 8553af75b61fc..a9192b3dbbc91 100644 --- a/.devops/llama-server-rocm.Dockerfile +++ b/.devops/jarvis-server-rocm.Dockerfile @@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. -# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878 # This is mostly tied to rocBLAS supported archs. ARG ROCM_DOCKER_ARCH="\ gfx803 \ @@ -40,15 +40,15 @@ ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ # Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 +ENV JARVIS_ARG_HOST=0.0.0.0 # Enable cURL -ENV LLAMA_CURL=1 +ENV JARVIS_CURL=1 RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev curl -RUN make -j$(nproc) llama-server +RUN make -j$(nproc) jarvis-server HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] -ENTRYPOINT [ "/app/llama-server" ] +ENTRYPOINT [ "/app/jarvis-server" ] diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/jarvis-server-vulkan.Dockerfile similarity index 75% rename from .devops/llama-server-vulkan.Dockerfile rename to .devops/jarvis-server-vulkan.Dockerfile index 93c5e0c26e691..89811bed3e6ad 100644 --- a/.devops/llama-server-vulkan.Dockerfile +++ b/.devops/jarvis-server-vulkan.Dockerfile @@ -14,18 +14,18 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key # Build it WORKDIR /app COPY . . 
-RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \ - cmake --build build --config Release --target llama-server +RUN cmake -B build -DGGML_VULKAN=1 -DJARVIS_CURL=1 && \ + cmake --build build --config Release --target jarvis-server # Clean up WORKDIR / -RUN cp /app/build/bin/llama-server /llama-server && \ +RUN cp /app/build/bin/jarvis-server /jarvis-server && \ rm -rf /app ENV LC_ALL=C.utf8 # Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 +ENV JARVIS_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] -ENTRYPOINT [ "/llama-server" ] +ENTRYPOINT [ "/jarvis-server" ] diff --git a/.devops/llama-server.Dockerfile b/.devops/jarvis-server.Dockerfile similarity index 73% rename from .devops/llama-server.Dockerfile rename to .devops/jarvis-server.Dockerfile index 02accc85e1368..cc39a213c173e 100644 --- a/.devops/llama-server.Dockerfile +++ b/.devops/jarvis-server.Dockerfile @@ -9,21 +9,21 @@ WORKDIR /app COPY . . -ENV LLAMA_CURL=1 +ENV JARVIS_CURL=1 -RUN make -j$(nproc) llama-server +RUN make -j$(nproc) jarvis-server FROM ubuntu:$UBUNTU_VERSION AS runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 curl -COPY --from=build /app/llama-server /llama-server +COPY --from=build /app/jarvis-server /jarvis-server ENV LC_ALL=C.utf8 # Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 +ENV JARVIS_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] -ENTRYPOINT [ "/llama-server" ] +ENTRYPOINT [ "/jarvis-server" ] diff --git a/.devops/nix/apps.nix b/.devops/nix/apps.nix index 0ecf19fc56d55..af01140753974 100644 --- a/.devops/nix/apps.nix +++ b/.devops/nix/apps.nix @@ -6,10 +6,10 @@ let inherit (config.packages) default; binaries = [ - "llama-cli" - "llama-embedding" - "llama-server" - "llama-quantize" + "jarvis-cli" + "jarvis-embedding" + "jarvis-server" + "jarvis-quantize" ]; mkApp = name: { type = "app"; diff --git a/.devops/nix/docker.nix b/.devops/nix/docker.nix index d607b4575772c..502070aa8a5f2 100644 --- a/.devops/nix/docker.nix +++ b/.devops/nix/docker.nix @@ -2,14 +2,14 @@ lib, dockerTools, buildEnv, - llama-cpp, + jarvis-cpp, interactive ? true, coreutils, }: # A tar that can be fed into `docker load`: # -# $ nix build .#llamaPackages.docker +# $ nix build .#jarvisPackages.docker # $ docker load < result # For details and variations cf. @@ -19,16 +19,16 @@ # Approximate (compressed) sizes, at the time of writing, are: # -# .#llamaPackages.docker: 125M; -# .#llamaPackagesCuda.docker: 537M; -# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M. +# .#jarvisPackages.docker: 125M; +# .#jarvisPackagesCuda.docker: 537M; +# .#legacyPackages.aarch64-linux.jarvisPackagesXavier.docker: 415M. 
dockerTools.buildLayeredImage { - name = llama-cpp.pname; + name = jarvis-cpp.pname; tag = "latest"; contents = - [ llama-cpp ] + [ jarvis-cpp ] ++ lib.optionals interactive [ coreutils dockerTools.binSh diff --git a/.devops/nix/jetson-support.nix b/.devops/nix/jetson-support.nix index 78e2e40e03864..56f4c5b7805a5 100644 --- a/.devops/nix/jetson-support.nix +++ b/.devops/nix/jetson-support.nix @@ -11,10 +11,10 @@ { legacyPackages = let - caps.llamaPackagesXavier = "7.2"; - caps.llamaPackagesOrin = "8.7"; - caps.llamaPackagesTX2 = "6.2"; - caps.llamaPackagesNano = "5.3"; + caps.jarvisPackagesXavier = "7.2"; + caps.jarvisPackagesOrin = "8.7"; + caps.jarvisPackagesTX2 = "6.2"; + caps.jarvisPackagesNano = "5.3"; pkgsFor = cap: @@ -31,9 +31,9 @@ builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps; packages = lib.optionalAttrs (system == "aarch64-linux") { - jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp; - jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp; - jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp; + jetson-xavier = config.legacyPackages.jarvisPackagesXavier.jarvis-cpp; + jetson-orin = config.legacyPackages.jarvisPackagesOrin.jarvis-cpp; + jetson-nano = config.legacyPackages.jarvisPackagesNano.jarvis-cpp; }; }; } diff --git a/.devops/nix/package-gguf-py.nix b/.devops/nix/package-gguf-py.nix index cca2f36a5bd4d..62b622332bf65 100644 --- a/.devops/nix/package-gguf-py.nix +++ b/.devops/nix/package-gguf-py.nix @@ -1,6 +1,6 @@ { lib, - llamaVersion, + jarvisVersion, numpy, tqdm, sentencepiece, @@ -12,7 +12,7 @@ buildPythonPackage { pname = "gguf"; - version = llamaVersion; + version = jarvisVersion; pyproject = true; nativeBuildInputs = [ poetry-core ]; propagatedBuildInputs = [ diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 5d7d7ea5ae2d0..95d44360ceee4 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -33,7 +33,7 @@ useRocm ? config.rocmSupport, enableCurl ? true, useVulkan ? false, - llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake + jarvisVersion ? "0.0.0", # Arbitrary version, substituted by the flake # It's necessary to consistently use backendStdenv when building with CUDA support, # otherwise we get libstdc++ errors downstream. @@ -103,8 +103,8 @@ let in effectiveStdenv.mkDerivation (finalAttrs: { - pname = "llama-cpp${pnameSuffix}"; - version = llamaVersion; + pname = "jarvis-cpp${pnameSuffix}"; + version = jarvisVersion; # Note: none of the files discarded here are visible in the sandbox or # affect the output hash. This also means they can be modified without @@ -132,12 +132,12 @@ effectiveStdenv.mkDerivation (finalAttrs: { --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";" ''; - # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015, + # With PR#6015 https://github.com/ggerganov/jarvis.cpp/pull/6015, # `default.metallib` may be compiled with Metal compiler from XCode # and we need to escape sandbox on MacOS to access Metal compiler. 
# `xcrun` is used find the path of the Metal compiler, which is varible # and not on $PATH - # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion + # see https://github.com/ggerganov/jarvis.cpp/pull/6118 for discussion __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders; nativeBuildInputs = @@ -166,10 +166,10 @@ effectiveStdenv.mkDerivation (finalAttrs: { cmakeFlags = [ - (cmakeBool "LLAMA_BUILD_SERVER" true) + (cmakeBool "JARVIS_BUILD_SERVER" true) (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic)) (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) - (cmakeBool "LLAMA_CURL" enableCurl) + (cmakeBool "JARVIS_CURL" enableCurl) (cmakeBool "GGML_NATIVE" false) (cmakeBool "GGML_BLAS" useBlas) (cmakeBool "GGML_CUDA" useCuda) @@ -205,7 +205,7 @@ effectiveStdenv.mkDerivation (finalAttrs: { # if they haven't been added yet. postInstall = '' mkdir -p $out/include - cp $src/include/llama.h $out/include/ + cp $src/include/jarvis.h $out/include/ ''; meta = { @@ -218,12 +218,12 @@ effectiveStdenv.mkDerivation (finalAttrs: { # overridden by importing Nixpkgs with `allowBroken = true`. broken = (useMetalKit && !effectiveStdenv.isDarwin); - description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}"; - homepage = "https://github.com/ggerganov/llama.cpp/"; + description = "Inference of JARVIS model in pure C/C++${descriptionSuffix}"; + homepage = "https://github.com/ggerganov/jarvis.cpp/"; license = lib.licenses.mit; # Accommodates `nix run` and `lib.getExe` - mainProgram = "llama-cli"; + mainProgram = "jarvis-cli"; # These people might respond, on the best effort basis, if you ping them # in case of Nix-specific regressions or for reviewing Nix-specific PRs. diff --git a/.devops/nix/python-scripts.nix b/.devops/nix/python-scripts.nix index 392e9ffe41bf5..7c91fd9022e48 100644 --- a/.devops/nix/python-scripts.nix +++ b/.devops/nix/python-scripts.nix @@ -9,7 +9,7 @@ }@inputs: let - llama-python-deps = with python3Packages; [ + jarvis-python-deps = with python3Packages; [ numpy sentencepiece transformers @@ -18,7 +18,7 @@ let gguf-py tqdm - # for scripts/compare-llama-bench.py + # for scripts/compare-jarvis-bench.py gitpython tabulate @@ -28,7 +28,7 @@ let ]; - llama-python-test-deps = with python3Packages; [ + jarvis-python-test-deps = with python3Packages; [ # Server bench matplotlib @@ -40,7 +40,7 @@ let in buildPythonPackage ({ - pname = "llama-scripts"; + pname = "jarvis-scripts"; version = "0.0.0"; pyproject = true; @@ -61,6 +61,6 @@ buildPythonPackage ({ src = lib.cleanSource ../../.; }; nativeBuildInputs = [ poetry-core ]; - nativeCheckInputs = llama-python-test-deps; - dependencies = llama-python-deps; + nativeCheckInputs = jarvis-python-test-deps; + dependencies = jarvis-python-deps; }) diff --git a/.devops/nix/scope.nix b/.devops/nix/scope.nix index 478e8c4228afa..4b1b4ff090bd5 100644 --- a/.devops/nix/scope.nix +++ b/.devops/nix/scope.nix @@ -2,7 +2,7 @@ lib, newScope, python3, - llamaVersion ? "0.0.0", + jarvisVersion ? "0.0.0", }: let @@ -21,7 +21,7 @@ in # Cf. 
https://noogle.dev/f/lib/makeScope lib.makeScope newScope (self: { - inherit llamaVersion; + inherit jarvisVersion; gguf-py = self.callPackage ./package-gguf-py.nix { inherit buildPythonPackage @@ -34,7 +34,7 @@ lib.makeScope newScope (self: { ; }; python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; }; - llama-cpp = self.callPackage ./package.nix { }; + jarvis-cpp = self.callPackage ./package.nix { }; docker = self.callPackage ./docker.nix { }; docker-min = self.callPackage ./docker.nix { interactive = false; }; sif = self.callPackage ./sif.nix { }; diff --git a/.devops/nix/sif.nix b/.devops/nix/sif.nix index 7a5e1dd0ffc4c..cc43dd75680e9 100644 --- a/.devops/nix/sif.nix +++ b/.devops/nix/sif.nix @@ -1,7 +1,7 @@ { lib, singularity-tools, - llama-cpp, + jarvis-cpp, bashInteractive, interactive ? false, }: @@ -10,8 +10,8 @@ let optionalInt = cond: x: if cond then x else 0; in singularity-tools.buildImage rec { - inherit (llama-cpp) name; - contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ]; + inherit (jarvis-cpp) name; + contents = [ jarvis-cpp ] ++ lib.optionals interactive [ bashInteractive ]; # These are excessive (but safe) for most variants. Building singularity # images requires superuser privileges, so we build them inside a VM in a @@ -22,6 +22,6 @@ singularity-tools.buildImage rec { # Expected image sizes: # - cpu/blas: 150M, # - cuda, all gencodes: 560M, - diskSize = 4096 + optionalInt llama-cpp.useRocm 16384; + diskSize = 4096 + optionalInt jarvis-cpp.useRocm 16384; memSize = diskSize; } diff --git a/.devops/tools.sh b/.devops/tools.sh index 24dcfd35079cb..a5a56c8231fab 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -10,9 +10,9 @@ shift if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then python3 ./convert_hf_to_gguf.py "$@" elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then - ./llama-quantize "$@" + ./jarvis-quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then - ./llama-cli "$@" + ./jarvis-cli "$@" elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Converting PTH to GGML..." for i in `ls $1/$2/ggml-model-f16.bin*`; do @@ -20,17 +20,17 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Skip model quantization, it already exists: ${i/f16/q4_0}" else echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..." 
- ./llama-quantize "$i" "${i/f16/q4_0}" q4_0 + ./jarvis-quantize "$i" "${i/f16/q4_0}" q4_0 fi done elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then - ./llama-server "$@" + ./jarvis-server "$@" else echo "Unknown command: $arg1" echo "Available commands: " echo " --run (-r): Run a model previously converted into ggml" echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512" - echo " --convert (-c): Convert a llama model into ggml" + echo " --convert (-c): Convert a jarvis model into ggml" echo " ex: --outtype f16 \"/models/7B/\" " echo " --quantize (-q): Optimize with quantization process ggml" echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" diff --git a/.dockerignore b/.dockerignore index 064b7c7be86d0..a07624cfd185e 100644 --- a/.dockerignore +++ b/.dockerignore @@ -12,8 +12,8 @@ build*/ models/* -/llama-cli -/llama-quantize +/jarvis-cli +/jarvis-quantize arm_neon.h compile_commands.json diff --git a/.editorconfig b/.editorconfig index f88f8da67cd78..ec03eee394d99 100644 --- a/.editorconfig +++ b/.editorconfig @@ -24,7 +24,7 @@ insert_final_newline = unset [examples/server/public/*] indent_size = 2 -[examples/llama.swiftui/llama.swiftui.xcodeproj/*] +[examples/jarvis.swiftui/jarvis.swiftui.xcodeproj/*] indent_style = tab [examples/cvector-generator/*.txt] diff --git a/.github/ISSUE_TEMPLATE/01-bug-low.yml b/.github/ISSUE_TEMPLATE/01-bug-low.yml index 54785854f776e..281fdb74ff70f 100644 --- a/.github/ISSUE_TEMPLATE/01-bug-low.yml +++ b/.github/ISSUE_TEMPLATE/01-bug-low.yml @@ -1,5 +1,5 @@ name: Low Severity Bugs -description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches) +description: Used to report low severity bugs in jarvis.cpp (e.g. cosmetic issues, non critical UI glitches) title: "Bug: " labels: ["bug-unconfirmed", "low severity"] body: @@ -8,7 +8,7 @@ body: value: | Thanks for taking the time to fill out this bug report! Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. + and the version of jarvis.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug. - type: textarea id: what-happened @@ -24,7 +24,7 @@ body: label: Name and Version description: Which executable and which version of our software are you running? (use `--version` to get a version string) placeholder: | - $./llama-cli --version + $./jarvis-cli --version version: 2999 (42b4109e) built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu validations: diff --git a/.github/ISSUE_TEMPLATE/02-bug-medium.yml b/.github/ISSUE_TEMPLATE/02-bug-medium.yml index a6285c6f05bac..9a4f564e37aae 100644 --- a/.github/ISSUE_TEMPLATE/02-bug-medium.yml +++ b/.github/ISSUE_TEMPLATE/02-bug-medium.yml @@ -1,5 +1,5 @@ name: Medium Severity Bug -description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable) +description: Used to report medium severity bugs in jarvis.cpp (e.g. Malfunctioning Features but generally still useable) title: "Bug: " labels: ["bug-unconfirmed", "medium severity"] body: @@ -8,7 +8,7 @@ body: value: | Thanks for taking the time to fill out this bug report! Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. + and the version of jarvis.cpp that you are using. 
If possible, please provide a minimal code example that reproduces the bug. - type: textarea id: what-happened @@ -24,7 +24,7 @@ body: label: Name and Version description: Which executable and which version of our software are you running? (use `--version` to get a version string) placeholder: | - $./llama-cli --version + $./jarvis-cli --version version: 2999 (42b4109e) built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu validations: diff --git a/.github/ISSUE_TEMPLATE/03-bug-high.yml b/.github/ISSUE_TEMPLATE/03-bug-high.yml index ff816b93769c3..cfa23d4afbdfb 100644 --- a/.github/ISSUE_TEMPLATE/03-bug-high.yml +++ b/.github/ISSUE_TEMPLATE/03-bug-high.yml @@ -1,5 +1,5 @@ name: High Severity Bug -description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow) +description: Used to report high severity bugs in jarvis.cpp (e.g. Malfunctioning features hindering important common workflow) title: "Bug: " labels: ["bug-unconfirmed", "high severity"] body: @@ -8,7 +8,7 @@ body: value: | Thanks for taking the time to fill out this bug report! Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. + and the version of jarvis.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug. - type: textarea id: what-happened @@ -24,7 +24,7 @@ body: label: Name and Version description: Which executable and which version of our software are you running? (use `--version` to get a version string) placeholder: | - $./llama-cli --version + $./jarvis-cli --version version: 2999 (42b4109e) built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu validations: diff --git a/.github/ISSUE_TEMPLATE/04-bug-critical.yml b/.github/ISSUE_TEMPLATE/04-bug-critical.yml index 7af42a80b3b93..e88543452a79c 100644 --- a/.github/ISSUE_TEMPLATE/04-bug-critical.yml +++ b/.github/ISSUE_TEMPLATE/04-bug-critical.yml @@ -1,5 +1,5 @@ name: Critical Severity Bug -description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss) +description: Used to report critical severity bugs in jarvis.cpp (e.g. Crashing, Corrupted, Dataloss) title: "Bug: " labels: ["bug-unconfirmed", "critical severity"] body: @@ -8,7 +8,7 @@ body: value: | Thanks for taking the time to fill out this bug report! Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. + and the version of jarvis.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug. - type: textarea id: what-happened @@ -24,7 +24,7 @@ body: label: Name and Version description: Which executable and which version of our software are you running? 
(use `--version` to get a version string) placeholder: | - $./llama-cli --version + $./jarvis-cli --version version: 2999 (42b4109e) built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu validations: diff --git a/.github/ISSUE_TEMPLATE/05-enhancement.yml b/.github/ISSUE_TEMPLATE/05-enhancement.yml index 58fca73183d41..b33f44a627b41 100644 --- a/.github/ISSUE_TEMPLATE/05-enhancement.yml +++ b/.github/ISSUE_TEMPLATE/05-enhancement.yml @@ -1,12 +1,12 @@ name: Enhancement -description: Used to request enhancements for llama.cpp +description: Used to request enhancements for jarvis.cpp title: "Feature Request: " labels: ["enhancement"] body: - type: markdown attributes: value: | - [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas) + [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/jarvis.cpp/discussions/categories/ideas) - type: checkboxes id: prerequisites @@ -16,18 +16,18 @@ body: options: - label: I am running the latest code. Mention the version if possible as well. required: true - - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md). + - label: I carefully followed the [README.md](https://github.com/ggerganov/jarvis.cpp/blob/master/README.md). required: true - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed). required: true - - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share. + - label: I reviewed the [Discussions](https://github.com/ggerganov/jarvis.cpp/discussions), and have a new and useful enhancement to share. required: true - type: textarea id: feature-description attributes: label: Feature Description - description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement. + description: Please provide a detailed written description of what you were trying to do, and what you expected `jarvis.cpp` to do as an enhancement. placeholder: Detailed description of the enhancement validations: required: true @@ -36,7 +36,7 @@ body: id: motivation attributes: label: Motivation - description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users. + description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `jarvis.cpp` users. 
placeholder: Explanation of why this feature is needed and its benefits validations: required: true diff --git a/.github/ISSUE_TEMPLATE/06-research.yml b/.github/ISSUE_TEMPLATE/06-research.yml index 3ae4e9f8caaa4..51e4baf6fffa7 100644 --- a/.github/ISSUE_TEMPLATE/06-research.yml +++ b/.github/ISSUE_TEMPLATE/06-research.yml @@ -6,7 +6,7 @@ body: - type: markdown attributes: value: | - Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22) + Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22) - type: checkboxes id: research-stage diff --git a/.github/ISSUE_TEMPLATE/07-refactor.yml b/.github/ISSUE_TEMPLATE/07-refactor.yml index 3a68d3d5355d6..0a8a58fccd0ba 100644 --- a/.github/ISSUE_TEMPLATE/07-refactor.yml +++ b/.github/ISSUE_TEMPLATE/07-refactor.yml @@ -6,8 +6,8 @@ body: - type: markdown attributes: value: | - Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered. - Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too. + Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered. + Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/jarvis.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too. - type: textarea id: background-description diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index eb8c4b472df4c..fa85823fcdae0 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,11 +1,11 @@ blank_issues_enabled: true contact_links: - name: Got an idea? - url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas + url: https://github.com/ggerganov/jarvis.cpp/discussions/categories/ideas about: Pop it there. It may then become an enhancement ticket. - name: Got a question? - url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a + url: https://github.com/ggerganov/jarvis.cpp/discussions/categories/q-a about: Ask a question there! - name: Want to contribute? 
- url: https://github.com/ggerganov/llama.cpp/wiki/contribute + url: https://github.com/ggerganov/jarvis.cpp/wiki/contribute about: Head to the contribution guide page of the wiki for areas you can help with diff --git a/.github/labeler.yml b/.github/labeler.yml index 89436740d1ffb..7e5e48b35ac22 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -67,7 +67,7 @@ script: android: - changed-files: - any-glob-to-any-file: - - examples/llama.android/** + - examples/jarvis.android/** server: - changed-files: - any-glob-to-any-file: diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 997c6d9d05397..c1c783730f652 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,6 +1,6 @@ -- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) +- [x] I have read the [contributing guidelines](https://github.com/ggerganov/jarvis.cpp/blob/master/CONTRIBUTING.md) - Self-reported review complexity: - [ ] Low - [ ] Medium diff --git a/.github/workflows/bench.yml.disabled b/.github/workflows/bench.yml.disabled index 1c8787ef78f7e..12f092afcee5f 100644 --- a/.github/workflows/bench.yml.disabled +++ b/.github/workflows/bench.yml.disabled @@ -1,5 +1,5 @@ # TODO: there have been some issues with the workflow, so disabling for now -# https://github.com/ggerganov/llama.cpp/issues/7893 +# https://github.com/ggerganov/jarvis.cpp/issues/7893 # # Benchmark name: Benchmark @@ -27,10 +27,10 @@ on: push: branches: - master - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] + paths: ['jarvis.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] pull_request_target: types: [opened, synchronize, reopened] - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] + paths: ['jarvis.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] schedule: - cron: '04 2 * * *' @@ -113,16 +113,16 @@ jobs: set -eux cmake -B build \ -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CURL=ON \ - -DLLAMA_CUBLAS=ON \ + -DJARVIS_BUILD_SERVER=ON \ + -DJARVIS_CURL=ON \ + -DJARVIS_CUBLAS=ON \ -DCUDAToolkit_ROOT=/usr/local/cuda \ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=75 \ - -DLLAMA_FATAL_WARNINGS=OFF \ - -DLLAMA_ALL_WARNINGS=OFF \ + -DJARVIS_FATAL_WARNINGS=OFF \ + -DJARVIS_ALL_WARNINGS=OFF \ -DCMAKE_BUILD_TYPE=Release; - cmake --build build --config Release -j $(nproc) --target llama-server + cmake --build build --config Release -j $(nproc) --target jarvis-server - name: Download the dataset id: download_dataset @@ -240,7 +240,7 @@ jobs: message: |

- 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+ 📈 **jarvis.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀

@@ -249,9 +249,9 @@ jobs: Expand details for performance related PR only - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }} - - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }} - - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s - - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s + - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.JARVISCPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.JARVISCPP_COMPLETIONS_TRUNCATED_RATE_PASSES }} + - Prompt processing (pp): avg=${{ env.JARVISCPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.JARVISCPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s + - Token generation (tg): avg=${{ env.JARVISCPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.JARVISCPP_TOKENS_SECOND_P_95_ }}tk/s - ${{ env.BENCH_GRAPH_XLABEL }} diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 423173b975897..d73089ed81b2e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -28,9 +28,9 @@ env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GGML_NLOOP: 3 GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 + JARVIS_LOG_COLORS: 1 + JARVIS_LOG_PREFIX: 1 + JARVIS_LOG_TIMESTAMPS: 1 jobs: macOS-latest-cmake-arm64: @@ -55,7 +55,7 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF .. + cmake -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF .. cmake --build . 
--config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -82,14 +82,14 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* + zip -r jarvis-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip - name: llama-bin-macos-arm64.zip + path: jarvis-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip + name: jarvis-bin-macos-arm64.zip macOS-latest-cmake-x64: runs-on: macos-12 @@ -112,8 +112,8 @@ jobs: run: | sysctl -a # Metal is disabled due to intermittent failures with Github runners not having a GPU: - # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF + # https://github.com/ggerganov/jarvis.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 + cmake -B build -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -140,20 +140,20 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* + zip -r jarvis-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip - name: llama-bin-macos-x64.zip + path: jarvis-${{ steps.tag.outputs.name }}-bin-macos-x64.zip + name: jarvis-bin-macos-x64.zip ubuntu-focal-make: runs-on: ubuntu-20.04 env: - LLAMA_NODE_AVAILABLE: true - LLAMA_PYTHON_AVAILABLE: true + JARVIS_NODE_AVAILABLE: true + JARVIS_PYTHON_AVAILABLE: true steps: - name: Clone @@ -177,7 +177,7 @@ jobs: - name: Build id: make_build env: - LLAMA_FATAL_WARNINGS: 1 + JARVIS_FATAL_WARNINGS: 1 run: | CC=gcc-8 make -j $(nproc) @@ -204,8 +204,8 @@ jobs: - name: Build id: make_build env: - LLAMA_FATAL_WARNINGS: 1 - LLAMA_CURL: 1 + JARVIS_FATAL_WARNINGS: 1 + JARVIS_CURL: 1 run: | CC=gcc-8 make -j $(nproc) @@ -230,7 +230,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF + cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF cmake --build . 
--config Release -j $(nproc) - name: Test @@ -239,16 +239,16 @@ jobs: cd build ctest -L 'main|curl' --verbose --timeout 900 - - name: Test llama2c conversion - id: llama2c_test + - name: Test jarvis2c conversion + id: jarvis2c_test run: | cd build echo "Fetch tokenizer" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin - echo "Fetch llama2c model" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin - ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf - ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 + wget https://huggingface.co/karpathy/tinyjarviss/resolve/main/stories260K/tok512.bin + echo "Fetch jarvis2c model" + wget https://huggingface.co/karpathy/tinyjarviss/resolve/main/stories260K/stories260K.bin + ./bin/jarvis-convert-jarvis2c-to-ggml --copy-vocab-from-model ./tok512.bin --jarvis2c-model stories260K.bin --jarvis2c-output-model stories260K.gguf + ./bin/jarvis-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 - name: Determine tag name id: tag @@ -268,14 +268,14 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/* + zip -r jarvis-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/* - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip - name: llama-bin-ubuntu-x64.zip + path: jarvis-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip + name: jarvis-bin-ubuntu-x64.zip ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest @@ -304,7 +304,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} cmake --build . --config ${{ matrix.build_type }} -j $(nproc) - name: Build (no OpenMP) @@ -313,7 +313,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF + cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF cmake --build . --config ${{ matrix.build_type }} -j $(nproc) - name: Test @@ -487,7 +487,7 @@ jobs: # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know # how to debug it. - # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124 + # ref: https://github.com/ggerganov/jarvis.cpp/actions/runs/7131777249/job/19420981052#step:5:1124 macOS-latest-make: runs-on: macos-latest @@ -505,7 +505,7 @@ jobs: - name: Build id: make_build env: - LLAMA_FATAL_WARNINGS: 1 + JARVIS_FATAL_WARNINGS: 1 run: | GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu) @@ -517,7 +517,7 @@ jobs: # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know # how to debug it. 
- # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584 + # ref: https://github.com/ggerganov/jarvis.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584 # would be great if we fix these macOS-latest-cmake: runs-on: macos-latest @@ -539,7 +539,7 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF .. + cmake -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL=OFF .. cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -570,9 +570,9 @@ jobs: cd build cmake -G Xcode .. \ -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ + -DJARVIS_BUILD_EXAMPLES=OFF \ + -DJARVIS_BUILD_TESTS=OFF \ + -DJARVIS_BUILD_SERVER=OFF \ -DCMAKE_SYSTEM_NAME=iOS \ -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml @@ -600,9 +600,9 @@ jobs: cd build cmake -G Xcode .. \ -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ + -DJARVIS_BUILD_EXAMPLES=OFF \ + -DJARVIS_BUILD_TESTS=OFF \ + -DJARVIS_BUILD_SERVER=OFF \ -DCMAKE_SYSTEM_NAME=tvOS \ -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml @@ -629,7 +629,7 @@ jobs: - name: xcodebuild for swift package id: xcodebuild run: | - xcodebuild -scheme llama -destination "${{ matrix.destination }}" + xcodebuild -scheme jarvis -destination "${{ matrix.destination }}" - name: Build Swift Example id: make_build_swift_example @@ -705,23 +705,23 @@ jobs: matrix: include: - build: 'noavx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx2-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON' - build: 'avx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx512-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON' - build: 'openblas-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' + defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - build: 'kompute-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' - build: 'vulkan-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON 
-DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON' - build: 'llvm-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' - build: 'msvc-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' steps: - name: Clone @@ -807,7 +807,7 @@ jobs: 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) cd build - $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1 + $env:JARVIS_SKIP_TESTS_SLOW_ON_EMULATOR = 1 & $sde -future -- ctest -L main -C Release --verbose --timeout 900 - name: Determine tag name @@ -827,15 +827,15 @@ jobs: id: pack_artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | - Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\* + Copy-Item LICENSE .\build\bin\Release\jarvis.cpp.txt + 7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\* - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip - name: llama-bin-win-${{ matrix.build }}.zip + path: jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip + name: jarvis-bin-win-${{ matrix.build }}.zip windows-latest-cmake-cuda: runs-on: windows-2019 @@ -865,7 +865,7 @@ jobs: run: | mkdir build cd build - cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON + cmake .. -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml cmake --build . 
--config Release -j ${env:NUMBER_OF_PROCESSORS} @@ -886,28 +886,28 @@ jobs: id: pack_artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\* + 7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\* - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip - name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip + path: jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip + name: jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip - name: Copy and pack Cuda runtime run: | echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}" $dst='.\build\bin\cudart\' robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll - 7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\* + 7z a cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\* - name: Upload Cuda runtime if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip - name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip + path: cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip + name: cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip windows-latest-cmake-sycl: runs-on: windows-latest @@ -963,14 +963,14 @@ jobs: cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin echo "cp oneAPI running time dll files to ./build/bin done" - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* + 7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip - name: llama-bin-win-sycl-x64.zip + path: jarvis-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip + name: jarvis-bin-win-sycl-x64.zip windows-latest-cmake-hip: if: ${{ github.event.inputs.create_release != 'true' }} @@ -1060,13 +1060,13 @@ jobs: - name: Pack artifacts id: pack_artifacts run: | - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\* + 7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\* - name: Upload artifacts uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip - name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip + path: jarvis-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip + name: jarvis-bin-win-hip-x64-${{ matrix.gpu_target }}.zip ios-xcode-build: runs-on: macos-latest @@ -1076,7 +1076,7 @@ jobs: uses: actions/checkout@v4 - name: Build Xcode project - run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme 
llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build + run: xcodebuild -project examples/jarvis.swiftui/jarvis.swiftui.xcodeproj -scheme jarvis.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build android-build: runs-on: ubuntu-latest @@ -1098,7 +1098,7 @@ jobs: - name: Build run: | - cd examples/llama.android + cd examples/jarvis.android ./gradlew build --no-daemon @@ -1261,7 +1261,7 @@ jobs: # sudo apt-get install cmake # # - name: Configure -# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON +# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON # # - name: Build # run: | @@ -1300,7 +1300,7 @@ jobs: # - name: Upload binaries # uses: actions/upload-artifact@v4 # with: -# name: llama-bin-${{ matrix.arch }} +# name: jarvis-bin-${{ matrix.arch }} # path: build/bin/${{ matrix.build }} # # windows-blas: @@ -1339,7 +1339,7 @@ jobs: # run: > # cmake -S . -B ./build -A ${{ matrix.arch }} # -DCMAKE_BUILD_TYPE=${{ matrix.build }} -# -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }} +# -DJARVIS_SUPPORT_OPENBLAS=${{ matrix.blas }} # -DCMAKE_LIBRARY_PATH="$env:blasdir/lib" # # - name: Build @@ -1355,7 +1355,7 @@ jobs: # if: matrix.blas == 'ON' # uses: actions/upload-artifact@v4 # with: -# name: llama-blas-bin-${{ matrix.arch }} +# name: jarvis-blas-bin-${{ matrix.arch }} # path: build/bin/${{ matrix.build }} # # emscripten: diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index a953cdac907ae..fee3e9145be21 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -37,21 +37,21 @@ jobs: strategy: matrix: config: - - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "light", dockerfile: ".devops/jarvis-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "server", dockerfile: ".devops/jarvis-server.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" } + - { tag: "light-cuda", dockerfile: ".devops/jarvis-cli-cuda.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-cuda", dockerfile: ".devops/jarvis-server-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" } + - { tag: "light-musa", dockerfile: ".devops/jarvis-cli-musa.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-musa", dockerfile: ".devops/jarvis-server-musa.Dockerfile", platforms: "linux/amd64" } - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" } # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete - #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - #- { tag: "server-rocm", 
dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } + #- { tag: "light-rocm", dockerfile: ".devops/jarvis-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } + #- { tag: "server-rocm", dockerfile: ".devops/jarvis-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" } + - { tag: "light-intel", dockerfile: ".devops/jarvis-cli-intel.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-intel", dockerfile: ".devops/jarvis-server-intel.Dockerfile", platforms: "linux/amd64" } steps: - name: Check out the repo uses: actions/checkout@v4 diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 368dbdbe5dccc..e3344be63ad39 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -11,7 +11,7 @@ jobs: steps: - uses: actions/checkout@v4 with: - repository: "ggerganov/llama.cpp" + repository: "ggerganov/jarvis.cpp" - uses: actions/labeler@v5 with: configuration-path: '.github/labeler.yml' diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml index 0da6acdf1c81e..7473135ef5c79 100644 --- a/.github/workflows/nix-ci-aarch64.yml +++ b/.github/workflows/nix-ci-aarch64.yml @@ -47,8 +47,8 @@ jobs: extra-conf: | extra-platforms = aarch64-linux extra-system-features = nixos-test kvm - extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - uses: DeterminateSystems/magic-nix-cache-action@v2 with: upstream-cache: https://${{ matrix.cachixName }}.cachix.org @@ -56,7 +56,7 @@ jobs: uses: cachix/cachix-action@v13 with: authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' - name: llama-cpp + name: jarvis-cpp - name: Show all output paths run: > nix run github:nix-community/nix-eval-jobs diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml index 8ecbbe53b4ed1..3a748d9acf4d3 100644 --- a/.github/workflows/nix-ci.yml +++ b/.github/workflows/nix-ci.yml @@ -34,8 +34,8 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} extra-conf: | - extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - uses: DeterminateSystems/magic-nix-cache-action@v2 with: upstream-cache: https://${{ matrix.cachixName }}.cachix.org @@ -61,8 +61,8 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} extra-conf: | - extra-substituters = 
https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - uses: DeterminateSystems/magic-nix-cache-action@v2 with: upstream-cache: https://${{ matrix.cachixName }}.cachix.org @@ -70,7 +70,7 @@ jobs: uses: cachix/cachix-action@v13 with: authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' - name: llama-cpp + name: jarvis-cpp - name: Build run: > nix run github:Mic92/nix-fast-build diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 699ac095d6c83..29943d52e2dc3 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -21,10 +21,10 @@ on: paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] env: - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 - LLAMA_LOG_VERBOSITY: 10 + JARVIS_LOG_COLORS: 1 + JARVIS_LOG_PREFIX: 1 + JARVIS_LOG_TIMESTAMPS: 1 + JARVIS_LOG_VERBOSITY: 10 concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} @@ -41,7 +41,7 @@ jobs: include: - build_type: Release sanitizer: "" - fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken + fail-fast: false # While -DJARVIS_SANITIZE_THREAD=ON is broken steps: - name: Dependencies @@ -99,12 +99,12 @@ jobs: run: | cmake -B build \ -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CURL=ON \ + -DJARVIS_BUILD_SERVER=ON \ + -DJARVIS_CURL=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ + -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON \ -DGGML_OPENMP=OFF ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target jarvis-server - name: Build id: cmake_build @@ -112,11 +112,11 @@ jobs: run: | cmake -B build \ -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CURL=ON \ + -DJARVIS_BUILD_SERVER=ON \ + -DJARVIS_CURL=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server + -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON ; + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target jarvis-server - name: Tests id: server_integration_tests @@ -155,8 +155,8 @@ jobs: - name: Build id: cmake_build run: | - cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server + cmake -B build -DJARVIS_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" + cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target jarvis-server - name: Python setup id: setup_python @@ -180,7 +180,7 @@ jobs: run: | cd examples/server/tests $env:PYTHONIOENCODING = ":replace" - behave.exe --summary --stop --no-capture --exclude 
'issues|wrong_usages|passkey' --tags llama.cpp + behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags jarvis.cpp - name: Slow tests id: server_integration_tests_slow diff --git a/.gitignore b/.gitignore index 1092d097a7542..cf5abf6ff55de 100644 --- a/.gitignore +++ b/.gitignore @@ -48,8 +48,8 @@ build* !build-info.sh !build.zig !docs/build.md -/libllama.so -/llama-* +/libjarvis.so +/jarvis-* /vulkan-shaders-gen android-ndk-* arm_neon.h @@ -57,7 +57,7 @@ cmake-build-* CMakeSettings.json compile_commands.json ggml-metal-embed.metal -llama-batched-swift +jarvis-batched-swift /rpc-server out/ tmp/ @@ -118,7 +118,7 @@ poetry.toml /tests/test-double-float /tests/test-grad0 /tests/test-grammar-parser -/tests/test-llama-grammar +/tests/test-jarvis-grammar /tests/test-opt /tests/test-quantize-fns /tests/test-quantize-perf diff --git a/CMakeLists.txt b/CMakeLists.txt index ef0932a7b9277..db4944fcb677c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. -project("llama.cpp" C CXX) +project("jarvis.cpp" C CXX) include(CheckIncludeFileCXX) #set(CMAKE_WARN_DEPRECATED YES) @@ -18,20 +18,20 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) - set(LLAMA_STANDALONE ON) + set(JARVIS_STANDALONE ON) include(git-vars) # configure project version # TODO else() - set(LLAMA_STANDALONE OFF) + set(JARVIS_STANDALONE OFF) endif() if (EMSCRIPTEN) set(BUILD_SHARED_LIBS_DEFAULT OFF) - option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON) + option(JARVIS_WASM_SINGLE_FILE "jarvis: embed WASM inside the generated jarvis.js" ON) else() if (MINGW) set(BUILD_SHARED_LIBS_DEFAULT OFF) @@ -51,41 +51,41 @@ endif() # # debug -option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) -option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF) +option(JARVIS_ALL_WARNINGS "jarvis: enable all compiler warnings" ON) +option(JARVIS_ALL_WARNINGS_3RD_PARTY "jarvis: enable all compiler warnings in 3rd party libs" OFF) # build -option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF) +option(JARVIS_FATAL_WARNINGS "jarvis: enable -Werror flag" OFF) # sanitizers -option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) -option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) -option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) +option(JARVIS_SANITIZE_THREAD "jarvis: enable thread sanitizer" OFF) +option(JARVIS_SANITIZE_ADDRESS "jarvis: enable address sanitizer" OFF) +option(JARVIS_SANITIZE_UNDEFINED "jarvis: enable undefined sanitizer" OFF) # utils -option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE}) +option(JARVIS_BUILD_COMMON "jarvis: build common utils library" ${JARVIS_STANDALONE}) # extra artifacts -option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) +option(JARVIS_BUILD_TESTS "jarvis: build tests" ${JARVIS_STANDALONE}) +option(JARVIS_BUILD_EXAMPLES "jarvis: build examples" ${JARVIS_STANDALONE}) +option(JARVIS_BUILD_SERVER "jarvis: build server example" ${JARVIS_STANDALONE}) # 3rd party libs -option(LLAMA_CURL "llama: use libcurl to download model 
from an URL" OFF) +option(JARVIS_CURL "jarvis: use libcurl to download model from an URL" OFF) # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) # override ggml options -set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD}) -set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS}) -set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED}) -set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) -set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) +set(GGML_SANITIZE_THREAD ${JARVIS_SANITIZE_THREAD}) +set(GGML_SANITIZE_ADDRESS ${JARVIS_SANITIZE_ADDRESS}) +set(GGML_SANITIZE_UNDEFINED ${JARVIS_SANITIZE_UNDEFINED}) +set(GGML_ALL_WARNINGS ${JARVIS_ALL_WARNINGS}) +set(GGML_FATAL_WARNINGS ${JARVIS_FATAL_WARNINGS}) # change the default for these ggml options -if (NOT DEFINED GGML_LLAMAFILE) - set(GGML_LLAMAFILE_DEFAULT ON) +if (NOT DEFINED GGML_JARVISFILE) + set(GGML_JARVISFILE_DEFAULT ON) endif() if (NOT DEFINED GGML_AMX) @@ -97,23 +97,23 @@ if (NOT DEFINED GGML_CUDA_GRAPHS) endif() # transition helpers -function (llama_option_depr TYPE OLD NEW) +function (jarvis_option_depr TYPE OLD NEW) if (${OLD}) message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n") set(${NEW} ON PARENT_SCOPE) endif() endfunction() -llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA) -llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA) -llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE) -llama_option_depr(WARNING LLAMA_METAL GGML_METAL) -llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY) -llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE) -llama_option_depr(WARNING LLAMA_RPC GGML_RPC) -llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) -llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) -llama_option_depr(WARNING LLAMA_CANN GGML_CANN) +jarvis_option_depr(FATAL_ERROR JARVIS_CUBLAS GGML_CUDA) +jarvis_option_depr(WARNING JARVIS_CUDA GGML_CUDA) +jarvis_option_depr(WARNING JARVIS_KOMPUTE GGML_KOMPUTE) +jarvis_option_depr(WARNING JARVIS_METAL GGML_METAL) +jarvis_option_depr(WARNING JARVIS_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY) +jarvis_option_depr(WARNING JARVIS_NATIVE GGML_NATIVE) +jarvis_option_depr(WARNING JARVIS_RPC GGML_RPC) +jarvis_option_depr(WARNING JARVIS_SYCL GGML_SYCL) +jarvis_option_depr(WARNING JARVIS_SYCL_F16 GGML_SYCL_F16) +jarvis_option_depr(WARNING JARVIS_CANN GGML_CANN) # # build the library @@ -132,18 +132,18 @@ add_subdirectory(src) include(GNUInstallDirs) include(CMakePackageConfigHelpers) -set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER}) -set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT}) -set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER}) +set(JARVIS_BUILD_NUMBER ${BUILD_NUMBER}) +set(JARVIS_BUILD_COMMIT ${BUILD_COMMIT}) +set(JARVIS_INSTALL_VERSION 0.0.${BUILD_NUMBER}) -set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files") -set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") -set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") +set(JARVIS_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files") +set(JARVIS_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") +set(JARVIS_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") # At the moment some compile definitions are placed within the ggml/src # directory but not exported on the `ggml` target. 
This could be improved by -# determining _precisely_ which defines are necessary for the llama-config +# determining _precisely_ which defines are necessary for the jarvis-config # package. # set(GGML_TRANSIENT_DEFINES) @@ -158,25 +158,25 @@ if (GGML_TARGET_DEFINES) endif() get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES) -set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h) -install(TARGETS llama LIBRARY PUBLIC_HEADER) +set_target_properties(jarvis PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/jarvis.h) +install(TARGETS jarvis LIBRARY PUBLIC_HEADER) configure_package_config_file( - ${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake - INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama - PATH_VARS LLAMA_INCLUDE_INSTALL_DIR - LLAMA_LIB_INSTALL_DIR - LLAMA_BIN_INSTALL_DIR ) + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/jarvis-config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/jarvis-config.cmake + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/jarvis + PATH_VARS JARVIS_INCLUDE_INSTALL_DIR + JARVIS_LIB_INSTALL_DIR + JARVIS_BIN_INSTALL_DIR ) write_basic_package_version_file( - ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake - VERSION ${LLAMA_INSTALL_VERSION} + ${CMAKE_CURRENT_BINARY_DIR}/jarvis-version.cmake + VERSION ${JARVIS_INSTALL_VERSION} COMPATIBILITY SameMajorVersion) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake - ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/jarvis-config.cmake + ${CMAKE_CURRENT_BINARY_DIR}/jarvis-version.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/jarvis) install( FILES convert_hf_to_gguf.py @@ -190,27 +190,27 @@ install( WORLD_EXECUTE DESTINATION ${CMAKE_INSTALL_BINDIR}) -configure_file(cmake/llama.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/llama.pc" +configure_file(cmake/jarvis.pc.in + "${CMAKE_CURRENT_BINARY_DIR}/jarvis.pc" @ONLY) -install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc" +install(FILES "${CMAKE_CURRENT_BINARY_DIR}/jarvis.pc" DESTINATION lib/pkgconfig) # # utils, programs, examples and tests # -if (LLAMA_BUILD_COMMON) +if (JARVIS_BUILD_COMMON) add_subdirectory(common) endif() -if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION) +if (JARVIS_BUILD_COMMON AND JARVIS_BUILD_TESTS AND NOT CMAKE_JS_VERSION) include(CTest) add_subdirectory(tests) endif() -if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES) +if (JARVIS_BUILD_COMMON AND JARVIS_BUILD_EXAMPLES) add_subdirectory(examples) add_subdirectory(pocs) endif() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4c882c254cac5..d24987c935c10 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,7 +11,7 @@ - Squash-merge PRs - Use the following format for the squashed commit title: ` : (#)`. For example: `utils : fix typo in utils.py (#1234)` -- Optionally pick a `` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules +- Optionally pick a `` from here: https://github.com/ggerganov/jarvis.cpp/wiki/Modules # Coding guidelines @@ -22,7 +22,7 @@ - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a` - Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963) - Tensors store data in row-major order. 
We refer to dimension 0 as columns, 1 as rows, 2 as matrices -- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$ +- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/jarvis.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$ ![matmul](media/matmul.png) @@ -30,4 +30,4 @@ The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects: -https://github.com/ggerganov/llama.cpp/projects +https://github.com/ggerganov/jarvis.cpp/projects diff --git a/Makefile b/Makefile index 719f45d167463..ad411dbdf8d18 100644 --- a/Makefile +++ b/Makefile @@ -1,44 +1,44 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ libllava.a \ - llama-baby-llama \ - llama-batched \ - llama-batched-bench \ - llama-bench \ - llama-cli \ - llama-convert-llama2c-to-ggml \ - llama-embedding \ - llama-eval-callback \ - llama-export-lora \ - llama-gbnf-validator \ - llama-gguf \ - llama-gguf-hash \ - llama-gguf-split \ - llama-gritlm \ - llama-imatrix \ - llama-infill \ - llama-llava-cli \ - llama-minicpmv-cli\ - llama-lookahead \ - llama-lookup \ - llama-lookup-create \ - llama-lookup-merge \ - llama-lookup-stats \ - llama-parallel \ - llama-passkey \ - llama-perplexity \ - llama-q8dot \ - llama-quantize \ - llama-quantize-stats \ - llama-retrieval \ - llama-save-load-state \ - llama-server \ - llama-simple \ - llama-speculative \ - llama-tokenize \ - llama-vdot \ - llama-cvector-generator \ - llama-gen-docs \ + jarvis-baby-jarvis \ + jarvis-batched \ + jarvis-batched-bench \ + jarvis-bench \ + jarvis-cli \ + jarvis-convert-jarvis2c-to-ggml \ + jarvis-embedding \ + jarvis-eval-callback \ + jarvis-export-lora \ + jarvis-gbnf-validator \ + jarvis-gguf \ + jarvis-gguf-hash \ + jarvis-gguf-split \ + jarvis-gritlm \ + jarvis-imatrix \ + jarvis-infill \ + jarvis-llava-cli \ + jarvis-minicpmv-cli\ + jarvis-lookahead \ + jarvis-lookup \ + jarvis-lookup-create \ + jarvis-lookup-merge \ + jarvis-lookup-stats \ + jarvis-parallel \ + jarvis-passkey \ + jarvis-perplexity \ + jarvis-q8dot \ + jarvis-quantize \ + jarvis-quantize-stats \ + jarvis-retrieval \ + jarvis-save-load-state \ + jarvis-server \ + jarvis-simple \ + jarvis-speculative \ + jarvis-tokenize \ + jarvis-vdot \ + jarvis-cvector-generator \ + jarvis-gen-docs \ tests/test-c.o # Binaries only useful for tests @@ -52,7 +52,7 @@ TEST_TARGETS = \ tests/test-grammar-integration \ tests/test-grammar-parser \ tests/test-json-schema-to-grammar \ - tests/test-llama-grammar \ + tests/test-jarvis-grammar \ tests/test-log \ tests/test-model-load-cancel \ tests/test-opt \ @@ -65,8 +65,8 @@ TEST_TARGETS = \ tests/test-tokenizer-1-spm # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned -LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ +LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-jarvis2c-to-ggml \ + simple batched 
batched-bench save-load-state server gguf gguf-split eval-callback jarvis-bench libllava.a llava-cli baby-jarvis \ retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them. @@ -74,80 +74,80 @@ LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding LEGACY_TARGETS_BUILD = main quantize perplexity embedding server # Deprecation aliases -ifdef LLAMA_CUBLAS -$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.) +ifdef JARVIS_CUBLAS +$(error JARVIS_CUBLAS is removed. Use GGML_CUDA instead.) endif -ifdef LLAMA_CUDA +ifdef JARVIS_CUDA GGML_CUDA := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_KOMPUTE +ifdef JARVIS_KOMPUTE GGML_KOMPUTE := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_METAL +ifdef JARVIS_METAL GGML_METAL := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_RPC +ifdef JARVIS_RPC GGML_RPC := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_SYCL +ifdef JARVIS_SYCL GGML_SYCL := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_SYCL_F16 +ifdef JARVIS_SYCL_F16 GGML_SYCL_F16 := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_OPENBLAS +ifdef JARVIS_OPENBLAS GGML_OPENBLAS := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_OPENBLAS64 +ifdef JARVIS_OPENBLAS64 GGML_OPENBLAS64 := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_BLIS +ifdef JARVIS_BLIS GGML_BLIS := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_NO_LLAMAFILE -GGML_NO_LLAMAFILE := 1 +ifdef JARVIS_NO_JARVISFILE +GGML_NO_JARVISFILE := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_NO_ACCELERATE +ifdef JARVIS_NO_ACCELERATE GGML_NO_ACCELERATE := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_NO_OPENMP +ifdef JARVIS_NO_OPENMP GGML_NO_OPENMP := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_NO_METAL +ifdef JARVIS_NO_METAL GGML_NO_METAL := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_DISABLE_LOGS +ifdef JARVIS_DISABLE_LOGS REMOVE_WARNING := 1 endif -ifdef LLAMA_SERVER_VERBOSE +ifdef JARVIS_SERVER_VERBOSE REMOVE_WARNING := 1 endif @@ -211,8 +211,8 @@ test: $(TEST_TARGETS) @failures=0; \ for test_target in $(TEST_TARGETS); do \ if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \ - ./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \ - ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \ + ./$$test_target $(CURDIR)/models/ggml-vocab-jarvis-spm.gguf; \ + ./$$test_target $(CURDIR)/models/ggml-vocab-jarvis-bpe.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \ @@ -257,7 +257,7 @@ MK_CFLAGS = -std=c11 -fPIC MK_CXXFLAGS = -std=c++11 -fPIC MK_NVCCFLAGS = -std=c++11 -ifdef LLAMA_NO_CCACHE +ifdef JARVIS_NO_CCACHE GGML_NO_CCACHE := 1 DEPRECATE_WARNING := 1 endif @@ -320,7 +320,7 @@ ifdef GGML_SCHED_MAX_COPIES MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(GGML_SCHED_MAX_COPIES) endif -ifdef LLAMA_DEBUG +ifdef JARVIS_DEBUG MK_CFLAGS += -O0 -g MK_CXXFLAGS += -O0 -g MK_LDFLAGS += -g @@ -336,25 +336,25 @@ else MK_NVCCFLAGS += -O3 -g endif -ifdef LLAMA_SANITIZE_THREAD +ifdef JARVIS_SANITIZE_THREAD MK_CFLAGS += -fsanitize=thread -g MK_CXXFLAGS += -fsanitize=thread -g MK_LDFLAGS += -fsanitize=thread -g endif -ifdef LLAMA_SANITIZE_ADDRESS +ifdef JARVIS_SANITIZE_ADDRESS MK_CFLAGS += -fsanitize=address -fno-omit-frame-pointer -g MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g MK_LDFLAGS += -fsanitize=address 
-fno-omit-frame-pointer -g endif -ifdef LLAMA_SANITIZE_UNDEFINED +ifdef JARVIS_SANITIZE_UNDEFINED MK_CFLAGS += -fsanitize=undefined -g MK_CXXFLAGS += -fsanitize=undefined -g MK_LDFLAGS += -fsanitize=undefined -g endif -ifdef LLAMA_SERVER_SSL +ifdef JARVIS_SERVER_SSL MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT MK_LDFLAGS += -lssl -lcrypto endif @@ -381,7 +381,7 @@ MK_CXXFLAGS += \ -Wmissing-declarations \ -Wmissing-noreturn -ifeq ($(LLAMA_FATAL_WARNINGS),1) +ifeq ($(JARVIS_FATAL_WARNINGS),1) MK_CFLAGS += -Werror MK_CXXFLAGS += -Werror endif @@ -420,7 +420,7 @@ ifeq ($(_WIN32),1) LWINSOCK2 := -lws2_32 endif -ifdef LLAMA_GPROF +ifdef JARVIS_GPROF MK_CFLAGS += -pg MK_CXXFLAGS += -pg endif @@ -448,7 +448,7 @@ endif ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))' # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves. # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 - # https://github.com/ggerganov/llama.cpp/issues/2922 + # https://github.com/ggerganov/jarvis.cpp/issues/2922 MK_CFLAGS += -Xassembler -muse-unaligned-vector-move MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move @@ -574,9 +574,9 @@ ifdef GGML_NVPL OBJ_GGML += ggml/src/ggml-blas.o endif # GGML_NVPL -ifndef GGML_NO_LLAMAFILE - MK_CPPFLAGS += -DGGML_USE_LLAMAFILE - OBJ_GGML += ggml/src/llamafile/sgemm.o +ifndef GGML_NO_JARVISFILE + MK_CPPFLAGS += -DGGML_USE_JARVISFILE + OBJ_GGML += ggml/src/jarvisfile/sgemm.o endif ifndef GGML_NO_AMX @@ -627,9 +627,9 @@ ifdef GGML_CUDA OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) OBJ_GGML += $(OBJ_CUDA_TMPL) -ifdef LLAMA_FATAL_WARNINGS +ifdef JARVIS_FATAL_WARNINGS MK_NVCCFLAGS += -Werror all-warnings -endif # LLAMA_FATAL_WARNINGS +endif # JARVIS_FATAL_WARNINGS ifndef GGML_MUSA ifndef JETSON_EOL_MODULE_DETECT @@ -637,9 +637,9 @@ ifndef JETSON_EOL_MODULE_DETECT endif # JETSON_EOL_MODULE_DETECT endif # GGML_MUSA -ifdef LLAMA_DEBUG +ifdef JARVIS_DEBUG MK_NVCCFLAGS += -lineinfo -endif # LLAMA_DEBUG +endif # JARVIS_DEBUG ifdef GGML_CUDA_DEBUG MK_NVCCFLAGS += --device-debug @@ -920,11 +920,11 @@ OBJ_GGML += \ ggml/src/ggml-quants.o \ ggml/src/ggml-aarch64.o -OBJ_LLAMA = \ - src/llama.o \ - src/llama-vocab.o \ - src/llama-grammar.o \ - src/llama-sampling.o \ +OBJ_JARVIS = \ + src/jarvis.o \ + src/jarvis-vocab.o \ + src/jarvis-grammar.o \ + src/jarvis-sampling.o \ src/unicode.o \ src/unicode-data.o @@ -939,19 +939,19 @@ OBJ_COMMON = \ common/build-info.o \ common/json-schema-to-grammar.o -OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON) +OBJ_ALL = $(OBJ_GGML) $(OBJ_JARVIS) $(OBJ_COMMON) LIB_GGML = $(LIB_PRE)ggml$(DSO_EXT) LIB_GGML_S = $(LIB_PRE)ggml.a -LIB_LLAMA = $(LIB_PRE)llama$(DSO_EXT) -LIB_LLAMA_S = $(LIB_PRE)llama.a +LIB_JARVIS = $(LIB_PRE)jarvis$(DSO_EXT) +LIB_JARVIS_S = $(LIB_PRE)jarvis.a LIB_COMMON = $(LIB_PRE)common$(DSO_EXT) LIB_COMMON_S = $(LIB_PRE)common.a -LIB_ALL = $(LIB_GGML) $(LIB_LLAMA) $(LIB_COMMON) -LIB_ALL_S = $(LIB_GGML_S) $(LIB_LLAMA_S) $(LIB_COMMON_S) +LIB_ALL = $(LIB_GGML) $(LIB_JARVIS) $(LIB_COMMON) +LIB_ALL_S = $(LIB_GGML_S) $(LIB_JARVIS_S) $(LIB_COMMON_S) GF_CC := $(CC) include scripts/get-flags.mk @@ -971,8 +971,8 @@ include scripts/get-flags.mk CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic endif -ifdef LLAMA_CURL -override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL +ifdef JARVIS_CURL +override CXXFLAGS := $(CXXFLAGS) -DJARVIS_USE_CURL override LDFLAGS := $(LDFLAGS) -lcurl endif @@ -980,7 +980,7 @@ endif # Print build information # -$(info I llama.cpp build info: ) +$(info I jarvis.cpp build 
info: ) $(info I UNAME_S: $(UNAME_S)) $(info I UNAME_P: $(UNAME_P)) $(info I UNAME_M: $(UNAME_M)) @@ -1009,30 +1009,30 @@ $(info ) ifdef DEPRECATE_WARNING $(info !!! DEPRECATION WARNING !!!) -$(info The following LLAMA_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead) -$(info - LLAMA_CUDA) -$(info - LLAMA_METAL) -$(info - LLAMA_METAL_EMBED_LIBRARY) -$(info - LLAMA_OPENMP) -$(info - LLAMA_RPC) -$(info - LLAMA_SYCL) -$(info - LLAMA_SYCL_F16) -$(info - LLAMA_OPENBLAS) -$(info - LLAMA_OPENBLAS64) -$(info - LLAMA_BLIS) -$(info - LLAMA_NO_LLAMAFILE) -$(info - LLAMA_NO_ACCELERATE) -$(info - LLAMA_NO_OPENMP) -$(info - LLAMA_NO_METAL) -$(info - LLAMA_NO_CCACHE) +$(info The following JARVIS_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead) +$(info - JARVIS_CUDA) +$(info - JARVIS_METAL) +$(info - JARVIS_METAL_EMBED_LIBRARY) +$(info - JARVIS_OPENMP) +$(info - JARVIS_RPC) +$(info - JARVIS_SYCL) +$(info - JARVIS_SYCL_F16) +$(info - JARVIS_OPENBLAS) +$(info - JARVIS_OPENBLAS64) +$(info - JARVIS_BLIS) +$(info - JARVIS_NO_JARVISFILE) +$(info - JARVIS_NO_ACCELERATE) +$(info - JARVIS_NO_OPENMP) +$(info - JARVIS_NO_METAL) +$(info - JARVIS_NO_CCACHE) $(info ) endif ifdef REMOVE_WARNING $(info !!! REMOVAL WARNING !!!) -$(info The following LLAMA_ options have been removed and are no longer supported) -$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418)) -$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418)) +$(info The following JARVIS_ options have been removed and are no longer supported) +$(info - JARVIS_DISABLE_LOGS (https://github.com/ggerganov/jarvis.cpp/pull/9418)) +$(info - JARVIS_SERVER_VERBOSE (https://github.com/ggerganov/jarvis.cpp/pull/9418)) $(info ) endif @@ -1079,13 +1079,13 @@ ggml/src/ggml-blas.o: \ ggml/include/ggml-blas.h $(CXX) $(CXXFLAGS) -c $< -o $@ -ifndef GGML_NO_LLAMAFILE -ggml/src/llamafile/sgemm.o: \ - ggml/src/llamafile/sgemm.cpp \ - ggml/src/llamafile/sgemm.h \ +ifndef GGML_NO_JARVISFILE +ggml/src/jarvisfile/sgemm.o: \ + ggml/src/jarvisfile/sgemm.cpp \ + ggml/src/jarvisfile/sgemm.h \ ggml/include/ggml.h $(CXX) $(CXXFLAGS) -c $< -o $@ -endif # GGML_NO_LLAMAFILE +endif # GGML_NO_JARVISFILE ifndef GGML_NO_AMX ggml/src/ggml-amx.o: \ @@ -1115,7 +1115,7 @@ $(LIB_GGML_S): \ $(OBJ_GGML) ar rcs $(LIB_GGML_S) $^ -# llama +# jarvis src/unicode.o: \ src/unicode.cpp \ @@ -1127,14 +1127,14 @@ src/unicode-data.o: \ src/unicode-data.h $(CXX) $(CXXFLAGS) -c $< -o $@ -src/llama.o: \ - src/llama.cpp \ - src/llama-impl.h \ - src/llama-vocab.h \ - src/llama-grammar.h \ - src/llama-sampling.h \ +src/jarvis.o: \ + src/jarvis.cpp \ + src/jarvis-impl.h \ + src/jarvis-vocab.h \ + src/jarvis-grammar.h \ + src/jarvis-sampling.h \ src/unicode.h \ - include/llama.h \ + include/jarvis.h \ ggml/include/ggml-cuda.h \ ggml/include/ggml-metal.h \ ggml/include/ggml.h \ @@ -1142,37 +1142,37 @@ src/llama.o: \ ggml/include/ggml-backend.h $(CXX) $(CXXFLAGS) -c $< -o $@ -src/llama-vocab.o: \ - src/llama-vocab.cpp \ - src/llama-vocab.h \ - src/llama-impl.h \ - include/llama.h +src/jarvis-vocab.o: \ + src/jarvis-vocab.cpp \ + src/jarvis-vocab.h \ + src/jarvis-impl.h \ + include/jarvis.h $(CXX) $(CXXFLAGS) -c $< -o $@ -src/llama-grammar.o: \ - src/llama-grammar.cpp \ - src/llama-grammar.h \ - src/llama-impl.h \ - src/llama-vocab.h \ - src/llama-sampling.h \ - include/llama.h +src/jarvis-grammar.o: \ + src/jarvis-grammar.cpp \ + src/jarvis-grammar.h \ + src/jarvis-impl.h \ + src/jarvis-vocab.h \ 
+ src/jarvis-sampling.h \ + include/jarvis.h $(CXX) $(CXXFLAGS) -c $< -o $@ -src/llama-sampling.o: \ - src/llama-sampling.cpp \ - src/llama-sampling.h \ - src/llama-impl.h \ - include/llama.h +src/jarvis-sampling.o: \ + src/jarvis-sampling.cpp \ + src/jarvis-sampling.h \ + src/jarvis-impl.h \ + include/jarvis.h $(CXX) $(CXXFLAGS) -c $< -o $@ -$(LIB_LLAMA): \ - $(OBJ_LLAMA) \ +$(LIB_JARVIS): \ + $(OBJ_JARVIS) \ $(LIB_GGML) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) -$(LIB_LLAMA_S): \ - $(OBJ_LLAMA) - ar rcs $(LIB_LLAMA_S) $^ +$(LIB_JARVIS_S): \ + $(OBJ_JARVIS) + ar rcs $(LIB_JARVIS_S) $^ # common @@ -1183,7 +1183,7 @@ common/common.o: \ common/sampling.h \ common/json.hpp \ common/json-schema-to-grammar.h \ - include/llama.h + include/jarvis.h $(CXX) $(CXXFLAGS) -c $< -o $@ common/arg.o: \ @@ -1199,7 +1199,7 @@ common/log.o: \ common/sampling.o: \ common/sampling.cpp \ common/sampling.h \ - include/llama.h + include/jarvis.h $(CXX) $(CXXFLAGS) -c $< -o $@ common/console.o: \ @@ -1224,7 +1224,7 @@ common/ngram-cache.o: \ $(LIB_COMMON): \ $(OBJ_COMMON) \ - $(LIB_LLAMA) \ + $(LIB_JARVIS) \ $(LIB_GGML) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) @@ -1246,7 +1246,7 @@ clean: rm -rvf ggml/*.dll rm -rvf ggml/*.so rm -vrf ggml/src/*.o - rm -rvf ggml/src/llamafile/*.o + rm -rvf ggml/src/jarvisfile/*.o rm -rvf common/build-info.cpp rm -vrf ggml/src/ggml-metal-embed.metal rm -vrf ggml/src/ggml-cuda/*.o @@ -1269,75 +1269,75 @@ clean: # Helper function that replaces .c, .cpp, and .cu file endings with .o: GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1)))) -llama-cli: examples/main/main.cpp \ +jarvis-cli: examples/main/main.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @echo - @echo '==== Run ./llama-cli -h for help. ====' + @echo '==== Run ./jarvis-cli -h for help. 
====' @echo -llama-infill: examples/infill/infill.cpp \ +jarvis-infill: examples/infill/infill.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-simple: examples/simple/simple.cpp \ +jarvis-simple: examples/simple/simple.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-tokenize: examples/tokenize/tokenize.cpp \ +jarvis-tokenize: examples/tokenize/tokenize.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-batched: examples/batched/batched.cpp \ +jarvis-batched: examples/batched/batched.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-batched-bench: examples/batched-bench/batched-bench.cpp \ +jarvis-batched-bench: examples/batched-bench/batched-bench.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-quantize: examples/quantize/quantize.cpp \ +jarvis-quantize: examples/quantize/quantize.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \ +jarvis-quantize-stats: examples/quantize-stats/quantize-stats.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-perplexity: examples/perplexity/perplexity.cpp \ +jarvis-perplexity: examples/perplexity/perplexity.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-imatrix: examples/imatrix/imatrix.cpp \ +jarvis-imatrix: examples/imatrix/imatrix.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-embedding: examples/embedding/embedding.cpp \ +jarvis-embedding: examples/embedding/embedding.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-gritlm: examples/gritlm/gritlm.cpp \ +jarvis-gritlm: examples/gritlm/gritlm.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-save-load-state: examples/save-load-state/save-load-state.cpp \ +jarvis-save-load-state: examples/save-load-state/save-load-state.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-gguf: examples/gguf/gguf.cpp \ +jarvis-gguf: examples/gguf/gguf.cpp \ $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1354,92 +1354,92 @@ examples/gguf-hash/deps/sha256/sha256.o: \ examples/gguf-hash/deps/sha256/sha256.c $(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@ -llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp 
examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\ +jarvis-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-gguf-split: examples/gguf-split/gguf-split.cpp \ +jarvis-gguf-split: examples/gguf-split/gguf-split.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-eval-callback: examples/eval-callback/eval-callback.cpp \ +jarvis-eval-callback: examples/eval-callback/eval-callback.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \ +jarvis-cvector-generator: examples/cvector-generator/cvector-generator.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \ +jarvis-convert-jarvis2c-to-ggml: examples/convert-jarvis2c-to-ggml/convert-jarvis2c-to-ggml.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-bench: examples/llama-bench/llama-bench.cpp \ +jarvis-bench: examples/jarvis-bench/jarvis-bench.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-baby-llama: examples/baby-llama/baby-llama.cpp \ +jarvis-baby-jarvis: examples/baby-jarvis/baby-jarvis.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-export-lora: examples/export-lora/export-lora.cpp \ +jarvis-export-lora: examples/export-lora/export-lora.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-retrieval: examples/retrieval/retrieval.cpp \ +jarvis-retrieval: examples/retrieval/retrieval.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-speculative: examples/speculative/speculative.cpp \ +jarvis-speculative: examples/speculative/speculative.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-parallel: examples/parallel/parallel.cpp \ +jarvis-parallel: examples/parallel/parallel.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-lookahead: examples/lookahead/lookahead.cpp \ +jarvis-lookahead: examples/lookahead/lookahead.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-lookup: examples/lookup/lookup.cpp \ +jarvis-lookup: 
examples/lookup/lookup.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-lookup-create: examples/lookup/lookup-create.cpp \ +jarvis-lookup-create: examples/lookup/lookup-create.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-lookup-merge: examples/lookup/lookup-merge.cpp \ +jarvis-lookup-merge: examples/lookup/lookup-merge.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-lookup-stats: examples/lookup/lookup-stats.cpp \ +jarvis-lookup-stats: examples/lookup/lookup-stats.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-passkey: examples/passkey/passkey.cpp \ +jarvis-passkey: examples/passkey/passkey.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \ +jarvis-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1450,7 +1450,7 @@ rpc-server: examples/rpc/rpc-server.cpp \ $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) endif # GGML_RPC -llama-server: \ +jarvis-server: \ examples/server/server.cpp \ examples/server/utils.hpp \ examples/server/httplib.h \ @@ -1485,7 +1485,7 @@ examples/server/%.hpp: examples/server/public/% Makefile echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ ) > $@ -llama-gen-docs: examples/gen-docs/gen-docs.cpp \ +jarvis-gen-docs: examples/gen-docs/gen-docs.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1499,7 +1499,7 @@ libllava.a: examples/llava/llava.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual -llama-llava-cli: examples/llava/llava-cli.cpp \ +jarvis-llava-cli: examples/llava/llava-cli.cpp \ examples/llava/llava.cpp \ examples/llava/llava.h \ examples/llava/clip.cpp \ @@ -1507,7 +1507,7 @@ llama-llava-cli: examples/llava/llava-cli.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual -llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \ +jarvis-minicpmv-cli: examples/llava/minicpmv-cli.cpp \ examples/llava/llava.cpp \ examples/llava/llava.h \ examples/llava/clip.cpp \ @@ -1542,7 +1542,7 @@ tests/test-arg-parser: tests/test-arg-parser.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-llama-grammar: tests/test-llama-grammar.cpp \ +tests/test-jarvis-grammar: tests/test-jarvis-grammar.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1616,7 +1616,7 @@ tests/test-rope: tests/test-rope.cpp ggml/src/ggml.o \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-c.o: tests/test-c.c include/llama.h +tests/test-c.o: 
tests/test-c.c include/jarvis.h $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ tests/test-backend-ops: tests/test-backend-ops.cpp \ @@ -1643,12 +1643,12 @@ tests/test-chat-template: tests/test-chat-template.cpp \ # PoCs # -llama-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \ +jarvis-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \ $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \ +jarvis-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \ $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1667,17 +1667,17 @@ examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning # Eventually we will want to remove these target from building all the time. main: examples/deprecation-warning/deprecation-warning.o $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) - @echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead." + @echo "NOTICE: The 'main' binary is deprecated. Please use 'jarvis-cli' instead." server: examples/deprecation-warning/deprecation-warning.o $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) - @echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead." + @echo "NOTICE: The 'server' binary is deprecated. Please use 'jarvis-server' instead." quantize: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard quantize)) $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" - @echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead." + @echo "WARNING: The 'quantize' binary is deprecated. Please use 'jarvis-quantize' instead." @echo " Remove the 'quantize' binary to remove this warning." @echo "#########" endif @@ -1686,7 +1686,7 @@ perplexity: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard perplexity)) $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" - @echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead." + @echo "WARNING: The 'perplexity' binary is deprecated. Please use 'jarvis-perplexity' instead." @echo " Remove the 'perplexity' binary to remove this warning." @echo "#########" endif @@ -1695,7 +1695,7 @@ embedding: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard embedding)) $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" - @echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead." + @echo "WARNING: The 'embedding' binary is deprecated. Please use 'jarvis-embedding' instead." @echo " Remove the 'embedding' binary to remove this warning." 
@echo "#########" endif diff --git a/Package.swift b/Package.swift index 3a17e6c349b01..2832bcf5c3caa 100644 --- a/Package.swift +++ b/Package.swift @@ -3,10 +3,10 @@ import PackageDescription var sources = [ - "src/llama.cpp", - "src/llama-vocab.cpp", - "src/llama-grammar.cpp", - "src/llama-sampling.cpp", + "src/jarvis.cpp", + "src/jarvis-vocab.cpp", + "src/jarvis-grammar.cpp", + "src/jarvis-sampling.cpp", "src/unicode.cpp", "src/unicode-data.cpp", "ggml/src/ggml.c", @@ -45,7 +45,7 @@ cSettings.append( #endif let package = Package( - name: "llama", + name: "jarvis", platforms: [ .macOS(.v12), .iOS(.v14), @@ -53,11 +53,11 @@ let package = Package( .tvOS(.v14) ], products: [ - .library(name: "llama", targets: ["llama"]), + .library(name: "jarvis", targets: ["jarvis"]), ], targets: [ .target( - name: "llama", + name: "jarvis", path: ".", exclude: [ "cmake", diff --git a/README.md b/README.md index 8fe1f4b4b6a7a..94bd09da9df8c 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,30 @@ -# llama.cpp +# jarvis.cpp -![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) +![jarvis](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) -[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) -[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp) +[![Server](https://github.com/ggerganov/jarvis.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/jarvis.cpp/actions/workflows/server.yml) +[![Conan Center](https://shields.io/conan/v/jarvis-cpp)](https://conan.io/center/jarvis-cpp) -[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) +[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/jarvis.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/jarvis.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) -Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ +Inference of Meta's [JARVIS](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ ## Recent API changes -- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289) -- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291) +- [Changelog for `libjarvis` API](https://github.com/ggerganov/jarvis.cpp/issues/9289) +- [Changelog for `jarvis-server` REST API](https://github.com/ggerganov/jarvis.cpp/issues/9291) ## Hot topics -- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669** -- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) +- **Hugging Face Inference Endpoints now support GGUF out of the box! 
https://github.com/ggerganov/jarvis.cpp/discussions/9669** +- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/jarvis.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) ---- ## Description -The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide +The main goal of `jarvis.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide variety of hardware - locally and in the cloud. - Plain C/C++ implementation without any dependencies @@ -35,7 +35,7 @@ variety of hardware - locally and in the cloud. - Vulkan and SYCL backend support - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity -Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has +Since its [inception](https://github.com/ggerganov/jarvis.cpp/issues/33#issuecomment-1465108022), the project has improved significantly thanks to many contributions. It is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library. @@ -43,31 +43,31 @@ improved significantly thanks to many contributions. It is the main playground f Typically finetunes of the base models below are supported as well. -- [X] LLaMA 🦙 -- [x] LLaMA 2 🦙🦙 -- [x] LLaMA 3 🦙🦙🦙 +- [X] JARVIS 🦙 +- [x] JARVIS 2 🦙🦙 +- [x] JARVIS 3 🦙🦙🦙 - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral) - [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct) - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon) -- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) +- [X] [Chinese JARVIS / Alpaca](https://github.com/ymcui/Chinese-JARVIS-Alpaca) and [Chinese JARVIS-2 / Alpaca-2](https://github.com/ymcui/Chinese-JARVIS-Alpaca-2) - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) -- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423) +- [X] [BERT](https://github.com/ggerganov/jarvis.cpp/pull/5423) - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft) - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila) -- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) +- [X] [Starcoder models](https://github.com/ggerganov/jarvis.cpp/pull/3187) - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim) -- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417) -- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553) +- [X] [MPT](https://github.com/ggerganov/jarvis.cpp/pull/3417) +- [X] [Bloom](https://github.com/ggerganov/jarvis.cpp/pull/3553) - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi) - [X] [StableLM models](https://huggingface.co/stabilityai) - [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek) - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen) -- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557) +- [x] [PLaMo-13B](https://github.com/ggerganov/jarvis.cpp/pull/3557) - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi) - [x] [GPT-2](https://huggingface.co/gpt2) -- [x] [Orion 
14B](https://github.com/ggerganov/llama.cpp/pull/5118) +- [x] [Orion 14B](https://github.com/ggerganov/jarvis.cpp/pull/5118) - [x] [InternLM2](https://huggingface.co/models?search=internlm2) - [x] [CodeShell](https://github.com/WisdomShell/codeshell) - [x] [Gemma](https://ai.google.dev/gemma) @@ -111,36 +111,36 @@ Typically finetunes of the base models below are supported as well. **Bindings:** -- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) -- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) -- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) -- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) +- Python: [abetlen/jarvis-cpp-python](https://github.com/abetlen/jarvis-cpp-python) +- Go: [go-skynet/go-jarvis.cpp](https://github.com/go-skynet/go-jarvis.cpp) +- Node.js: [withcatai/node-jarvis-cpp](https://github.com/withcatai/node-jarvis-cpp) +- JS/TS (jarvis.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/jarviscpp) - JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli) -- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm) -- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama) -- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) -- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs) -- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) -- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) -- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) +- JavaScript/Wasm (works in browser): [tangledgroup/jarvis-cpp-wasm](https://github.com/tangledgroup/jarvis-cpp-wasm) +- Typescript/Wasm (nicer API, available on npm): [ngxson/wjarvis](https://github.com/ngxson/wjarvis) +- Ruby: [yoshoku/jarvis_cpp.rb](https://github.com/yoshoku/jarvis_cpp.rb) +- Rust (more features): [edgenai/jarvis_cpp-rs](https://github.com/edgenai/jarvis_cpp-rs) +- Rust (nicer API): [mdrokz/rust-jarvis.cpp](https://github.com/mdrokz/rust-jarvis.cpp) +- Rust (more direct bindings): [utilityai/jarvis-cpp-rs](https://github.com/utilityai/jarvis-cpp-rs) +- C#/.NET: [SciSharp/JarvisSharp](https://github.com/SciSharp/JarvisSharp) - C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) -- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj) -- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn) -- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp) -- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig) -- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) -- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326) -- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp) -- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift) -- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama) +- Clojure: 
[phronmophobic/jarvis.clj](https://github.com/phronmophobic/jarvis.clj) +- React Native: [mybigday/jarvis.rn](https://github.com/mybigday/jarvis.rn) +- Java: [kherud/java-jarvis.cpp](https://github.com/kherud/java-jarvis.cpp) +- Zig: [deins/jarvis.cpp.zig](https://github.com/Deins/jarvis.cpp.zig) +- Flutter/Dart: [netdur/jarvis_cpp_dart](https://github.com/netdur/jarvis_cpp_dart) +- PHP (API bindings and features built on top of jarvis.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/jarvis.cpp/pull/6326) +- Guile Scheme: [guile_jarvis_cpp](https://savannah.nongnu.org/projects/guile-jarvis-cpp) +- Swift [srgtuszy/jarvis-cpp-swift](https://github.com/srgtuszy/jarvis-cpp-swift) +- Swift [ShenghaiWang/SwiftJarvis](https://github.com/ShenghaiWang/SwiftJarvis) **UI:** Unless otherwise noted these projects are open-source with permissive licensing: - [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT) -- [iohub/collama](https://github.com/iohub/coLLaMA) +- [iohub/cojarvis](https://github.com/iohub/coJARVIS) - [janhq/jan](https://github.com/janhq/jan) (AGPL) - [nat/openplayground](https://github.com/nat/openplayground) - [Faraday](https://faraday.dev/) (proprietary) @@ -149,9 +149,9 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [ramalama](https://github.com/containers/ramalama) (MIT) - [LocalAI](https://github.com/mudler/LocalAI) (MIT) - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) -- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) +- [Mozilla-Ocho/jarvisfile](https://github.com/Mozilla-Ocho/jarvisfile) - [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) -- [ollama/ollama](https://github.com/ollama/ollama) +- [ojarvis/ojarvis](https://github.com/ojarvis/ojarvis) - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL) - [psugihara/FreeChat](https://github.com/psugihara/FreeChat) - [cztomsik/ava](https://github.com/cztomsik/ava) (MIT) @@ -173,24 +173,24 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [AIKit](https://github.com/sozercan/aikit) (MIT) - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL) - [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT) -- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL) +- [Jarvis Assistant](https://github.com/vietanhdev/jarvis-assistant) (GPL) - [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT) -*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* +*(to have a project listed here, it should clearly state that it depends on `jarvis.cpp`)* **Tools:** - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML -- [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp -- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption +- [akx/ojarvis-dl](https://github.com/akx/ojarvis-dl) – download models from the Ojarvis library to be used directly with jarvis.cpp +- [crashr/gppm](https://github.com/crashr/gppm) – launch jarvis.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption - 
[gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage -- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example) +- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-jarvis-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example) **Infrastructure:** -- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp +- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for jarvis.cpp - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs -- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly +- [jarvis_cpp_canister](https://github.com/onicai/jarvis_cpp_canister) - jarvis.cpp as a smart contract on the Internet Computer, using WebAssembly **Games:** - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you. @@ -198,11 +198,11 @@ Unless otherwise noted these projects are open-source with permissive licensing: ## Demo
-Typical run using LLaMA v2 13B on M2 Ultra +Typical run using JARVIS v2 13B on M2 Ultra ``` -$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -I llama.cpp build info: +$ make -j && ./jarvis-cli -m models/jarvis-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e +I jarvis.cpp build info: I UNAME_S: Darwin I UNAME_P: arm I UNAME_M: arm64 @@ -215,12 +215,12 @@ I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1) make: Nothing to be done for `default'. main: build = 1041 (cf658ad) main: seed = 1692823051 -llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest)) -llama_model_loader: - type f32: 81 tensors -llama_model_loader: - type q4_0: 281 tensors -llama_model_loader: - type q6_K: 1 tensors +jarvis_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/jarvis-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest)) +jarvis_model_loader: - type f32: 81 tensors +jarvis_model_loader: - type q4_0: 281 tensors +jarvis_model_loader: - type q6_K: 1 tensors llm_load_print_meta: format = GGUF V1 (latest) -llm_load_print_meta: arch = llama +llm_load_print_meta: arch = jarvis llm_load_print_meta: vocab type = SPM llm_load_print_meta: n_vocab = 32000 llm_load_print_meta: n_merges = 0 @@ -240,7 +240,7 @@ llm_load_print_meta: freq_scale = 1 llm_load_print_meta: model type = 13B llm_load_print_meta: model ftype = mostly Q4_0 llm_load_print_meta: model size = 13.02 B -llm_load_print_meta: general.name = LLaMA v2 +llm_load_print_meta: general.name = JARVIS v2 llm_load_print_meta: BOS token = 1 '' llm_load_print_meta: EOS token = 2 '' llm_load_print_meta: UNK token = 0 '' @@ -248,8 +248,8 @@ llm_load_print_meta: LF token = 13 '<0x0A>' llm_load_tensors: ggml ctx size = 0.11 MB llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state) ................................................................................................... -llama_new_context_with_model: kv self size = 400.00 MB -llama_new_context_with_model: compute buffer total size = 75.41 MB +jarvis_new_context_with_model: kv self size = 400.00 MB +jarvis_new_context_with_model: compute buffer total size = 75.41 MB system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000 @@ -271,19 +271,19 @@ How does a Website Work? A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable! The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). 
They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking. How to -llama_print_timings: load time = 576.45 ms -llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second) -llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second) -llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second) -llama_print_timings: total time = 25431.49 ms +jarvis_print_timings: load time = 576.45 ms +jarvis_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second) +jarvis_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second) +jarvis_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second) +jarvis_print_timings: total time = 25431.49 ms ```
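For repeatable throughput numbers, rather than the one-off generation shown in the demo run above, the `jarvis-bench` target defined earlier in this Makefile can be used. This is a minimal sketch, assuming the flags of the former `llama-bench` tool (`-m`, `-p`, `-n`, `-ngl`) carry over unchanged after the rename:

```bash
# Hedged sketch: measure prompt processing (-p) and text generation (-n) speed
# for the same model file used in the demo run above.
./jarvis-bench -m models/jarvis-13b-v2/ggml-model-q4_0.gguf -p 512 -n 128

# With a GPU backend built in, offload all layers (assumes -ngl is unchanged).
./jarvis-bench -m models/jarvis-13b-v2/ggml-model-q4_0.gguf -p 512 -n 128 -ngl 99
```

The tokens-per-second figures it reports should be broadly comparable to the `jarvis_print_timings` eval rates printed in the demo output.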
-Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook +Demo of running both JARVIS-7B and whisper.cpp on a single M1 Pro MacBook -And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook: +And here is another demo of running both JARVIS-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook: https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4 @@ -297,14 +297,14 @@ Here are the end-to-end binary build and model conversion steps for most support Firstly, you need to get the binary. There are different methods that you can follow: - Method 1: Clone this repository and build locally, see [how to build](./docs/build.md) -- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md) +- Method 2: If you are using MacOS or Linux, you can install jarvis.cpp via [brew, flox or nix](./docs/install.md) - Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md) -- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases) +- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/jarvis.cpp/releases) You can run a basic completion using this command: ```bash -llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128 +jarvis-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128 # Output: # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey. @@ -317,7 +317,7 @@ See [this page](./examples/main/README.md) for a full list of parameters. If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter: ```bash -llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv +jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv # Output: # > hi, who are you? @@ -327,26 +327,26 @@ llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv # Easy peasy! The answer to 1+1 is... 2! ``` -By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) +By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. 
See the list of [supported templates](https://github.com/ggerganov/jarvis.cpp/wiki/Templates-supported-by-jarvis_chat_apply_template) ```bash -./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml +./jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml ``` You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters: ```bash -./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:' +./jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:' ``` ### Web server -[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients. +[jarvis.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients. Example usage: ```bash -./llama-server -m your_model.gguf --port 8080 +./jarvis-server -m your_model.gguf --port 8080 # Basic web UI can be accessed via browser: http://localhost:8080 # Chat completion endpoint: http://localhost:8080/v1/chat/completions @@ -357,7 +357,7 @@ Example usage: > [!NOTE] > If you prefer basic usage, please consider using conversation mode instead of interactive mode -In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`. +In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes JARVIS emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`. Here is an example of a few-shot interaction, invoked with the command @@ -369,16 +369,16 @@ Here is an example of a few-shot interaction, invoked with the command ./examples/chat-13B.sh # custom arguments using a 13B model -./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt +./jarvis-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt ``` -Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program. +Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `jarvis-cli` example program. 
![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png) ### Persistent Interaction -The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file. +The prompt, user inputs, and model generations can be saved and resumed across calls to `./jarvis-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file. ```bash # Start a new chat @@ -397,10 +397,10 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \ ### Constrained output with grammars -`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only: +`jarvis.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only: ```bash -./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' +./jarvis-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' ``` The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). @@ -409,7 +409,7 @@ For authoring more complex JSON grammars, you can also check out https://grammar ## Build -Please refer to [Build llama.cpp locally](./docs/build.md) +Please refer to [Build jarvis.cpp locally](./docs/build.md) ## Supported backends @@ -430,12 +430,12 @@ Please refer to [Build llama.cpp locally](./docs/build.md) ### Prepare and Quantize > [!NOTE] -> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours. +> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `jarvis.cpp` main every 6 hours. -To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. +To obtain the official JARVIS 2 weights please see the Obtaining and using the Facebook JARVIS 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. 
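For readers who want the conversion and quantization flow in one place, here is a minimal sketch assuming a locally downloaded Hugging Face model directory; the converter is the `convert_hf_to_gguf.py` script mentioned in the note that follows, the quantize invocation mirrors the `./bin/jarvis-quantize` calls in `ci/run.sh` later in this diff, and all paths are placeholders:

```bash
# Hedged sketch: HF checkpoint directory -> f16 GGUF -> 4-bit quantized GGUF.
# Paths and output names are placeholders; the binary location depends on how you
# built the project (repo root for `make`, build/bin/ for CMake).
python3 convert_hf_to_gguf.py ./models/my-model --outfile ./models/my-model/ggml-model-f16.gguf

# jarvis-quantize takes: <input gguf> <output gguf> <quantization type>
./jarvis-quantize ./models/my-model/ggml-model-f16.gguf ./models/my-model/ggml-model-q4_0.gguf q4_0
```

The resulting `.gguf` file can then be passed to `jarvis-cli` or `jarvis-server` with `-m`, as in the usage examples above.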
-Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives. -It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face. +Note: `convert.py` has been moved to `examples/convert_legacy_jarvis.py` and shouldn't be used for anything other than `Jarvis/Jarvis2/Mistral` models and their derivatives. +It does not support JARVIS 3, you can use `convert_hf_to_gguf.py` with JARVIS 3 downloaded from Hugging Face. To learn more about quantizing model, [read this documentation](./examples/quantize/README.md) @@ -444,17 +444,17 @@ To learn more about quantizing model, [read this documentation](./examples/quant You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better). For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity). -To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md) +To learn more how to measure perplexity using jarvis.cpp, [read this documentation](./examples/perplexity/README.md) ## Contributing - Contributors can open PRs -- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch +- Collaborators can push to branches in the `jarvis.cpp` repo and merge PRs into the `master` branch - Collaborators will be invited based on contributions - Any help with managing issues, PRs and projects is very appreciated! -- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions +- See [good first issues](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information -- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205) +- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/jarvis.cpp/discussions/205) - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532) ## Other documentations @@ -470,14 +470,14 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio - [Running on Docker](./docs/docker.md) - [Build on Android](./docs/android.md) - [Performance troubleshooting](./docs/development/token_generation_performance_tips.md) -- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks) +- [GGML tips & tricks](https://github.com/ggerganov/jarvis.cpp/wiki/GGML-Tips-&-Tricks) **Seminal papers and background on the models** -If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. 
This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: -- LLaMA: - - [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) - - [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) +If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of JARVIS models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between JARVIS models and ChatGPT: +- JARVIS: + - [Introducing JARVIS: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-jarvis-meta-ai/) + - [JARVIS: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) - GPT-3 - [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) - GPT-3.5 / InstructGPT / ChatGPT: diff --git a/SECURITY.md b/SECURITY.md index f4322c6ee4d18..da3cafecc23e6 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,6 +1,6 @@ # Security Policy - - [**Using llama.cpp securely**](#using-llamacpp-securely) + - [**Using jarvis.cpp securely**](#using-jarviscpp-securely) - [Untrusted models](#untrusted-models) - [Untrusted inputs](#untrusted-inputs) - [Data privacy](#data-privacy) @@ -8,7 +8,7 @@ - [Multi-Tenant environments](#multi-tenant-environments) - [**Reporting a vulnerability**](#reporting-a-vulnerability) -## Using llama.cpp securely +## Using jarvis.cpp securely ### Untrusted models Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources. @@ -26,7 +26,7 @@ For maximum security when handling untrusted inputs, you may need to employ the * Sandboxing: Isolate the environment where the inference happens. * Pre-analysis: Check how the model performs by default when exposed to prompt injection (e.g. using [fuzzing for prompt injection](https://github.com/FonduAI/awesome-prompt-injection?tab=readme-ov-file#tools)). This will give you leads on how hard you will have to work on the next topics. -* Updates: Keep both LLaMA C++ and your libraries updated with the latest security patches. +* Updates: Keep both JARVIS C++ and your libraries updated with the latest security patches. * Input Sanitation: Before feeding data to the model, sanitize inputs rigorously. This involves techniques such as: * Validation: Enforce strict rules on allowed characters and data types. * Filtering: Remove potentially malicious scripts or code fragments. @@ -57,11 +57,11 @@ If you intend to run multiple models in parallel with shared memory, it is your ## Reporting a vulnerability -Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++. +Beware that none of the topics under [Using jarvis.cpp securely](#using-jarviscpp-securely) are considered vulnerabilities of JARVIS C++. However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released. 
-Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new). +Please disclose it as a private [security advisory](https://github.com/ggerganov/jarvis.cpp/security/advisories/new). A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure. diff --git a/ci/README.md b/ci/README.md index 4064705190697..a6a39b7901f18 100644 --- a/ci/README.md +++ b/ci/README.md @@ -1,11 +1,11 @@ # CI -In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework: +In addition to [Github Actions](https://github.com/ggerganov/jarvis.cpp/actions) `jarvis.cpp` uses a custom CI framework: https://github.com/ggml-org/ci It monitors the `master` branch for new commits and runs the -[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us +[ci/run.sh](https://github.com/ggerganov/jarvis.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled to cover various hardware architectures, including GPU and Apple Silicon instances. diff --git a/ci/run.sh b/ci/run.sh index dc26d94eed1fd..d4d934e86ac69 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -36,7 +36,7 @@ sd=`dirname $0` cd $sd/../ SRC=`pwd` -CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON" +CMAKE_EXTRA="-DJARVIS_FATAL_WARNINGS=ON" if [ ! -z ${GG_BUILD_METAL} ]; then CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON" @@ -217,7 +217,7 @@ function gg_sum_test_scripts_release { function gg_get_model { local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf" local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf" - local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf" + local gguf_2="$MNT/models/open-jarvis/7B-v2/ggml-model-f16.gguf" if [[ -s $gguf_0 ]]; then echo -n "$gguf_0" elif [[ -s $gguf_1 ]]; then @@ -236,7 +236,7 @@ function gg_run_ctest_with_model_debug { local model; model=$(gg_get_model) cd build-ci-debug set -e - (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log + (JARVISCPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log set +e cd .. } @@ -247,7 +247,7 @@ function gg_run_ctest_with_model_release { local model; model=$(gg_get_model) cd build-ci-release set -e - (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log + (JARVISCPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log set +e cd .. 
} @@ -272,24 +272,24 @@ function gg_sum_ctest_with_model_release { gg_printf '```\n' } -# open_llama_7b_v2 +# open_jarvis_7b_v2 -function gg_run_open_llama_7b_v2 { +function gg_run_open_jarvis_7b_v2 { cd ${SRC} - gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json - gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model - gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json - gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json - gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json - gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin - gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin - gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json + gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/config.json + gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/tokenizer.model + gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/tokenizer_config.json + gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/special_tokens_map.json + gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/pytorch_model.bin.index.json + gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin + gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin + gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/generation_config.json gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ - path_models="../models-mnt/open-llama/7B-v2" + path_models="../models-mnt/open-jarvis/7B-v2" path_wiki="../models-mnt/wikitext/wikitext-2-raw" rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release @@ -299,7 +299,7 @@ function gg_run_open_llama_7b_v2 { (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. 
) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log - python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf + python3 ../examples/convert_legacy_jarvis.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf model_f16="${path_models}/ggml-model-f16.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf" @@ -315,47 +315,47 @@ function gg_run_open_llama_7b_v2 { wiki_test="${path_wiki}/wiki.test.raw" - ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 - ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0 - ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1 - ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0 - ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1 - ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k - ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k - ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k - ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k - ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k - - (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 
--chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - - (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + ./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k + ./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k + ./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k + ./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k + ./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k + + (time ./bin/jarvis-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/jarvis-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/jarvis-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/jarvis-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/jarvis-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/jarvis-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/jarvis-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/jarvis-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/jarvis-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of 
life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/jarvis-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/jarvis-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + + (time ./bin/jarvis-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/jarvis-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/jarvis-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/jarvis-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -387,10 +387,10 @@ function gg_run_open_llama_7b_v2 { set +e } -function gg_sum_open_llama_7b_v2 { +function gg_sum_open_jarvis_7b_v2 { gg_printf '### %s\n\n' "${ci}" - gg_printf 'OpenLLaMA 7B-v2:\n' + gg_printf 'OpenJARVIS 7B-v2:\n' gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" @@ -449,45 +449,45 @@ function gg_run_pythia_1_4b { wiki_test_60="${path_wiki}/wiki.test-60.raw" - ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 - ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0 - ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1 - ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0 - ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1 - ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k - ./bin/llama-quantize ${model_f16} 
${model_q3_k} q3_k - ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k - ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k - ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k - - (time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - - (time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a 
$OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + ./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k + ./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k + ./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k + ./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k + ./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k + + (time ./bin/jarvis-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/jarvis-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/jarvis-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/jarvis-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/jarvis-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/jarvis-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/jarvis-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/jarvis-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/jarvis-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/jarvis-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/jarvis-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 
) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + + (time ./bin/jarvis-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/jarvis-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -580,47 +580,47 @@ function gg_run_pythia_2_8b { wiki_test="${path_wiki}/wiki.test.raw" - ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 - ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0 - ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1 - ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0 - ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1 - ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k - ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k - ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k - ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k - ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k - - (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 
4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - - (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + ./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k + ./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k + ./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k + ./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k + ./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k + + (time ./bin/jarvis-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/jarvis-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/jarvis-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/jarvis-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/jarvis-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/jarvis-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/jarvis-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log 
+ (time ./bin/jarvis-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/jarvis-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/jarvis-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/jarvis-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + + (time ./bin/jarvis-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/jarvis-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/jarvis-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/jarvis-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -704,10 +704,10 @@ function gg_run_embd_bge_small { model_f16="${path_models}/ggml-model-f16.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf" - ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0 - (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/jarvis-embedding --model ${model_f16} -p "I believe the meaning of life is" 
) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/jarvis-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log set +e } @@ -752,7 +752,7 @@ function gg_run_rerank_tiny { model_f16="${path_models}/ggml-model-f16.gguf" # for this model, the SEP token is "" - (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?hi\nwhat is panda?it's a bear\nwhat is panda?The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log + (time ./bin/jarvis-embedding --model ${model_f16} -p "what is panda?hi\nwhat is panda?it's a bear\nwhat is panda?The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log # sample output # rerank score 0: 0.029 @@ -804,11 +804,11 @@ function gg_check_build_requirements { ## main -export LLAMA_LOG_PREFIX=1 -export LLAMA_LOG_TIMESTAMPS=1 +export JARVIS_LOG_PREFIX=1 +export JARVIS_LOG_TIMESTAMPS=1 if [ -z ${GG_BUILD_LOW_PERF} ]; then - # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt + # Create symlink: ./jarvis.cpp/models-mnt -> $MNT/models/models-mnt rm -rf ${SRC}/models-mnt mnt_models=${MNT}/models mkdir -p ${mnt_models} @@ -841,7 +841,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then test $ret -eq 0 && gg_run pythia_1_4b else test $ret -eq 0 && gg_run pythia_2_8b - #test $ret -eq 0 && gg_run open_llama_7b_v2 + #test $ret -eq 0 && gg_run open_jarvis_7b_v2 fi test $ret -eq 0 && gg_run ctest_with_model_debug test $ret -eq 0 && gg_run ctest_with_model_release diff --git a/cmake/llama-config.cmake.in b/cmake/jarvis-config.cmake.in similarity index 61% rename from cmake/llama-config.cmake.in rename to cmake/jarvis-config.cmake.in index f072b76a39d2e..a64ac57a49a54 100644 --- a/cmake/llama-config.cmake.in +++ b/cmake/jarvis-config.cmake.in @@ -1,7 +1,7 @@ -set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@) -set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@) -set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) -set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) +set(JARVIS_VERSION @JARVIS_INSTALL_VERSION@) +set(JARVIS_BUILD_COMMIT @JARVIS_BUILD_COMMIT@) +set(JARVIS_BUILD_NUMBER @JARVIS_BUILD_NUMBER@) +set(JARVIS_SHARED_LIB @BUILD_SHARED_LIBS@) set(GGML_BLAS @GGML_BLAS@) set(GGML_CUDA @GGML_CUDA@) @@ -18,9 +18,9 @@ set(GGML_OPENMP @GGML_OPENMP@) @PACKAGE_INIT@ -set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@") -set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@") -set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@") +set_and_check(JARVIS_INCLUDE_DIR "@PACKAGE_JARVIS_INCLUDE_INSTALL_DIR@") +set_and_check(JARVIS_LIB_DIR "@PACKAGE_JARVIS_LIB_INSTALL_DIR@") +set_and_check(JARVIS_BIN_DIR "@PACKAGE_JARVIS_BIN_INSTALL_DIR@") # Ensure transient dependencies satisfied @@ -66,25 +66,25 @@ endif() find_library(ggml_LIBRARY ggml REQUIRED - HINTS ${LLAMA_LIB_DIR}) + HINTS ${JARVIS_LIB_DIR}) -find_library(llama_LIBRARY llama +find_library(jarvis_LIBRARY jarvis REQUIRED - HINTS ${LLAMA_LIB_DIR}) + HINTS ${JARVIS_LIB_DIR}) -set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@") -set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@") +set(_jarvis_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@") +set(_jarvis_transient_defines "@GGML_TRANSIENT_DEFINES@") -add_library(llama UNKNOWN IMPORTED) 
+add_library(jarvis UNKNOWN IMPORTED) -set_target_properties(llama +set_target_properties(jarvis PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}" - INTERFACE_LINK_LIBRARIES "${_llama_link_deps}" - INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}" + INTERFACE_INCLUDE_DIRECTORIES "${JARVIS_INCLUDE_DIR}" + INTERFACE_LINK_LIBRARIES "${_jarvis_link_deps}" + INTERFACE_COMPILE_DEFINITIONS "${_jarvis_transient_defines}" IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" - IMPORTED_LOCATION "${llama_LIBRARY}" + IMPORTED_LOCATION "${jarvis_LIBRARY}" INTERFACE_COMPILE_FEATURES cxx_std_11 POSITION_INDEPENDENT_CODE ON ) -check_required_components(Llama) +check_required_components(Jarvis) diff --git a/cmake/llama.pc.in b/cmake/jarvis.pc.in similarity index 62% rename from cmake/llama.pc.in rename to cmake/jarvis.pc.in index 326acbb6108fd..f61f10f3ac073 100644 --- a/cmake/llama.pc.in +++ b/cmake/jarvis.pc.in @@ -3,8 +3,8 @@ exec_prefix=${prefix} libdir=${exec_prefix}/lib includedir=${prefix}/include -Name: llama -Description: Port of Facebook's LLaMA model in C/C++ +Name: jarvis +Description: Port of Facebook's JARVIS model in C/C++ Version: @PROJECT_VERSION@ -Libs: -L${libdir} -lllama +Libs: -L${libdir} -ljarvis Cflags: -I${includedir} diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 042e895add5e2..cfaa05b33ab72 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -74,17 +74,17 @@ if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() -set(LLAMA_COMMON_EXTRA_LIBS build_info) +set(JARVIS_COMMON_EXTRA_LIBS build_info) # Use curl to download model url -if (LLAMA_CURL) +if (JARVIS_CURL) find_package(CURL REQUIRED) - add_definitions(-DLLAMA_USE_CURL) + add_definitions(-DJARVIS_USE_CURL) include_directories(${CURL_INCLUDE_DIRS}) find_library(CURL_LIBRARY curl REQUIRED) - set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY}) + set(JARVIS_COMMON_EXTRA_LIBS ${JARVIS_COMMON_EXTRA_LIBS} ${CURL_LIBRARY}) endif () target_include_directories(${TARGET} PUBLIC .) 
target_compile_features (${TARGET} PUBLIC cxx_std_11) -target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) +target_link_libraries (${TARGET} PRIVATE ${JARVIS_COMMON_EXTRA_LIBS} PUBLIC jarvis Threads::Threads) diff --git a/common/arg.cpp b/common/arg.cpp index e1e933934f0ef..73a3542593ca2 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -17,7 +17,7 @@ using json = nlohmann::ordered_json; -common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) { +common_arg & common_arg::set_examples(std::initializer_list<enum jarvis_example> examples) { this->examples = std::move(examples); return *this; } @@ -33,7 +33,7 @@ common_arg & common_arg::set_sparam() { return *this; } -bool common_arg::in_example(enum llama_example ex) { +bool common_arg::in_example(enum jarvis_example ex) { return examples.find(ex) != examples.end(); } @@ -279,7 +279,7 @@ static void common_params_print_usage(common_params_context & ctx_arg) { std::vector<common_arg *> sparam_options; std::vector<common_arg *> specific_options; for (auto & opt : ctx_arg.options) { - // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example + // in case multiple JARVIS_EXAMPLE_* are set, we prioritize the JARVIS_EXAMPLE_* matching current example if (opt.is_sparam) { sparam_options.push_back(&opt); } else if (opt.in_example(ctx_arg.ex)) { @@ -292,12 +292,12 @@ static void common_params_print_usage(common_params_context & ctx_arg) { print_options(common_options); printf("\n\n----- sampling params -----\n\n"); print_options(sparam_options); - // TODO: maybe convert enum llama_example to string + // TODO: maybe convert enum jarvis_example to string printf("\n\n----- example-specific params -----\n\n"); print_options(specific_options); } -bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) { +bool common_params_parse(int argc, char ** argv, common_params & params, jarvis_example ex, void(*print_usage)(int, char **)) { auto ctx_arg = common_params_parser_init(params, ex, print_usage); const common_params params_org = ctx_arg.params; // the example can modify the default params @@ -322,7 +322,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e return true; } -common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) { +common_params_context common_params_parser_init(common_params & params, jarvis_example ex, void(*print_usage)(int, char **)) { common_params_context ctx_arg(params); ctx_arg.print_usage = print_usage; ctx_arg.ex = ex; @@ -339,12 +339,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex /** * filter options by example * rules: - * - all examples inherit options from LLAMA_EXAMPLE_COMMON - * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example - * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example + * - all examples inherit options from JARVIS_EXAMPLE_COMMON + * - if JARVIS_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example + * - if both {JARVIS_EXAMPLE_COMMON, JARVIS_EXAMPLE_*,} are set, we will prioritize the JARVIS_EXAMPLE_* matching current example */ auto add_opt = [&](common_arg arg) { - if (arg.in_example(ex) ||
arg.in_example(JARVIS_EXAMPLE_COMMON)) { ctx_arg.options.push_back(std::move(arg)); } }; @@ -361,8 +361,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--version"}, "show version and build info", [](common_params &) { - fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + fprintf(stderr, "version: %d (%s)\n", JARVIS_BUILD_NUMBER, JARVIS_COMMIT); + fprintf(stderr, "built with %s for %s\n", JARVIS_COMPILER, JARVIS_BUILD_TARGET); exit(0); } )); @@ -379,14 +379,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.display_prompt = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"-co", "--color"}, string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), [](common_params & params) { params.use_color = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); + ).set_examples({JARVIS_EXAMPLE_MAIN, JARVIS_EXAMPLE_INFILL, JARVIS_EXAMPLE_SPECULATIVE, JARVIS_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-t", "--threads"}, "N", string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), @@ -396,7 +396,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.cpuparams.n_threads = std::thread::hardware_concurrency(); } } - ).set_env("LLAMA_ARG_THREADS")); + ).set_env("JARVIS_ARG_THREADS")); add_opt(common_arg( {"-tb", "--threads-batch"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads)", @@ -416,7 +416,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-tbd", "--threads-batch-draft"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)", @@ -426,7 +426,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-C", "--cpu-mask"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", @@ -524,7 +524,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex throw std::invalid_argument("invalid cpumask"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-Crd", "--cpu-range-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", @@ -534,14 +534,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex throw std::invalid_argument("invalid range"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--cpu-strict-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: same as --cpu-strict)", [](common_params & params, int value) { params.draft_cpuparams.strict_cpu = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--prio-draft"}, "N", string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), @@ -551,14 +551,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } params.draft_cpuparams.priority = (enum ggml_sched_priority) prio; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--poll-draft"}, "<0|1>", "Use polling to wait for draft model work (default: same as --poll])", [](common_params & params, int value) { params.draft_cpuparams.poll = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-Cbd", "--cpu-mask-batch-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", @@ -568,7 +568,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex throw std::invalid_argument("invalid cpumask"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)", @@ -578,14 +578,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex throw std::invalid_argument("invalid cpumask"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--cpu-strict-batch-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: --cpu-strict-draft)", [](common_params & params, int value) { params.draft_cpuparams_batch.strict_cpu = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--prio-batch-draft"}, "N", string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), @@ -595,70 +595,70 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--poll-batch-draft"}, "<0|1>", "Use polling to wait for draft model work (default: --poll-draft)", [](common_params & params, int value) { params.draft_cpuparams_batch.poll = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--draft"}, "N", string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), [](common_params & params, int value) { params.n_draft = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE, JARVIS_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-ps", "--p-split"}, "N", string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split), [](common_params & params, const std::string & value) { params.p_split = std::stof(value); } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-lcs", "--lookup-cache-static"}, "FNAME", "path to static lookup cache to use for lookup decoding (not updated by generation)", [](common_params & params, const std::string & value) { params.lookup_cache_static = value; } - ).set_examples({LLAMA_EXAMPLE_LOOKUP})); + ).set_examples({JARVIS_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-lcd", "--lookup-cache-dynamic"}, "FNAME", "path to dynamic lookup cache to use for lookup decoding (updated by generation)", [](common_params & params, const std::string & value) { params.lookup_cache_dynamic = value; } - ).set_examples({LLAMA_EXAMPLE_LOOKUP})); + ).set_examples({JARVIS_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-c", "--ctx-size"}, "N", string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), [](common_params & params, int value) { params.n_ctx = value; } - ).set_env("LLAMA_ARG_CTX_SIZE")); + ).set_env("JARVIS_ARG_CTX_SIZE")); add_opt(common_arg( {"-n", "--predict", "--n-predict"}, "N", string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), [](common_params & params, int value) { params.n_predict = value; } - ).set_env("LLAMA_ARG_N_PREDICT")); + ).set_env("JARVIS_ARG_N_PREDICT")); add_opt(common_arg( {"-b", "--batch-size"}, "N", string_format("logical maximum batch size (default: %d)", params.n_batch), [](common_params & params, int value) { params.n_batch = value; } - ).set_env("LLAMA_ARG_BATCH")); 
+ ).set_env("JARVIS_ARG_BATCH")); add_opt(common_arg( {"-ub", "--ubatch-size"}, "N", string_format("physical maximum batch size (default: %d)", params.n_ubatch), [](common_params & params, int value) { params.n_ubatch = value; } - ).set_env("LLAMA_ARG_UBATCH")); + ).set_env("JARVIS_ARG_UBATCH")); add_opt(common_arg( {"--keep"}, "N", string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), @@ -672,24 +672,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.ctx_shift = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); + ).set_examples({JARVIS_EXAMPLE_MAIN, JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_NO_CONTEXT_SHIFT")); add_opt(common_arg( {"--chunks"}, "N", string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), [](common_params & params, int value) { params.n_chunks = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); + ).set_examples({JARVIS_EXAMPLE_IMATRIX, JARVIS_EXAMPLE_PERPLEXITY, JARVIS_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"-fa", "--flash-attn"}, string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), [](common_params & params) { params.flash_attn = true; } - ).set_env("LLAMA_ARG_FLASH_ATTN")); + ).set_env("JARVIS_ARG_FLASH_ATTN")); add_opt(common_arg( {"-p", "--prompt"}, "PROMPT", - ex == LLAMA_EXAMPLE_MAIN + ex == JARVIS_EXAMPLE_MAIN ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" : "prompt to start generation with", [](common_params & params, const std::string & value) { @@ -698,12 +698,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"--no-perf"}, - string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), + string_format("disable internal libjarvis performance timings (default: %s)", params.no_perf ? 
"true" : "false"), [](common_params & params) { params.no_perf = true; params.sparams.no_perf = true; } - ).set_env("LLAMA_ARG_NO_PERF")); + ).set_env("JARVIS_ARG_NO_PERF")); add_opt(common_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", @@ -730,7 +730,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } params.in_files.push_back(value); } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + ).set_examples({JARVIS_EXAMPLE_IMATRIX})); add_opt(common_arg( {"-bf", "--binary-file"}, "FNAME", "binary file containing the prompt (default: none)", @@ -767,42 +767,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.n_print = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache"}, "FNAME", "file to cache prompt state for faster startup (default: none)", [](common_params & params, const std::string & value) { params.path_prompt_cache = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache-all"}, "if specified, saves user input and generations to cache as well\n", [](common_params & params) { params.prompt_cache_all = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache-ro"}, "if specified, uses the prompt cache but does not update it", [](common_params & params) { params.prompt_cache_ro = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"-r", "--reverse-prompt"}, "PROMPT", "halt generation at PROMPT, return control in interactive mode\n", [](common_params & params, const std::string & value) { params.antiprompt.emplace_back(value); } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"-sp", "--special"}, string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), [](common_params & params) { params.special = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); + ).set_examples({JARVIS_EXAMPLE_MAIN, JARVIS_EXAMPLE_SERVER})); add_opt(common_arg( {"-cnv", "--conversation"}, string_format( @@ -815,28 +815,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.conversation = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"-i", "--interactive"}, string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), [](common_params & params) { params.interactive = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"-if", "--interactive-first"}, string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? 
"true" : "false"), [](common_params & params) { params.interactive_first = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"-mli", "--multiline-input"}, "allows you to write or paste multiple lines without ending each in '\\'", [](common_params & params) { params.multiline_input = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"--in-prefix-bos"}, "prefix BOS to user inputs, preceding the `--in-prefix` string", @@ -844,7 +844,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.input_prefix_bos = true; params.enable_chat_template = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"--in-prefix"}, "STRING", "string to prefix user inputs with (default: empty)", @@ -852,7 +852,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.input_prefix = value; params.enable_chat_template = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + ).set_examples({JARVIS_EXAMPLE_MAIN, JARVIS_EXAMPLE_INFILL})); add_opt(common_arg( {"--in-suffix"}, "STRING", "string to suffix after user inputs with (default: empty)", @@ -860,14 +860,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.input_suffix = value; params.enable_chat_template = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + ).set_examples({JARVIS_EXAMPLE_MAIN, JARVIS_EXAMPLE_INFILL})); add_opt(common_arg( {"--no-warmup"}, "skip warming up the model with an empty run", [](common_params & params) { params.warmup = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"--spm-infill"}, string_format( @@ -877,7 +877,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.spm_infill = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); + ).set_examples({JARVIS_EXAMPLE_SERVER, JARVIS_EXAMPLE_INFILL})); add_opt(common_arg( {"--samplers"}, "SAMPLERS", string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), @@ -888,7 +888,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"-s", "--seed"}, "SEED", - string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED), + string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, JARVIS_DEFAULT_SEED), [](common_params & params, const std::string & value) { params.sparams.seed = std::stoul(value); } @@ -1101,7 +1101,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", [](common_params & params, const std::string & value) { std::stringstream ss(value); - llama_token key; + jarvis_token key; char sign; std::string value_str; try { @@ -1149,103 +1149,103 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--pooling"}, "{none,mean,cls,last,rank}", "pooling type for embeddings, use model default if unspecified", [](common_params & params, const std::string & value) { - /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } - else if (value == "mean") { 
params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } - else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } - else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; } + /**/ if (value == "none") { params.pooling_type = JARVIS_POOLING_TYPE_NONE; } + else if (value == "mean") { params.pooling_type = JARVIS_POOLING_TYPE_MEAN; } + else if (value == "cls") { params.pooling_type = JARVIS_POOLING_TYPE_CLS; } + else if (value == "last") { params.pooling_type = JARVIS_POOLING_TYPE_LAST; } + else if (value == "rank") { params.pooling_type = JARVIS_POOLING_TYPE_RANK; } else { throw std::invalid_argument("invalid value"); } } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING")); + ).set_examples({JARVIS_EXAMPLE_EMBEDDING, JARVIS_EXAMPLE_RETRIEVAL, JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_POOLING")); add_opt(common_arg( {"--attention"}, "{causal,non-causal}", "attention type for embeddings, use model default if unspecified", [](common_params & params, const std::string & value) { - /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } - else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } + /**/ if (value == "causal") { params.attention_type = JARVIS_ATTENTION_TYPE_CAUSAL; } + else if (value == "non-causal") { params.attention_type = JARVIS_ATTENTION_TYPE_NON_CAUSAL; } else { throw std::invalid_argument("invalid value"); } } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + ).set_examples({JARVIS_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--rope-scaling"}, "{none,linear,yarn}", "RoPE frequency scaling method, defaults to linear unless specified by the model", [](common_params & params, const std::string & value) { - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = JARVIS_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = JARVIS_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = JARVIS_ROPE_SCALING_TYPE_YARN; } else { throw std::invalid_argument("invalid value"); } } - ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE")); + ).set_env("JARVIS_ARG_ROPE_SCALING_TYPE")); add_opt(common_arg( {"--rope-scale"}, "N", "RoPE context scaling factor, expands context by a factor of N", [](common_params & params, const std::string & value) { params.rope_freq_scale = 1.0f / std::stof(value); } - ).set_env("LLAMA_ARG_ROPE_SCALE")); + ).set_env("JARVIS_ARG_ROPE_SCALE")); add_opt(common_arg( {"--rope-freq-base"}, "N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", [](common_params & params, const std::string & value) { params.rope_freq_base = std::stof(value); } - ).set_env("LLAMA_ARG_ROPE_FREQ_BASE")); + ).set_env("JARVIS_ARG_ROPE_FREQ_BASE")); add_opt(common_arg( {"--rope-freq-scale"}, "N", "RoPE frequency scaling factor, expands context by a factor of 1/N", [](common_params & params, const std::string & value) { params.rope_freq_scale = std::stof(value); } - ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE")); + ).set_env("JARVIS_ARG_ROPE_FREQ_SCALE")); add_opt(common_arg( {"--yarn-orig-ctx"}, "N", 
string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), [](common_params & params, int value) { params.yarn_orig_ctx = value; } - ).set_env("LLAMA_ARG_YARN_ORIG_CTX")); + ).set_env("JARVIS_ARG_YARN_ORIG_CTX")); add_opt(common_arg( {"--yarn-ext-factor"}, "N", string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), [](common_params & params, const std::string & value) { params.yarn_ext_factor = std::stof(value); } - ).set_env("LLAMA_ARG_YARN_EXT_FACTOR")); + ).set_env("JARVIS_ARG_YARN_EXT_FACTOR")); add_opt(common_arg( {"--yarn-attn-factor"}, "N", string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), [](common_params & params, const std::string & value) { params.yarn_attn_factor = std::stof(value); } - ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR")); + ).set_env("JARVIS_ARG_YARN_ATTN_FACTOR")); add_opt(common_arg( {"--yarn-beta-slow"}, "N", string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), [](common_params & params, const std::string & value) { params.yarn_beta_slow = std::stof(value); } - ).set_env("LLAMA_ARG_YARN_BETA_SLOW")); + ).set_env("JARVIS_ARG_YARN_BETA_SLOW")); add_opt(common_arg( {"--yarn-beta-fast"}, "N", string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), [](common_params & params, const std::string & value) { params.yarn_beta_fast = std::stof(value); } - ).set_env("LLAMA_ARG_YARN_BETA_FAST")); + ).set_env("JARVIS_ARG_YARN_BETA_FAST")); add_opt(common_arg( {"-gan", "--grp-attn-n"}, "N", string_format("group-attention factor (default: %d)", params.grp_attn_n), [](common_params & params, int value) { params.grp_attn_n = value; } - ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY})); + ).set_env("JARVIS_ARG_GRP_ATTN_N").set_examples({JARVIS_EXAMPLE_MAIN, JARVIS_EXAMPLE_PASSKEY})); add_opt(common_arg( {"-gaw", "--grp-attn-w"}, "N", string_format("group-attention width (default: %d)", params.grp_attn_w), [](common_params & params, int value) { params.grp_attn_w = value; } - ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_env("JARVIS_ARG_GRP_ATTN_W").set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"-dkvc", "--dump-kv-cache"}, "verbose print of the KV cache", @@ -1259,7 +1259,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.no_kv_offload = true; } - ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); + ).set_env("JARVIS_ARG_NO_KV_OFFLOAD")); add_opt(common_arg( {"-ctk", "--cache-type-k"}, "TYPE", string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), @@ -1267,7 +1267,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex // TODO: get the type right here params.cache_type_k = value; } - ).set_env("LLAMA_ARG_CACHE_TYPE_K")); + ).set_env("JARVIS_ARG_CACHE_TYPE_K")); add_opt(common_arg( {"-ctv", "--cache-type-v"}, "TYPE", string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), @@ -1275,141 +1275,141 @@ common_params_context common_params_parser_init(common_params & params, llama_ex // TODO: get the type right here params.cache_type_v = value; } - ).set_env("LLAMA_ARG_CACHE_TYPE_V")); + ).set_env("JARVIS_ARG_CACHE_TYPE_V")); add_opt(common_arg( {"--perplexity", "--all-logits"}, 
string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), [](common_params & params) { params.logits_all = true; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--hellaswag"}, "compute HellaSwag score over random tasks from datafile supplied with -f", [](common_params & params) { params.hellaswag = true; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--hellaswag-tasks"}, "N", string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), [](common_params & params, int value) { params.hellaswag_tasks = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--winogrande"}, "compute Winogrande score over random tasks from datafile supplied with -f", [](common_params & params) { params.winogrande = true; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--winogrande-tasks"}, "N", string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), [](common_params & params, int value) { params.winogrande_tasks = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--multiple-choice"}, "compute multiple choice score over random tasks from datafile supplied with -f", [](common_params & params) { params.multiple_choice = true; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--multiple-choice-tasks"}, "N", string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), [](common_params & params, int value) { params.multiple_choice_tasks = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--kl-divergence"}, "computes KL-divergence to logits provided via --kl-divergence-base", [](common_params & params) { params.kl_divergence = true; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--save-all-logits", "--kl-divergence-base"}, "FNAME", "set logits file", [](common_params & params, const std::string & value) { params.logits_file = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--ppl-stride"}, "N", string_format("stride for perplexity calculation (default: %d)", params.ppl_stride), [](common_params & params, int value) { params.ppl_stride = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--ppl-output-type"}, "<0|1>", string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type), [](common_params & params, int value) { params.ppl_output_type = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"-dt", "--defrag-thold"}, "N", string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), [](common_params & params, const std::string & value) { params.defrag_thold = std::stof(value); } - ).set_env("LLAMA_ARG_DEFRAG_THOLD")); 
+ ).set_env("JARVIS_ARG_DEFRAG_THOLD")); add_opt(common_arg( {"-np", "--parallel"}, "N", string_format("number of parallel sequences to decode (default: %d)", params.n_parallel), [](common_params & params, int value) { params.n_parallel = value; } - ).set_env("LLAMA_ARG_N_PARALLEL")); + ).set_env("JARVIS_ARG_N_PARALLEL")); add_opt(common_arg( {"-ns", "--sequences"}, "N", string_format("number of sequences to decode (default: %d)", params.n_sequences), [](common_params & params, int value) { params.n_sequences = value; } - ).set_examples({LLAMA_EXAMPLE_PARALLEL})); + ).set_examples({JARVIS_EXAMPLE_PARALLEL})); add_opt(common_arg( {"-cb", "--cont-batching"}, string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), [](common_params & params) { params.cont_batching = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_CONT_BATCHING")); add_opt(common_arg( {"-nocb", "--no-cont-batching"}, "disable continuous batching", [](common_params & params) { params.cont_batching = false; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_NO_CONT_BATCHING")); add_opt(common_arg( {"--mmproj"}, "FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md", [](common_params & params, const std::string & value) { params.mmproj = value; } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); + ).set_examples({JARVIS_EXAMPLE_LLAVA})); add_opt(common_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching", [](common_params & params, const std::string & value) { params.image.emplace_back(value); } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); - if (llama_supports_rpc()) { + ).set_examples({JARVIS_EXAMPLE_LLAVA})); + if (jarvis_supports_rpc()) { add_opt(common_arg( {"--rpc"}, "SERVERS", "comma separated list of RPC servers", [](common_params & params, const std::string & value) { params.rpc_servers = value; } - ).set_env("LLAMA_ARG_RPC")); + ).set_env("JARVIS_ARG_RPC")); } add_opt(common_arg( {"--mlock"}, @@ -1417,14 +1417,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.use_mlock = true; } - ).set_env("LLAMA_ARG_MLOCK")); + ).set_env("JARVIS_ARG_MLOCK")); add_opt(common_arg( {"--no-mmap"}, "do not memory-map model (slower load but may reduce pageouts if not using mlock)", [](common_params & params) { params.use_mmap = false; } - ).set_env("LLAMA_ARG_NO_MMAP")); + ).set_env("JARVIS_ARG_NO_MMAP")); add_opt(common_arg( {"--numa"}, "TYPE", "attempt optimizations that help on some NUMA systems\n" @@ -1432,36 +1432,36 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "- isolate: only spawn threads on CPUs on the node that execution started on\n" "- numactl: use the CPU map provided by numactl\n" "if run without this previously, it is recommended to drop the system page cache before using this\n" - "see https://github.com/ggerganov/llama.cpp/issues/1437", + "see https://github.com/ggerganov/jarvis.cpp/issues/1437", [](common_params & params, const std::string & value) { /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } else if (value == "numactl") { params.numa = 
GGML_NUMA_STRATEGY_NUMACTL; } else { throw std::invalid_argument("invalid value"); } } - ).set_env("LLAMA_ARG_NUMA")); + ).set_env("JARVIS_ARG_NUMA")); add_opt(common_arg( {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", "number of layers to store in VRAM", [](common_params & params, int value) { params.n_gpu_layers = value; - if (!llama_supports_gpu_offload()) { + if (!jarvis_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } } - ).set_env("LLAMA_ARG_N_GPU_LAYERS")); + ).set_env("JARVIS_ARG_N_GPU_LAYERS")); add_opt(common_arg( {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", [](common_params & params, int value) { params.n_gpu_layers_draft = value; - if (!llama_supports_gpu_offload()) { + if (!jarvis_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-sm", "--split-mode"}, "{none,layer,row}", "how to split the model across multiple GPUs, one of:\n" @@ -1471,23 +1471,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { std::string arg_next = value; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; + params.split_mode = JARVIS_SPLIT_MODE_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; + params.split_mode = JARVIS_SPLIT_MODE_LAYER; } else if (arg_next == "row") { #ifdef GGML_USE_SYCL - fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); + fprintf(stderr, "warning: The split mode value:[row] is not supported by jarvis.cpp with SYCL. It's developing.\nExit!\n"); exit(1); #endif // GGML_USE_SYCL - params.split_mode = LLAMA_SPLIT_MODE_ROW; + params.split_mode = JARVIS_SPLIT_MODE_ROW; } else { throw std::invalid_argument("invalid value"); } - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n"); + if (!jarvis_supports_gpu_offload()) { + fprintf(stderr, "warning: jarvis.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n"); } } - ).set_env("LLAMA_ARG_SPLIT_MODE")); + ).set_env("JARVIS_ARG_SPLIT_MODE")); add_opt(common_arg( {"-ts", "--tensor-split"}, "N0,N1,N2,...", "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 
3,1", @@ -1498,33 +1498,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex const std::regex regex{ R"([,/]+)" }; std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; std::vector split_arg{ it, {} }; - if (split_arg.size() >= llama_max_devices()) { + if (split_arg.size() >= jarvis_max_devices()) { throw std::invalid_argument( - string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) + string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)jarvis_max_devices()) ); } - for (size_t i = 0; i < llama_max_devices(); ++i) { + for (size_t i = 0; i < jarvis_max_devices(); ++i) { if (i < split_arg.size()) { params.tensor_split[i] = std::stof(split_arg[i]); } else { params.tensor_split[i] = 0.0f; } } - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n"); + if (!jarvis_supports_gpu_offload()) { + fprintf(stderr, "warning: jarvis.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n"); } } - ).set_env("LLAMA_ARG_TENSOR_SPLIT")); + ).set_env("JARVIS_ARG_TENSOR_SPLIT")); add_opt(common_arg( {"-mg", "--main-gpu"}, "INDEX", string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), [](common_params & params, int value) { params.main_gpu = value; - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n"); + if (!jarvis_supports_gpu_offload()) { + fprintf(stderr, "warning: jarvis.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n"); } } - ).set_env("LLAMA_ARG_MAIN_GPU")); + ).set_env("JARVIS_ARG_MAIN_GPU")); add_opt(common_arg( {"--check-tensors"}, string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? 
"true" : "false"), @@ -1549,7 +1549,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.lora_adapters.push_back({ std::string(value), 1.0 }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + ).set_examples({JARVIS_EXAMPLE_COMMON, JARVIS_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"--lora-scaled"}, "FNAME", "SCALE", "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", @@ -1557,7 +1557,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.lora_adapters.push_back({ fname, std::stof(scale) }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + ).set_examples({JARVIS_EXAMPLE_COMMON, JARVIS_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"--control-vector"}, "FNAME", "add a control vector\nnote: this argument can be repeated to add multiple control vectors", @@ -1587,10 +1587,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.model_alias = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_ALIAS")); add_opt(common_arg( {"-m", "--model"}, "FNAME", - ex == LLAMA_EXAMPLE_EXPORT_LORA + ex == JARVIS_EXAMPLE_EXPORT_LORA ? std::string("model path from which to load base model") : string_format( "model path (default: `models/$filename` with filename from `--hf-file` " @@ -1599,35 +1599,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.model = value; } - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); + ).set_examples({JARVIS_EXAMPLE_COMMON, JARVIS_EXAMPLE_EXPORT_LORA}).set_env("JARVIS_ARG_MODEL")); add_opt(common_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", [](common_params & params, const std::string & value) { params.model_draft = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", [](common_params & params, const std::string & value) { params.model_url = value; } - ).set_env("LLAMA_ARG_MODEL_URL")); + ).set_env("JARVIS_ARG_MODEL_URL")); add_opt(common_arg( {"-hfr", "--hf-repo"}, "REPO", "Hugging Face model repository (default: unused)", [](common_params & params, const std::string & value) { params.hf_repo = value; } - ).set_env("LLAMA_ARG_HF_REPO")); + ).set_env("JARVIS_ARG_HF_REPO")); add_opt(common_arg( {"-hff", "--hf-file"}, "FILE", "Hugging Face model file (default: unused)", [](common_params & params, const std::string & value) { params.hf_file = value; } - ).set_env("LLAMA_ARG_HF_FILE")); + ).set_env("JARVIS_ARG_HF_FILE")); add_opt(common_arg( {"-hft", "--hf-token"}, "TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)", @@ -1645,41 +1645,41 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } params.context_files.push_back(value); } - 
).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + ).set_examples({JARVIS_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--chunk-size"}, "N", string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size), [](common_params & params, int value) { params.chunk_size = value; } - ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + ).set_examples({JARVIS_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--chunk-separator"}, "STRING", string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), [](common_params & params, const std::string & value) { params.chunk_separator = value; } - ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + ).set_examples({JARVIS_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--junk"}, "N", string_format("number of times to repeat the junk text (default: %d)", params.n_junk), [](common_params & params, int value) { params.n_junk = value; } - ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + ).set_examples({JARVIS_EXAMPLE_PASSKEY})); add_opt(common_arg( {"--pos"}, "N", string_format("position of the passkey in the junk text (default: %d)", params.i_pos), [](common_params & params, int value) { params.i_pos = value; } - ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + ).set_examples({JARVIS_EXAMPLE_PASSKEY})); add_opt(common_arg( {"-o", "--output", "--output-file"}, "FNAME", string_format("output file (default: '%s')", - ex == LLAMA_EXAMPLE_EXPORT_LORA + ex == JARVIS_EXAMPLE_EXPORT_LORA ? params.lora_outfile.c_str() - : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR + : ex == JARVIS_EXAMPLE_CVECTOR_GENERATOR ? params.cvector_outfile.c_str() : params.out_file.c_str()), [](common_params & params, const std::string & value) { @@ -1687,49 +1687,49 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.cvector_outfile = value; params.lora_outfile = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); + ).set_examples({JARVIS_EXAMPLE_IMATRIX, JARVIS_EXAMPLE_CVECTOR_GENERATOR, JARVIS_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), [](common_params & params, int value) { params.n_out_freq = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + ).set_examples({JARVIS_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--save-frequency"}, "N", string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), [](common_params & params, int value) { params.n_save_freq = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + ).set_examples({JARVIS_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--process-output"}, string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), [](common_params & params) { params.process_output = true; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + ).set_examples({JARVIS_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--no-ppl"}, string_format("do not compute perplexity (default: %s)", params.compute_ppl ? 
"true" : "false"), [](common_params & params) { params.compute_ppl = false; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + ).set_examples({JARVIS_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--chunk", "--from-chunk"}, "N", string_format("start processing the input from chunk N (default: %d)", params.i_chunk), [](common_params & params, int value) { params.i_chunk = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + ).set_examples({JARVIS_EXAMPLE_IMATRIX})); add_opt(common_arg( {"-pps"}, string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), [](common_params & params) { params.is_pp_shared = true; } - ).set_examples({LLAMA_EXAMPLE_BENCH})); + ).set_examples({JARVIS_EXAMPLE_BENCH})); add_opt(common_arg( {"-npp"}, "n0,n1,...", "number of prompt tokens", @@ -1737,7 +1737,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex auto p = string_split(value, ','); params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); } - ).set_examples({LLAMA_EXAMPLE_BENCH})); + ).set_examples({JARVIS_EXAMPLE_BENCH})); add_opt(common_arg( {"-ntg"}, "n0,n1,...", "number of text generation tokens", @@ -1745,7 +1745,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex auto p = string_split(value, ','); params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); } - ).set_examples({LLAMA_EXAMPLE_BENCH})); + ).set_examples({JARVIS_EXAMPLE_BENCH})); add_opt(common_arg( {"-npl"}, "n0,n1,...", "number of parallel prompts", @@ -1753,70 +1753,70 @@ common_params_context common_params_parser_init(common_params & params, llama_ex auto p = string_split(value, ','); params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); } - ).set_examples({LLAMA_EXAMPLE_BENCH})); + ).set_examples({JARVIS_EXAMPLE_BENCH})); add_opt(common_arg( {"--embd-normalize"}, "N", string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), [](common_params & params, int value) { params.embd_normalize = value; } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + ).set_examples({JARVIS_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--embd-output-format"}, "FORMAT", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", [](common_params & params, const std::string & value) { params.embd_out = value; } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + ).set_examples({JARVIS_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--embd-separator"}, "STRING", "separator of embeddings (default \\n) for example \"<#sep#>\"", [](common_params & params, const std::string & value) { params.embd_sep = value; } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + ).set_examples({JARVIS_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--host"}, "HOST", string_format("ip address to listen (default: %s)", params.hostname.c_str()), [](common_params & params, const std::string & value) { params.hostname = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_HOST")); add_opt(common_arg( {"--port"}, "PORT", string_format("port to listen (default: %d)", params.port), [](common_params & params, int value) { params.port = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_PORT")); add_opt(common_arg( {"--path"}, "PATH", string_format("path to 
serve static files from (default: %s)", params.public_path.c_str()), [](common_params & params, const std::string & value) { params.public_path = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_STATIC_PATH")); add_opt(common_arg( {"--embedding", "--embeddings"}, string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), [](common_params & params) { params.embedding = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_EMBEDDINGS")); add_opt(common_arg( {"--reranking", "--rerank"}, string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"), [](common_params & params) { params.reranking = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_RERANKING")); add_opt(common_arg( {"--api-key"}, "KEY", "API key to use for authentication (default: none)", [](common_params & params, const std::string & value) { params.api_keys.push_back(value); } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_API_KEY")); add_opt(common_arg( {"--api-key-file"}, "FNAME", "path to file containing API keys (default: none)", @@ -1833,21 +1833,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } key_file.close(); } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({JARVIS_EXAMPLE_SERVER})); add_opt(common_arg( {"--ssl-key-file"}, "FNAME", "path to file a PEM-encoded SSL private key", [](common_params & params, const std::string & value) { params.ssl_file_key = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_SSL_KEY_FILE")); add_opt(common_arg( {"--ssl-cert-file"}, "FNAME", "path to file a PEM-encoded SSL certificate", [](common_params & params, const std::string & value) { params.ssl_file_cert = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_SSL_CERT_FILE")); add_opt(common_arg( {"-to", "--timeout"}, "N", string_format("server read/write timeout in seconds (default: %d)", params.timeout_read), @@ -1855,49 +1855,49 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.timeout_read = value; params.timeout_write = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_TIMEOUT")); add_opt(common_arg( {"--threads-http"}, "N", string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), [](common_params & params, int value) { params.n_threads_http = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_THREADS_HTTP")); add_opt(common_arg( {"--cache-reuse"}, "N", string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse), [](common_params & params, int value) { params.n_cache_reuse = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE")); 
+ ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_CACHE_REUSE")); add_opt(common_arg( {"--metrics"}, string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), [](common_params & params) { params.endpoint_metrics = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_ENDPOINT_METRICS")); add_opt(common_arg( {"--slots"}, string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), [](common_params & params) { params.endpoint_slots = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_ENDPOINT_SLOTS")); add_opt(common_arg( {"--props"}, string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), [](common_params & params) { params.endpoint_props = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_ENDPOINT_PROPS")); add_opt(common_arg( {"--no-slots"}, "disables slots monitoring endpoint", [](common_params & params) { params.endpoint_slots = false; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_NO_ENDPOINT_SLOTS")); add_opt(common_arg( {"--slot-save-path"}, "PATH", "path to save slot kv cache (default: disabled)", @@ -1908,44 +1908,44 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.slot_save_path += DIRECTORY_SEPARATOR; } } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({JARVIS_EXAMPLE_SERVER})); add_opt(common_arg( {"--chat-template"}, "JINJA_TEMPLATE", "set custom jinja chat template (default: template taken from model's metadata)\n" "if suffix/prefix are specified, template will be disabled\n" - "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", + "only commonly used templates are accepted:\nhttps://github.com/ggerganov/jarvis.cpp/wiki/Templates-supported-by-jarvis_chat_apply_template", [](common_params & params, const std::string & value) { if (!common_chat_verify_template(value)) { throw std::runtime_error(string_format( "error: the supplied chat template is not supported: %s\n" - "note: llama.cpp does not use jinja parser, we only support commonly used templates\n", + "note: jarvis.cpp does not use jinja parser, we only support commonly used templates\n", value.c_str() )); } params.chat_template = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); + ).set_examples({JARVIS_EXAMPLE_MAIN, JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_CHAT_TEMPLATE")); add_opt(common_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), [](common_params & params, const std::string & value) { params.slot_prompt_similarity = std::stof(value); } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({JARVIS_EXAMPLE_SERVER})); add_opt(common_arg( {"--lora-init-without-apply"}, string_format("load LoRA adapters without applying them (apply later via POST 
/lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), [](common_params & params) { params.lora_init_without_apply = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({JARVIS_EXAMPLE_SERVER})); add_opt(common_arg( {"--simple-io"}, "use basic IO for better compatibility in subprocesses and limited consoles", [](common_params & params) { params.simple_io = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + ).set_examples({JARVIS_EXAMPLE_MAIN, JARVIS_EXAMPLE_INFILL})); add_opt(common_arg( {"-ld", "--logdir"}, "LOGDIR", "path under which to save YAML logs (no logging if unset)", @@ -1963,28 +1963,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.cvector_positive_file = value; } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + ).set_examples({JARVIS_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--negative-file"}, "FNAME", string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), [](common_params & params, const std::string & value) { params.cvector_negative_file = value; } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + ).set_examples({JARVIS_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--pca-batch"}, "N", string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), [](common_params & params, int value) { params.n_pca_batch = value; } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + ).set_examples({JARVIS_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--pca-iter"}, "N", string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), [](common_params & params, int value) { params.n_pca_iterations = value; } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + ).set_examples({JARVIS_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--method"}, "{pca, mean}", "dimensionality reduction method to be used (default: pca)", @@ -1993,7 +1993,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } else { throw std::invalid_argument("invalid value"); } } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + ).set_examples({JARVIS_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--output-format"}, "{md,jsonl}", "output format for batched-bench results (default: md)", @@ -2002,7 +2002,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex else if (value == "md") { params.batched_bench_output_jsonl = false; } else { std::invalid_argument("invalid value"); } } - ).set_examples({LLAMA_EXAMPLE_BENCH})); + ).set_examples({JARVIS_EXAMPLE_BENCH})); add_opt(common_arg( {"--log-disable"}, "Log disable", @@ -2023,7 +2023,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params &) { common_log_set_colors(common_log_main(), true); } - ).set_env("LLAMA_LOG_COLORS")); + ).set_env("JARVIS_LOG_COLORS")); add_opt(common_arg( {"-v", "--verbose", "--log-verbose"}, "Set verbosity level to infinity (i.e. 
log all messages, useful for debugging)",
@@ -2039,21 +2039,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.verbosity = value;
             common_log_set_verbosity_thold(value);
         }
-    ).set_env("LLAMA_LOG_VERBOSITY"));
+    ).set_env("JARVIS_LOG_VERBOSITY"));
     add_opt(common_arg(
         {"--log-prefix"},
         "Enable prefx in log messages",
         [](common_params &) {
             common_log_set_prefix(common_log_main(), true);
         }
-    ).set_env("LLAMA_LOG_PREFIX"));
+    ).set_env("JARVIS_LOG_PREFIX"));
     add_opt(common_arg(
         {"--log-timestamps"},
         "Enable timestamps in log messages",
         [](common_params &) {
             common_log_set_timestamps(common_log_main(), true);
         }
-    ).set_env("LLAMA_LOG_TIMESTAMPS"));
+    ).set_env("JARVIS_LOG_TIMESTAMPS"));
     return ctx_arg;
 }
diff --git a/common/arg.h b/common/arg.h
index a6700d323cc14..7c6f1eeea3308 100644
--- a/common/arg.h
+++ b/common/arg.h
@@ -11,7 +11,7 @@
 //
 struct common_arg {
-    std::set examples = {LLAMA_EXAMPLE_COMMON};
+    std::set examples = {JARVIS_EXAMPLE_COMMON};
     std::vector args;
     const char * value_hint = nullptr; // help text or example for arg value
     const char * value_hint_2 = nullptr; // for second arg value
@@ -52,17 +52,17 @@ struct common_arg {
         void (*handler)(common_params & params, const std::string &, const std::string &)
     ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
-    common_arg & set_examples(std::initializer_list examples);
+    common_arg & set_examples(std::initializer_list examples);
     common_arg & set_env(const char * env);
     common_arg & set_sparam();
-    bool in_example(enum llama_example ex);
+    bool in_example(enum jarvis_example ex);
     bool get_value_from_env(std::string & output);
     bool has_value_from_env();
     std::string to_string();
 };
 struct common_params_context {
-    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
+    enum jarvis_example ex = JARVIS_EXAMPLE_COMMON;
     common_params & params;
     std::vector options;
     void(*print_usage)(int, char **) = nullptr;
@@ -71,7 +71,7 @@ struct common_params_context {
 // parse input arguments from CLI
 // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
-bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+bool common_params_parse(int argc, char ** argv, common_params & params, jarvis_example ex, void(*print_usage)(int, char **) = nullptr);
 // function to be used by test-arg-parser
-common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+common_params_context common_params_parser_init(common_params & params, jarvis_example ex, void(*print_usage)(int, char **) = nullptr);
diff --git a/common/build-info.cpp.in b/common/build-info.cpp.in
index 0b945aa68fff3..aac4ba7e9e33a 100644
--- a/common/build-info.cpp.in
+++ b/common/build-info.cpp.in
@@ -1,4 +1,4 @@
-int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
-char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
-char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
-char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
+int JARVIS_BUILD_NUMBER = @BUILD_NUMBER@;
+char const *JARVIS_COMMIT = "@BUILD_COMMIT@";
+char const *JARVIS_COMPILER = "@BUILD_COMPILER@";
+char const *JARVIS_BUILD_TARGET = "@BUILD_TARGET@";
diff --git a/common/common.cpp b/common/common.cpp
index ff8cc4076e95d..fa32f671eb6f5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -8,7 +8,7 @@
 #define JSON_ASSERT GGML_ASSERT
 #include
"json.hpp" #include "json-schema-to-grammar.h" -#include "llama.h" +#include "jarvis.h" #include #include @@ -48,7 +48,7 @@ #include #include #endif -#if defined(LLAMA_USE_CURL) +#if defined(JARVIS_USE_CURL) #include #include #include @@ -58,7 +58,7 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -#if defined(LLAMA_USE_CURL) +#if defined(JARVIS_USE_CURL) #ifdef __linux__ #include #elif defined(_WIN32) @@ -66,8 +66,8 @@ #else #include #endif -#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 -#endif // LLAMA_USE_CURL +#define JARVIS_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 +#endif // JARVIS_USE_CURL using json = nlohmann::ordered_json; @@ -364,8 +364,8 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD } void common_init() { - llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) { - if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) { + jarvis_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) { + if (LOG_DEFAULT_JARVIS <= common_log_verbosity_thold) { common_log_add(common_log_main(), level, "%s", text); } }, NULL); @@ -376,7 +376,7 @@ void common_init() { const char * build_type = " (debug)"; #endif - LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type); + LOG_INF("build: %d (%s) with %s for %s%s\n", JARVIS_BUILD_NUMBER, JARVIS_COMMIT, JARVIS_COMPILER, JARVIS_BUILD_TARGET, build_type); } std::string common_params_get_system_info(const common_params & params) { @@ -389,9 +389,9 @@ std::string common_params_get_system_info(const common_params & params) { #if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later // TODO: windows + arm64 + mingw64 DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); - os << " / " << logicalProcessorCount << " | " << llama_print_system_info(); + os << " / " << logicalProcessorCount << " | " << jarvis_print_system_info(); #else - os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info(); + os << " / " << std::thread::hardware_concurrency() << " | " << jarvis_print_system_info(); #endif return os.str(); @@ -483,7 +483,7 @@ std::string string_from(const std::vector & values) { return buf.str(); } -std::string string_from(const struct llama_context * ctx, const std::vector & tokens) { +std::string string_from(const struct jarvis_context * ctx, const std::vector & tokens) { std::stringstream buf; buf << "[ "; @@ -514,7 +514,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector & overrides) { +bool string_parse_kv_override(const char * data, std::vector & overrides) { const char * sep = strchr(data, '='); if (sep == nullptr || sep - data >= 128) { LOG_ERR("%s: malformed KV override '%s'\n", __func__, data); return false; } - llama_model_kv_override kvo; + jarvis_model_kv_override kvo; std::strncpy(kvo.key, data, sep - data); kvo.key[sep - data] = 0; sep++; if (strncmp(sep, "int:", 4) == 0) { sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.tag = JARVIS_KV_OVERRIDE_TYPE_INT; kvo.val_i64 = std::atol(sep); } else if (strncmp(sep, "float:", 6) == 0) { sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; + kvo.tag = JARVIS_KV_OVERRIDE_TYPE_FLOAT; kvo.val_f64 = std::atof(sep); } else if (strncmp(sep, "bool:", 5) == 0) { sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; + kvo.tag = JARVIS_KV_OVERRIDE_TYPE_BOOL; if 
(std::strcmp(sep, "true") == 0) { kvo.val_bool = true; } else if (std::strcmp(sep, "false") == 0) { @@ -617,7 +617,7 @@ bool string_parse_kv_override(const char * data, std::vector 127) { LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); return false; @@ -788,8 +788,8 @@ std::string fs_get_cache_directory() { } return p; }; - if (getenv("LLAMA_CACHE")) { - cache_directory = std::getenv("LLAMA_CACHE"); + if (getenv("JARVIS_CACHE")) { + cache_directory = std::getenv("JARVIS_CACHE"); } else { #ifdef __linux__ if (std::getenv("XDG_CACHE_HOME")) { @@ -803,7 +803,7 @@ std::string fs_get_cache_directory() { cache_directory = std::getenv("LOCALAPPDATA"); #endif // __linux__ cache_directory = ensure_trailing_slash(cache_directory); - cache_directory += "llama.cpp"; + cache_directory += "jarvis.cpp"; } return ensure_trailing_slash(cache_directory); } @@ -824,16 +824,16 @@ std::string fs_get_cache_file(const std::string & filename) { // struct common_init_result common_init_from_params(common_params & params) { common_init_result iparams; - auto mparams = common_model_params_to_llama(params); + auto mparams = common_model_params_to_jarvis(params); - llama_model * model = nullptr; + jarvis_model * model = nullptr; if (!params.hf_repo.empty() && !params.hf_file.empty()) { model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams); } else if (!params.model_url.empty()) { model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams); } else { - model = llama_load_model_from_file(params.model.c_str(), mparams); + model = jarvis_load_model_from_file(params.model.c_str(), mparams); } if (model == NULL) { @@ -844,58 +844,58 @@ struct common_init_result common_init_from_params(common_params & params) { if (params.reranking) { bool ok = true; - if (llama_token_bos(model) == LLAMA_TOKEN_NULL) { + if (jarvis_token_bos(model) == JARVIS_TOKEN_NULL) { LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__); ok = false; } - if (llama_token_eos(model) == LLAMA_TOKEN_NULL) { + if (jarvis_token_eos(model) == JARVIS_TOKEN_NULL) { LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__); ok = false; } - if (llama_token_sep(model) == LLAMA_TOKEN_NULL) { + if (jarvis_token_sep(model) == JARVIS_TOKEN_NULL) { LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__); ok = false; } if (!ok) { - llama_free_model(model); + jarvis_free_model(model); return iparams; } } - auto cparams = common_context_params_to_llama(params); + auto cparams = common_context_params_to_jarvis(params); - llama_context * lctx = llama_new_context_with_model(model, cparams); + jarvis_context * lctx = jarvis_new_context_with_model(model, cparams); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str()); - llama_free_model(model); + jarvis_free_model(model); return iparams; } if (!params.control_vectors.empty()) { if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; - if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model); + if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = jarvis_n_layer(model); const auto cvec = common_control_vector_load(params.control_vectors); if (cvec.n_embd == -1) { - llama_free(lctx); - 
llama_free_model(model); + jarvis_free(lctx); + jarvis_free_model(model); return iparams; } - int err = llama_control_vector_apply(lctx, + int err = jarvis_control_vector_apply(lctx, cvec.data.data(), cvec.data.size(), cvec.n_embd, params.control_vector_layer_start, params.control_vector_layer_end); if (err) { - llama_free(lctx); - llama_free_model(model); + jarvis_free(lctx); + jarvis_free_model(model); return iparams; } @@ -906,11 +906,11 @@ struct common_init_result common_init_from_params(common_params & params) { common_lora_adapter_container loaded_la; loaded_la.path = la.path; loaded_la.scale = la.scale; - loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); + loaded_la.adapter = jarvis_lora_adapter_init(model, la.path.c_str()); if (loaded_la.adapter == nullptr) { LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); - llama_free(lctx); - llama_free_model(model); + jarvis_free(lctx); + jarvis_free_model(model); return iparams; } iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters @@ -919,7 +919,7 @@ struct common_init_result common_init_from_params(common_params & params) { common_lora_adapters_apply(lctx, iparams.lora_adapters); } - if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) { + if (params.sparams.ignore_eos && jarvis_token_eos(model) == JARVIS_TOKEN_NULL) { LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__); params.sparams.ignore_eos = false; } @@ -927,35 +927,35 @@ struct common_init_result common_init_from_params(common_params & params) { if (params.warmup) { LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); - std::vector tmp; - llama_token bos = llama_token_bos(model); - llama_token eos = llama_token_eos(model); + std::vector tmp; + jarvis_token bos = jarvis_token_bos(model); + jarvis_token eos = jarvis_token_eos(model); // some models (e.g. 
T5) don't have a BOS token - if (bos != LLAMA_TOKEN_NULL) { + if (bos != JARVIS_TOKEN_NULL) { tmp.push_back(bos); } - if (eos != LLAMA_TOKEN_NULL) { + if (eos != JARVIS_TOKEN_NULL) { tmp.push_back(eos); } if (tmp.empty()) { tmp.push_back(0); } - if (llama_model_has_encoder(model)) { - llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size())); - llama_token decoder_start_token_id = llama_model_decoder_start_token(model); + if (jarvis_model_has_encoder(model)) { + jarvis_encode(lctx, jarvis_batch_get_one(tmp.data(), tmp.size())); + jarvis_token decoder_start_token_id = jarvis_model_decoder_start_token(model); if (decoder_start_token_id == -1) { decoder_start_token_id = bos; } tmp.clear(); tmp.push_back(decoder_start_token_id); } - if (llama_model_has_decoder(model)) { - llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch))); + if (jarvis_model_has_decoder(model)) { + jarvis_decode(lctx, jarvis_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch))); } - llama_kv_cache_clear(lctx); - llama_synchronize(lctx); - llama_perf_context_reset(lctx); + jarvis_kv_cache_clear(lctx); + jarvis_synchronize(lctx); + jarvis_perf_context_reset(lctx); } iparams.model = model; @@ -964,17 +964,17 @@ struct common_init_result common_init_from_params(common_params & params) { return iparams; } -void common_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters) { - llama_lora_adapter_clear(ctx); +void common_lora_adapters_apply(struct jarvis_context * ctx, std::vector & lora_adapters) { + jarvis_lora_adapter_clear(ctx); for (auto & la : lora_adapters) { if (la.scale != 0.0f) { - llama_lora_adapter_set(ctx, la.adapter, la.scale); + jarvis_lora_adapter_set(ctx, la.adapter, la.scale); } } } -struct llama_model_params common_model_params_to_llama(const common_params & params) { - auto mparams = llama_model_default_params(); +struct jarvis_model_params common_model_params_to_jarvis(const common_params & params) { + auto mparams = jarvis_model_default_params(); if (params.n_gpu_layers != -1) { mparams.n_gpu_layers = params.n_gpu_layers; @@ -1025,8 +1025,8 @@ static ggml_type kv_cache_type_from_str(const std::string & s) { throw std::runtime_error("Unsupported cache type: " + s); } -struct llama_context_params common_context_params_to_llama(const common_params & params) { - auto cparams = llama_context_default_params(); +struct jarvis_context_params common_context_params_to_jarvis(const common_params & params) { + auto cparams = jarvis_context_default_params(); cparams.n_ctx = params.n_ctx; cparams.n_seq_max = params.n_parallel; @@ -1056,7 +1056,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & if (params.reranking) { cparams.embeddings = true; - cparams.pooling_type = LLAMA_POOLING_TYPE_RANK; + cparams.pooling_type = JARVIS_POOLING_TYPE_RANK; } cparams.type_k = kv_cache_type_from_str(params.cache_type_k); @@ -1081,7 +1081,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p return tpp; } -#ifdef LLAMA_USE_CURL +#ifdef JARVIS_USE_CURL #define CURL_MAX_RETRY 3 #define CURL_RETRY_DELAY_SECONDS 2 @@ -1279,7 +1279,7 @@ static bool common_download_file(const std::string & url, const std::string & pa curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L); // helper function to hide password in URL - auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string { + auto jarvis_download_hide_password_in_url = [](const std::string & url) -> std::string 
{ std::size_t protocol_pos = url.find("://"); if (protocol_pos == std::string::npos) { return url; // Malformed URL @@ -1295,7 +1295,7 @@ static bool common_download_file(const std::string & url, const std::string & pa // start the download LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, - llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); + jarvis_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); if (!was_perform_successful) { return false; @@ -1329,11 +1329,11 @@ static bool common_download_file(const std::string & url, const std::string & pa return true; } -struct llama_model * common_load_model_from_url( +struct jarvis_model * common_load_model_from_url( const char * model_url, const char * path_model, const char * hf_token, - const struct llama_model_params & params) { + const struct jarvis_model_params & params) { // Basic validation of the model_url if (!model_url || strlen(model_url) == 0) { LOG_ERR("%s: invalid model_url\n", __func__); @@ -1367,17 +1367,17 @@ struct llama_model * common_load_model_from_url( if (n_split > 1) { char split_prefix[PATH_MAX] = {0}; - char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0}; + char split_url_prefix[JARVIS_CURL_MAX_URL_LENGTH] = {0}; // Verify the first split file format // and extract split URL and PATH prefixes { - if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { + if (!jarvis_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split); return NULL; } - if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { + if (!jarvis_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split); return NULL; } @@ -1388,10 +1388,10 @@ struct llama_model * common_load_model_from_url( for (int idx = 1; idx < n_split; idx++) { futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool { char split_path[PATH_MAX] = {0}; - llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split); + jarvis_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split); - char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; - llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); + char split_url[JARVIS_CURL_MAX_URL_LENGTH] = {0}; + jarvis_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); return common_download_file(split_url, split_path, hf_token); }, idx)); @@ -1405,19 +1405,19 @@ struct llama_model * common_load_model_from_url( } } - return llama_load_model_from_file(path_model, params); + return jarvis_load_model_from_file(path_model, params); } -struct llama_model * common_load_model_from_hf( +struct jarvis_model * common_load_model_from_hf( const char * repo, const char * model, const char * path_model, const char * hf_token, - const struct llama_model_params & params) { + const struct jarvis_model_params & params) { // construct hugging face model url: // - // --repo ggml-org/models --file 
tinyllama-1.1b/ggml-model-f16.gguf - // https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf + // --repo ggml-org/models --file tinyjarvis-1.1b/ggml-model-f16.gguf + // https://huggingface.co/ggml-org/models/resolve/main/tinyjarvis-1.1b/ggml-model-f16.gguf // // --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf // https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf @@ -1433,42 +1433,42 @@ struct llama_model * common_load_model_from_hf( #else -struct llama_model * common_load_model_from_url( +struct jarvis_model * common_load_model_from_url( const char * /*model_url*/, const char * /*path_model*/, const char * /*hf_token*/, - const struct llama_model_params & /*params*/) { - LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); + const struct jarvis_model_params & /*params*/) { + LOG_WRN("%s: jarvis.cpp built without libcurl, downloading from an url not supported.\n", __func__); return nullptr; } -struct llama_model * common_load_model_from_hf( +struct jarvis_model * common_load_model_from_hf( const char * /*repo*/, const char * /*model*/, const char * /*path_model*/, const char * /*hf_token*/, - const struct llama_model_params & /*params*/) { - LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); + const struct jarvis_model_params & /*params*/) { + LOG_WRN("%s: jarvis.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); return nullptr; } -#endif // LLAMA_USE_CURL +#endif // JARVIS_USE_CURL // // Batch utils // -void common_batch_clear(struct llama_batch & batch) { +void common_batch_clear(struct jarvis_batch & batch) { batch.n_tokens = 0; } void common_batch_add( - struct llama_batch & batch, - llama_token id, - llama_pos pos, - const std::vector & seq_ids, + struct jarvis_batch & batch, + jarvis_token id, + jarvis_pos pos, + const std::vector & seq_ids, bool logits) { - GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded"); + GGML_ASSERT(batch.seq_id[batch.n_tokens] && "jarvis_batch size exceeded"); batch.token [batch.n_tokens] = id; batch.pos [batch.n_tokens] = pos; @@ -1485,26 +1485,26 @@ void common_batch_add( // Vocab utils // -std::vector common_tokenize( - const struct llama_context * ctx, +std::vector common_tokenize( + const struct jarvis_context * ctx, const std::string & text, bool add_special, bool parse_special) { - return common_tokenize(llama_get_model(ctx), text, add_special, parse_special); + return common_tokenize(jarvis_get_model(ctx), text, add_special, parse_special); } -std::vector common_tokenize( - const struct llama_model * model, +std::vector common_tokenize( + const struct jarvis_model * model, const std::string & text, bool add_special, bool parse_special) { // upper limit for the number of tokens int n_tokens = text.length() + 2 * add_special; - std::vector result(n_tokens); - n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); + std::vector result(n_tokens); + n_tokens = jarvis_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); + int check = jarvis_tokenize(model, text.data(), text.length(), result.data(), result.size(), 
add_special, parse_special); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -1512,13 +1512,13 @@ std::vector common_tokenize( return result; } -std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { +std::string common_token_to_piece(const struct jarvis_context * ctx, jarvis_token token, bool special) { std::string piece; piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n' - const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special); + const int n_chars = jarvis_token_to_piece(jarvis_get_model(ctx), token, &piece[0], piece.size(), 0, special); if (n_chars < 0) { piece.resize(-n_chars); - int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special); + int check = jarvis_token_to_piece(jarvis_get_model(ctx), token, &piece[0], piece.size(), 0, special); GGML_ASSERT(check == -n_chars); } else { @@ -1528,13 +1528,13 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token return piece; } -std::string common_detokenize(llama_context * ctx, const std::vector & tokens, bool special) { +std::string common_detokenize(jarvis_context * ctx, const std::vector & tokens, bool special) { std::string text; text.resize(std::max(text.capacity(), tokens.size())); - int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); + int32_t n_chars = jarvis_detokenize(jarvis_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); if (n_chars < 0) { text.resize(-n_chars); - n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); + n_chars = jarvis_detokenize(jarvis_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization } @@ -1549,18 +1549,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector= 0; } -std::string common_chat_apply_template(const struct llama_model * model, +std::string common_chat_apply_template(const struct jarvis_model * model, const std::string & tmpl, const std::vector & msgs, bool add_ass) { int alloc_size = 0; bool fallback = false; // indicate if we must fallback to default chatml - std::vector chat; + std::vector chat; for (auto & msg : msgs) { chat.push_back({msg.role.c_str(), msg.content.c_str()}); alloc_size += (msg.role.size() + msg.content.size()) * 1.25; @@ -1570,17 +1570,17 @@ std::string common_chat_apply_template(const struct llama_model * model, std::vector buf(alloc_size); // run the first time to get the total output length - int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + int32_t res = jarvis_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); // error: chat template is not supported if (res < 0) { if (ptr_tmpl != nullptr) { // if the custom "tmpl" is not supported, we throw an error - // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template() + // this is a bit redundant (for good), since we're not sure if user validated the custom template with jarvis_chat_verify_template() throw 
std::runtime_error("this custom template is not supported"); } else { // If the built-in template is not supported, we default to chatml - res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + res = jarvis_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size()); fallback = true; } } @@ -1588,7 +1588,7 @@ std::string common_chat_apply_template(const struct llama_model * model, // if it turns out that our buffer is too small, we resize it if ((size_t) res > buf.size()) { buf.resize(res); - res = llama_chat_apply_template( + res = jarvis_chat_apply_template( fallback ? nullptr : model, fallback ? "chatml" : ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); @@ -1598,7 +1598,7 @@ std::string common_chat_apply_template(const struct llama_model * model, return formatted_chat; } -std::string common_chat_format_single(const struct llama_model * model, +std::string common_chat_format_single(const struct jarvis_model * model, const std::string & tmpl, const std::vector & past_msg, const common_chat_msg & new_msg, @@ -1618,7 +1618,7 @@ std::string common_chat_format_single(const struct llama_model * model, return ss.str(); } -std::string common_chat_format_example(const struct llama_model * model, +std::string common_chat_format_example(const struct jarvis_model * model, const std::string & tmpl) { std::vector msgs = { {"system", "You are a helpful assistant"}, @@ -1633,14 +1633,14 @@ std::string common_chat_format_example(const struct llama_model * model, // KV cache utils // -void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) { +void common_kv_cache_dump_view(const jarvis_kv_cache_view & view, int row_size) { static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+"; printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); - llama_kv_cache_view_cell * c_curr = view.cells; - llama_seq_id * cs_curr = view.cells_sequences; + jarvis_kv_cache_view_cell * c_curr = view.cells; + jarvis_seq_id * cs_curr = view.cells_sequences; for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) { if (i % row_size == 0) { @@ -1656,15 +1656,15 @@ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) { printf("\n=== Done dumping\n"); } -void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) { +void common_kv_cache_dump_view_seqs(const jarvis_kv_cache_view & view, int row_size) { static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; printf("=== Dumping KV cache. 
total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); - std::unordered_map seqs; - llama_kv_cache_view_cell * c_curr = view.cells; - llama_seq_id * cs_curr = view.cells_sequences; + std::unordered_map seqs; + jarvis_kv_cache_view_cell * c_curr = view.cells; + jarvis_seq_id * cs_curr = view.cells_sequences; for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) { for (int j = 0; j < view.n_seq_max; j++) { @@ -1949,12 +1949,12 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha } } -void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx, +void yaml_dump_non_result_info(FILE * stream, const common_params & params, const jarvis_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { const auto & sparams = params.sparams; - fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); - fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); + fprintf(stream, "build_commit: %s\n", JARVIS_COMMIT); + fprintf(stream, "build_number: %d\n", JARVIS_BUILD_NUMBER); fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false"); @@ -1985,7 +1985,7 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons #endif // NDEBUG fprintf(stream, "model_desc: %s\n", model_desc); - fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx))); + fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", jarvis_n_vocab(jarvis_get_model(lctx))); #ifdef __OPTIMIZE__ fprintf(stream, "optimize: true\n"); @@ -2087,7 +2087,7 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false"); fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); - const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices()); + const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + jarvis_max_devices()); yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector); fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); diff --git a/common/common.h b/common/common.h index 18b2121ed89b0..e3e41053ff3d6 100644 --- a/common/common.h +++ b/common/common.h @@ -2,7 +2,7 @@ #pragma once -#include "llama.h" +#include "jarvis.h" #include #include @@ -18,8 +18,8 @@ #define die_fmt(fmt, ...) 
do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) #define print_build_info() do { \ - fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \ - fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ + fprintf(stderr, "%s: build = %d (%s)\n", __func__, JARVIS_BUILD_NUMBER, JARVIS_COMMIT); \ + fprintf(stderr, "%s: built with %s for %s\n", __func__, JARVIS_COMPILER, JARVIS_BUILD_TARGET); \ } while(0) #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" @@ -30,14 +30,14 @@ struct common_lora_adapter_info { }; struct common_lora_adapter_container : common_lora_adapter_info { - struct llama_lora_adapter * adapter; + struct jarvis_lora_adapter * adapter; }; // build info -extern int LLAMA_BUILD_NUMBER; -extern char const * LLAMA_COMMIT; -extern char const * LLAMA_COMPILER; -extern char const * LLAMA_BUILD_TARGET; +extern int JARVIS_BUILD_NUMBER; +extern char const * JARVIS_COMMIT; +extern char const * JARVIS_COMPILER; +extern char const * JARVIS_BUILD_TARGET; struct common_control_vector_load_info; @@ -61,25 +61,25 @@ int32_t cpu_get_num_math(); // Common params // -enum llama_example { - LLAMA_EXAMPLE_COMMON, - LLAMA_EXAMPLE_SPECULATIVE, - LLAMA_EXAMPLE_MAIN, - LLAMA_EXAMPLE_INFILL, - LLAMA_EXAMPLE_EMBEDDING, - LLAMA_EXAMPLE_PERPLEXITY, - LLAMA_EXAMPLE_RETRIEVAL, - LLAMA_EXAMPLE_PASSKEY, - LLAMA_EXAMPLE_IMATRIX, - LLAMA_EXAMPLE_BENCH, - LLAMA_EXAMPLE_SERVER, - LLAMA_EXAMPLE_CVECTOR_GENERATOR, - LLAMA_EXAMPLE_EXPORT_LORA, - LLAMA_EXAMPLE_LLAVA, - LLAMA_EXAMPLE_LOOKUP, - LLAMA_EXAMPLE_PARALLEL, - - LLAMA_EXAMPLE_COUNT, +enum jarvis_example { + JARVIS_EXAMPLE_COMMON, + JARVIS_EXAMPLE_SPECULATIVE, + JARVIS_EXAMPLE_MAIN, + JARVIS_EXAMPLE_INFILL, + JARVIS_EXAMPLE_EMBEDDING, + JARVIS_EXAMPLE_PERPLEXITY, + JARVIS_EXAMPLE_RETRIEVAL, + JARVIS_EXAMPLE_PASSKEY, + JARVIS_EXAMPLE_IMATRIX, + JARVIS_EXAMPLE_BENCH, + JARVIS_EXAMPLE_SERVER, + JARVIS_EXAMPLE_CVECTOR_GENERATOR, + JARVIS_EXAMPLE_EXPORT_LORA, + JARVIS_EXAMPLE_LLAVA, + JARVIS_EXAMPLE_LOOKUP, + JARVIS_EXAMPLE_PARALLEL, + + JARVIS_EXAMPLE_COUNT, }; enum common_sampler_type { @@ -103,7 +103,7 @@ enum dimre_method { // sampler parameters struct common_sampler_params { - uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler + uint32_t seed = JARVIS_DEFAULT_SEED; // the seed used to initialize jarvis_sampler int32_t n_prev = 64; // number of previous tokens to remember int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
@@ -149,7 +149,7 @@ struct common_sampler_params { std::string grammar; // optional BNF-like grammar to constrain sampling - std::vector logit_bias; // logit biases to apply + std::vector logit_bias; // logit biases to apply // print the parameters into a string std::string print() const; @@ -192,10 +192,10 @@ struct common_params { ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; - enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs - enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; - enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings - enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings + enum jarvis_split_mode split_mode = JARVIS_SPLIT_MODE_LAYER; // how to split the model across GPUs + enum jarvis_rope_scaling_type rope_scaling_type = JARVIS_ROPE_SCALING_TYPE_UNSPECIFIED; + enum jarvis_pooling_type pooling_type = JARVIS_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings + enum jarvis_attention_type attention_type = JARVIS_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings struct common_sampler_params sparams; @@ -219,9 +219,9 @@ struct common_params { std::vector in_files; // all input files std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) - std::vector kv_overrides; + std::vector kv_overrides; - bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply) + bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using jarvis_lora_adapter_apply) std::vector lora_adapters; // lora adapter path with user defined scale std::vector control_vectors; // control vector with user defined scale @@ -377,15 +377,15 @@ bool set_process_priority(enum ggml_sched_priority prio); #ifdef __GNUC__ #ifdef __MINGW32__ -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#define JARVIS_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) #else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#define JARVIS_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) #endif #else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) +#define JARVIS_COMMON_ATTRIBUTE_FORMAT(...) 
#endif -LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) +JARVIS_COMMON_ATTRIBUTE_FORMAT(1, 2) std::string string_format(const char * fmt, ...); std::string string_strip(const std::string & str); @@ -424,13 +424,13 @@ std::vector string_split(const std::string & input, ch return parts; } -bool string_parse_kv_override(const char * data, std::vector & overrides); +bool string_parse_kv_override(const char * data, std::vector & overrides); void string_process_escapes(std::string & input); std::string string_from(bool value); std::string string_from(const std::vector & values); -std::string string_from(const struct llama_context * ctx, const std::vector & tokens); -std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch); +std::string string_from(const struct jarvis_context * ctx, const std::vector & tokens); +std::string string_from(const struct jarvis_context * ctx, const struct jarvis_batch & batch); // // Filesystem utils @@ -447,32 +447,32 @@ std::string fs_get_cache_file(const std::string & filename); // struct common_init_result { - struct llama_model * model = nullptr; - struct llama_context * context = nullptr; + struct jarvis_model * model = nullptr; + struct jarvis_context * context = nullptr; std::vector lora_adapters; }; struct common_init_result common_init_from_params(common_params & params); -struct llama_model_params common_model_params_to_llama (const common_params & params); -struct llama_context_params common_context_params_to_llama(const common_params & params); +struct jarvis_model_params common_model_params_to_jarvis (const common_params & params); +struct jarvis_context_params common_context_params_to_jarvis(const common_params & params); struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); -struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); -struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); +struct jarvis_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct jarvis_model_params & params); +struct jarvis_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct jarvis_model_params & params); // clear LoRA adapters from context, then apply new list of adapters -void common_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters); +void common_lora_adapters_apply(struct jarvis_context * ctx, std::vector & lora_adapters); // Batch utils -void common_batch_clear(struct llama_batch & batch); +void common_batch_clear(struct jarvis_batch & batch); void common_batch_add( - struct llama_batch & batch, - llama_token id, - llama_pos pos, - const std::vector & seq_ids, + struct jarvis_batch & batch, + jarvis_token id, + jarvis_pos pos, + const std::vector & seq_ids, bool logits); // @@ -481,14 +481,14 @@ void common_batch_add( // tokenizes a string into a vector of tokens // should work similar to Python's `tokenizer.encode` -std::vector common_tokenize( - const struct llama_context * ctx, +std::vector common_tokenize( + const struct jarvis_context * ctx, const std::string & text, bool add_special, bool parse_special = false); -std::vector common_tokenize( - const struct llama_model * model, +std::vector common_tokenize( + 
const struct jarvis_model * model, const std::string & text, bool add_special, bool parse_special = false); @@ -496,23 +496,23 @@ std::vector common_tokenize( // tokenizes a token into a piece, optionally renders special/control tokens // should work similar to Python's `tokenizer.id_to_piece` std::string common_token_to_piece( - const struct llama_context * ctx, - llama_token token, + const struct jarvis_context * ctx, + jarvis_token token, bool special = true); // detokenizes a vector of tokens into a string // should work similar to Python's `tokenizer.decode` // optionally renders special/control tokens std::string common_detokenize( - llama_context * ctx, - const std::vector & tokens, + jarvis_context * ctx, + const std::vector & tokens, bool special = true); // // Chat template utils // -// same with llama_chat_message, but uses std::string +// same with jarvis_chat_message, but uses std::string struct common_chat_msg { std::string role; std::string content; @@ -521,23 +521,23 @@ struct common_chat_msg { // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid bool common_chat_verify_template(const std::string & tmpl); -// CPP wrapper for llama_chat_apply_template +// CPP wrapper for jarvis_chat_apply_template // If the built-in template is not supported, we default to chatml // If the custom "tmpl" is not supported, we throw an error -std::string common_chat_apply_template(const struct llama_model * model, +std::string common_chat_apply_template(const struct jarvis_model * model, const std::string & tmpl, const std::vector & chat, bool add_ass); // Format single message, while taking into account the position of that message in chat history -std::string common_chat_format_single(const struct llama_model * model, +std::string common_chat_format_single(const struct jarvis_model * model, const std::string & tmpl, const std::vector & past_msg, const common_chat_msg & new_msg, bool add_ass); // Returns an example of formatted chat -std::string common_chat_format_example(const struct llama_model * model, +std::string common_chat_format_example(const struct jarvis_model * model, const std::string & tmpl); // @@ -545,10 +545,10 @@ std::string common_chat_format_example(const struct llama_model * model, // // Dump the KV cache view with the number of sequences per cell. -void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80); +void common_kv_cache_dump_view(const jarvis_kv_cache_view & view, int row_size = 80); // Dump the KV cache view showing individual sequences in each cell (long output). 
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40); +void common_kv_cache_dump_view_seqs(const jarvis_kv_cache_view & view, int row_size = 40); // // Embedding utils @@ -596,5 +596,5 @@ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data); void yaml_dump_non_result_info( - FILE * stream, const common_params & params, const llama_context * lctx, + FILE * stream, const common_params & params, const jarvis_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); diff --git a/common/console.cpp b/common/console.cpp index 078a8d678d933..d7c1d46d8dd09 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -435,7 +435,7 @@ namespace console { fputc('\n', out); has_more = !has_more; } else { - // llama will just eat the single space, it won't act as a space + // jarvis will just eat the single space, it won't act as a space if (line.length() == 1 && line.back() == ' ') { line.clear(); pop_cursor(); diff --git a/common/json.hpp b/common/json.hpp index a858728c4ceb8..a6f53f0b45aca 100644 --- a/common/json.hpp +++ b/common/json.hpp @@ -5336,7 +5336,7 @@ template class iteration_proxy }; // Structured Bindings Support -// For further reference see https://blog.tartanllama.xyz/structured-bindings/ +// For further reference see https://blog.tartanjarvis.xyz/structured-bindings/ // And see https://github.com/nlohmann/json/pull/1391 template = 0> auto get(const nlohmann::detail::iteration_proxy_value& i) -> decltype(i.key()) @@ -5344,7 +5344,7 @@ auto get(const nlohmann::detail::iteration_proxy_value& i) -> decl return i.key(); } // Structured Bindings Support -// For further reference see https://blog.tartanllama.xyz/structured-bindings/ +// For further reference see https://blog.tartanjarvis.xyz/structured-bindings/ // And see https://github.com/nlohmann/json/pull/1391 template = 0> auto get(const nlohmann::detail::iteration_proxy_value& i) -> decltype(i.value()) @@ -5357,7 +5357,7 @@ NLOHMANN_JSON_NAMESPACE_END // The Addition to the STD Namespace is required to add // Structured Bindings Support to the iteration_proxy_value class -// For further reference see https://blog.tartanllama.xyz/structured-bindings/ +// For further reference see https://blog.tartanjarvis.xyz/structured-bindings/ // And see https://github.com/nlohmann/json/pull/1391 namespace std { diff --git a/common/log.cpp b/common/log.cpp index 04c7c0ed10595..3b022ad9ff3c6 100644 --- a/common/log.cpp +++ b/common/log.cpp @@ -8,7 +8,7 @@ #include #include -int common_log_verbosity_thold = LOG_DEFAULT_LLAMA; +int common_log_verbosity_thold = LOG_DEFAULT_JARVIS; void common_log_set_verbosity_thold(int verbosity) { common_log_verbosity_thold = verbosity; diff --git a/common/log.h b/common/log.h index 66605cc69a314..37d7a0146f5d1 100644 --- a/common/log.h +++ b/common/log.h @@ -11,7 +11,7 @@ #endif #define LOG_DEFAULT_DEBUG 1 -#define LOG_DEFAULT_LLAMA 0 +#define LOG_DEFAULT_JARVIS 0 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower // set via common_log_set_verbosity() diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp index a9dfb67142528..c1576b136fccd 100644 --- a/common/ngram-cache.cpp +++ b/common/ngram-cache.cpp @@ -9,7 +9,7 @@ #include void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, - std::vector & inp, int nnew, bool print_progress) { + 
std::vector & inp, int nnew, bool print_progress) { const int64_t t_start_ms = ggml_time_ms(); const int64_t inp_size = inp.size(); @@ -21,7 +21,7 @@ void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, for (int64_t i = i_start; i < inp_size; ++i) { const int64_t ngram_start = i - ngram_size; common_ngram ngram(&inp[ngram_start], ngram_size); - const llama_token token = inp[i]; + const jarvis_token token = inp[i]; common_ngram_cache::iterator part_it = ngram_cache.find(ngram); if (part_it == ngram_cache.end()) { @@ -51,18 +51,18 @@ void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, } // Helper function to get a token from the combined, speculative sequence of inp and draft. -static llama_token get_token(const std::vector & inp, const std::vector & draft, const size_t i) { +static jarvis_token get_token(const std::vector & inp, const std::vector & draft, const size_t i) { return i < inp.size() ? inp[i] : draft[1 + i - inp.size()]; } // If sample size or percentage are below these thresholds the draft is aborted early: -constexpr int draft_min_sample_size_lax[LLAMA_NGRAM_MAX] = { 2, 2, 1, 1}; -constexpr int draft_min_percent_lax[LLAMA_NGRAM_MAX] = {66, 50, 50, 50}; -constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2}; -constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66}; +constexpr int draft_min_sample_size_lax[JARVIS_NGRAM_MAX] = { 2, 2, 1, 1}; +constexpr int draft_min_percent_lax[JARVIS_NGRAM_MAX] = {66, 50, 50, 50}; +constexpr int draft_min_sample_size_strict[JARVIS_NGRAM_MAX] = { 4, 3, 2, 2}; +constexpr int draft_min_percent_strict[JARVIS_NGRAM_MAX] = {75, 66, 66, 66}; // Helper function that tries to draft a token from only the static ngram cache: -static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) { +static jarvis_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) { common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); if (part_static_it == nc_static.end()) { return -1; @@ -71,10 +71,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram int max_count_static = 0; int sum_count_static = 0; - llama_token max_token = -1; + jarvis_token max_token = -1; - for (std::pair token_count_static : part_static) { - const llama_token token = token_count_static.first; + for (std::pair token_count_static : part_static) { + const jarvis_token token = token_count_static.first; const int32_t count_static = token_count_static.second; if (count_static > max_count_static) { @@ -84,21 +84,21 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram sum_count_static += count_static; } - if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) { + if (sum_count_static < draft_min_sample_size_lax[JARVIS_NGRAM_STATIC-1]) { return -1; } - if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) { + if (100*max_count_static < draft_min_percent_lax[JARVIS_NGRAM_STATIC-1]*sum_count_static) { return -1; } return max_token; } // Try to draft a token from primary cache (context/dynamic), validate with static cache: -static llama_token try_draft( +static jarvis_token try_draft( common_ngram_cache & nc_primary, const std::vector & ngrams_primary, common_ngram_cache_part & part_static, const int * min_sample_size, const int * min_percent) { - llama_token drafted_token = -1; + jarvis_token drafted_token = -1; for (int i = 
ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) { const common_ngram ngram_primary = ngrams_primary[i]; @@ -112,10 +112,10 @@ static llama_token try_draft( int max_count_primary = 0; int max_count_static = 0; int sum_count_primary = 0; - llama_token max_token = -1; + jarvis_token max_token = -1; - for (std::pair token_count_primary : part_primary) { - const llama_token token = token_count_primary.first; + for (std::pair token_count_primary : part_primary) { + const jarvis_token token = token_count_primary.first; common_ngram_cache_part::iterator token_count_static_it = part_static.find(token); @@ -143,22 +143,22 @@ static llama_token try_draft( } void common_ngram_cache_draft( - std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, + std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static ) { GGML_ASSERT(draft.size() == 1); const int inp_size = inp.size(); - if (inp_size < LLAMA_NGRAM_STATIC) { + if (inp_size < JARVIS_NGRAM_STATIC) { return; } while ((int) draft.size()-1 < n_draft) { - llama_token drafted_token = -1; + jarvis_token drafted_token = -1; - const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1; + const int ngram_start_static = inp_size-JARVIS_NGRAM_STATIC + draft.size()-1; common_ngram ngram_static; - for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) { + for (int j = ngram_start_static; j < ngram_start_static + JARVIS_NGRAM_STATIC; ++j) { ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j); } common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); @@ -207,12 +207,12 @@ void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & fil file_out.write(reinterpret_cast(&ngram), sizeof(common_ngram)); file_out.write(reinterpret_cast(&ntokens), sizeof(int32_t)); - for (std::pair item2 : token_counts) { - const llama_token token = item2.first; + for (std::pair item2 : token_counts) { + const jarvis_token token = item2.first; const int32_t count = item2.second; GGML_ASSERT(count > 0); - file_out.write(reinterpret_cast(&token), sizeof(llama_token)); + file_out.write(reinterpret_cast(&token), sizeof(jarvis_token)); file_out.write(reinterpret_cast(&count), sizeof(int32_t)); } } @@ -228,7 +228,7 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) { common_ngram ngram; int32_t ntokens; - llama_token token; + jarvis_token token; int32_t count; char * ngramc = reinterpret_cast(&ngram); @@ -243,7 +243,7 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) { for (int i = 0; i < ntokens; ++i) { GGML_ASSERT(!hashmap_file.eof()); - GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token))); + GGML_ASSERT(hashmap_file.read(tokenc, sizeof(jarvis_token))); GGML_ASSERT(!hashmap_file.eof()); GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t))); GGML_ASSERT(count > 0); @@ -268,8 +268,8 @@ void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ng continue; } - for (std::pair token_count : part) { - const llama_token token = token_count.first; + for (std::pair token_count : part) { + const jarvis_token token = token_count.first; const int32_t count = token_count.second; GGML_ASSERT(count > 0); diff --git a/common/ngram-cache.h b/common/ngram-cache.h index 09c2b0319f2c0..c3fb21c6ace95 100644 --- a/common/ngram-cache.h +++ b/common/ngram-cache.h @@ -1,34 +1,34 @@ #pragma once 
-#include "llama.h" +#include "jarvis.h" #include #include #include -#define LLAMA_NGRAM_MIN 1 -#define LLAMA_NGRAM_MAX 4 -#define LLAMA_NGRAM_STATIC 2 +#define JARVIS_NGRAM_MIN 1 +#define JARVIS_NGRAM_MAX 4 +#define JARVIS_NGRAM_STATIC 2 // Data structures to map n-grams to empirical token probabilities: struct common_ngram { - llama_token tokens[LLAMA_NGRAM_MAX]; + jarvis_token tokens[JARVIS_NGRAM_MAX]; common_ngram() { - for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) { tokens[i] = -1; } } - common_ngram(const llama_token * input, const int ngram_size) { - for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + common_ngram(const jarvis_token * input, const int ngram_size) { + for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) { tokens[i] = i < ngram_size ? input[i] : -1; } } bool operator==(const common_ngram & other) const { - for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) { if (tokens[i] != other.tokens[i]) { return false; } @@ -38,7 +38,7 @@ struct common_ngram { }; struct common_token_hash_function { - size_t operator()(const llama_token token) const { + size_t operator()(const jarvis_token token) const { // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ return token * 11400714819323198485llu; } @@ -47,7 +47,7 @@ struct common_token_hash_function { struct common_ngram_hash_function { size_t operator()(const common_ngram & ngram) const { size_t hash = common_token_hash_function{}(ngram.tokens[0]); - for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) { + for (int i = 1; i < JARVIS_NGRAM_MAX; ++i) { hash ^= common_token_hash_function{}(ngram.tokens[i]); } return hash; @@ -55,7 +55,7 @@ struct common_ngram_hash_function { }; // token -> number of times token has been seen -typedef std::unordered_map common_ngram_cache_part; +typedef std::unordered_map common_ngram_cache_part; // n-gram -> empirical distribution of following tokens typedef std::unordered_map common_ngram_cache; @@ -71,7 +71,7 @@ typedef std::unordered_map & inp_data, int nnew, bool print_progress); + common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp_data, int nnew, bool print_progress); // Try to draft tokens from ngram caches. // inp: the tokens generated so far. @@ -82,7 +82,7 @@ void common_ngram_cache_update( // nc_dynamic: ngram cache based on previous user generations. // nc_static: ngram cache generated from a large text corpus, used for validation. void common_ngram_cache_draft( - std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, + std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static); // Save an ngram cache to a file. 
diff --git a/common/sampling.cpp b/common/sampling.cpp index 48a9df8ba5b88..b6cad63334e7b 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -6,7 +6,7 @@ #include // the ring buffer works similarly to std::deque, but with a fixed capacity -// TODO: deduplicate with llama-impl.h +// TODO: deduplicate with jarvis-impl.h template struct ring_buffer { ring_buffer(size_t cap) : capacity(cap), data(cap) {} @@ -101,24 +101,24 @@ struct ring_buffer { struct common_sampler { common_sampler_params params; - struct llama_sampler * grmr; - struct llama_sampler * chain; + struct jarvis_sampler * grmr; + struct jarvis_sampler * chain; - ring_buffer prev; + ring_buffer prev; - std::vector cur; + std::vector cur; - llama_token_data_array cur_p; + jarvis_token_data_array cur_p; - void set_logits(struct llama_context * ctx, int idx) { - const auto * logits = llama_get_logits_ith(ctx, idx); + void set_logits(struct jarvis_context * ctx, int idx) { + const auto * logits = jarvis_get_logits_ith(ctx, idx); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int n_vocab = jarvis_n_vocab(jarvis_get_model(ctx)); cur.resize(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; + for (jarvis_token token_id = 0; token_id < n_vocab; token_id++) { + cur[token_id] = jarvis_token_data{token_id, logits[token_id], 0.0f}; } cur_p = { cur.data(), cur.size(), -1, false }; @@ -141,31 +141,31 @@ std::string common_sampler_params::print() const { return std::string(result); } -struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) { - llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); +struct common_sampler * common_sampler_init(const struct jarvis_model * model, const struct common_sampler_params & params) { + jarvis_sampler_chain_params lparams = jarvis_sampler_chain_default_params(); lparams.no_perf = params.no_perf; auto * result = new common_sampler { /* .params = */ params, - /* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"), - /* .chain = */ llama_sampler_chain_init(lparams), - /* .prev = */ ring_buffer(std::max(32, params.n_prev)), + /* .grmr = */ jarvis_sampler_init_grammar(model, params.grammar.c_str(), "root"), + /* .chain = */ jarvis_sampler_chain_init(lparams), + /* .prev = */ ring_buffer(std::max(32, params.n_prev)), /* .cur = */ {}, /* .cur_p = */ {}, }; - llama_sampler_chain_add(result->chain, - llama_sampler_init_logit_bias( - llama_n_vocab(model), + jarvis_sampler_chain_add(result->chain, + jarvis_sampler_init_logit_bias( + jarvis_n_vocab(model), params.logit_bias.size(), params.logit_bias.data())); - llama_sampler_chain_add(result->chain, - llama_sampler_init_penalties( - llama_n_vocab (model), - llama_token_eos(model), - llama_token_nl (model), + jarvis_sampler_chain_add(result->chain, + jarvis_sampler_init_penalties( + jarvis_n_vocab (model), + jarvis_token_eos(model), + jarvis_token_nl (model), params.penalty_last_n, params.penalty_repeat, params.penalty_freq, @@ -184,44 +184,44 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co c_breakers.push_back(str.c_str()); } - llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_dry 
(model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); } break; case COMMON_SAMPLER_TYPE_TOP_K: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_top_k (params.top_k)); break; case COMMON_SAMPLER_TYPE_TOP_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_top_p (params.top_p, params.min_keep)); break; case COMMON_SAMPLER_TYPE_MIN_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_min_p (params.min_p, params.min_keep)); break; case COMMON_SAMPLER_TYPE_XTC: - llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); break; case COMMON_SAMPLER_TYPE_TFS_Z: - llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_tail_free(params.tfs_z, params.min_keep)); break; case COMMON_SAMPLER_TYPE_TYPICAL_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_typical (params.typ_p, params.min_keep)); break; case COMMON_SAMPLER_TYPE_TEMPERATURE: - llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); break; case COMMON_SAMPLER_TYPE_INFILL: - llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_infill (model)); break; default: GGML_ASSERT(false && "unknown sampler type"); } } - llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_dist(params.seed)); } else if (params.mirostat == 1) { - llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp(params.temp)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_mirostat(jarvis_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); } else if (params.mirostat == 2) { - llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp(params.temp)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); } else { GGML_ASSERT(false && "unknown mirostat version"); } @@ -231,53 +231,53 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co void common_sampler_free(struct 
common_sampler * gsmpl) { if (gsmpl) { - llama_sampler_free(gsmpl->grmr); + jarvis_sampler_free(gsmpl->grmr); - llama_sampler_free(gsmpl->chain); + jarvis_sampler_free(gsmpl->chain); delete gsmpl; } } -void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) { +void common_sampler_accept(struct common_sampler * gsmpl, jarvis_token token, bool accept_grammar) { if (accept_grammar) { - llama_sampler_accept(gsmpl->grmr, token); + jarvis_sampler_accept(gsmpl->grmr, token); } - llama_sampler_accept(gsmpl->chain, token); + jarvis_sampler_accept(gsmpl->chain, token); gsmpl->prev.push_back(token); } void common_sampler_reset(struct common_sampler * gsmpl) { - llama_sampler_reset(gsmpl->grmr); + jarvis_sampler_reset(gsmpl->grmr); - llama_sampler_reset(gsmpl->chain); + jarvis_sampler_reset(gsmpl->chain); } struct common_sampler * common_sampler_clone(common_sampler * gsmpl) { return new common_sampler { /* .params = */ gsmpl->params, - /* .grmr = */ llama_sampler_clone(gsmpl->grmr), - /* .chain = */ llama_sampler_clone(gsmpl->chain), + /* .grmr = */ jarvis_sampler_clone(gsmpl->grmr), + /* .chain = */ jarvis_sampler_clone(gsmpl->chain), /* .prev = */ gsmpl->prev, /* .cur = */ gsmpl->cur, /* .cur_p = */ gsmpl->cur_p, }; } -void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) { +void common_perf_print(const struct jarvis_context * ctx, const struct common_sampler * gsmpl) { // TODO: measure grammar performance if (gsmpl) { - llama_perf_sampler_print(gsmpl->chain); + jarvis_perf_sampler_print(gsmpl->chain); } if (ctx) { - llama_perf_context_print(ctx); + jarvis_perf_context_print(ctx); } } -llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) { +jarvis_token common_sampler_sample(struct common_sampler * gsmpl, struct jarvis_context * ctx, int idx, bool grammar_first) { gsmpl->set_logits(ctx, idx); auto & grmr = gsmpl->grmr; @@ -285,14 +285,14 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co auto & cur_p = gsmpl->cur_p; // initialized by set_logits if (grammar_first) { - llama_sampler_apply(grmr, &cur_p); + jarvis_sampler_apply(grmr, &cur_p); } - llama_sampler_apply(chain, &cur_p); + jarvis_sampler_apply(chain, &cur_p); GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration"); - const llama_token id = cur_p.data[cur_p.selected].id; + const jarvis_token id = cur_p.data[cur_p.selected].id; if (grammar_first) { return id; @@ -300,10 +300,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co // check if it the sampled token fits the grammar { - llama_token_data single_token_data = { id, 1.0f, 0.0f }; - llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false }; + jarvis_token_data single_token_data = { id, 1.0f, 0.0f }; + jarvis_token_data_array single_token_data_array = { &single_token_data, 1, -1, false }; - llama_sampler_apply(grmr, &single_token_data_array); + jarvis_sampler_apply(grmr, &single_token_data_array); const bool is_valid = single_token_data_array.data[0].logit != -INFINITY; if (is_valid) { @@ -315,8 +315,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain gsmpl->set_logits(ctx, idx); - llama_sampler_apply(grmr, &cur_p); - llama_sampler_apply(chain, &cur_p); + 
jarvis_sampler_apply(grmr, &cur_p); + jarvis_sampler_apply(chain, &cur_p); GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration"); @@ -324,31 +324,31 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co } uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { - return llama_sampler_get_seed(gsmpl->chain); + return jarvis_sampler_get_seed(gsmpl->chain); } // helpers -llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) { +jarvis_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) { return &gsmpl->cur_p; } -llama_token common_sampler_last(const struct common_sampler * gsmpl) { +jarvis_token common_sampler_last(const struct common_sampler * gsmpl) { return gsmpl->prev.rat(0); } std::string common_sampler_print(const struct common_sampler * gsmpl) { std::string result = "logits "; - for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) { - const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i); - result += std::string("-> ") + llama_sampler_name(smpl) + " "; + for (int i = 0; i < jarvis_sampler_chain_n(gsmpl->chain); i++) { + const auto * smpl = jarvis_sampler_chain_get(gsmpl->chain, i); + result += std::string("-> ") + jarvis_sampler_name(smpl) + " "; } return result; } -std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) { +std::string common_sampler_prev_str(common_sampler * gsmpl, jarvis_context * ctx_main, int n) { n = std::min(n, (int) gsmpl->prev.size()); if (n <= 0) { @@ -359,9 +359,9 @@ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_ result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab for (int i = n - 1; i >= 0; i--) { - const llama_token id = gsmpl->prev.rat(i); + const jarvis_token id = gsmpl->prev.rat(i); - GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen"); + GGML_ASSERT(id != JARVIS_TOKEN_NULL && "null token in the sampling history - should not happen"); result += common_token_to_piece(ctx_main, id); } diff --git a/common/sampling.h b/common/sampling.h index d37f25ad37c4a..9dc17ed24b69f 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -1,13 +1,13 @@ #pragma once -#include "llama.h" +#include "jarvis.h" #include "common.h" #include #include -// common_sampler extends llama_sampler with additional functionality: +// common_sampler extends jarvis_sampler with additional functionality: // // - grammar support // - custom sampler logic based on the parameters @@ -24,7 +24,7 @@ // grammar constraints are applied to the full vocabulary and the token is resampled. // // The common_sampler also maintains a container with the last accepted tokens. In the future, this can -// be moved into the core llama library. +// be moved into the core jarvis library. // // For convenience, the common_sampler also maintains a container with the current candidate tokens. // This can be used to access the probabilities of the rest of the non-sampled tokens. 
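The comments above describe the two-pass flow of common_sampler_sample(): by default the grammar is not applied to the full vocabulary; the chain samples first, the winner is checked against the grammar, and only on a violation is sampling repeated with the grammar applied before the chain. The following standalone sketch shows that control flow only; the two predicates are stand-ins, not the real jarvis_sampler API.

// sampler_flow_demo.cpp — illustrative control-flow sketch, not the real implementation.
#include <cstdint>
#include <cstdio>

typedef int32_t jarvis_token;  // stand-in for the real typedef

// Stand-in: pretend the chain proposes token 13 without constraints and token 8 with them.
static jarvis_token sample_with_chain(bool grammar_first) {
    return grammar_first ? 8 : 13;
}

// Stand-in: pretend the grammar only accepts even token ids.
static bool grammar_accepts(jarvis_token id) {
    return id % 2 == 0;
}

// Mirrors the fast/slow path split described in common/sampling.h:
//   fast path: sample without grammar work, then validate only the sampled token;
//   slow path: if that token violates the grammar, constrain the candidates first and resample.
static jarvis_token sample_token() {
    const jarvis_token id = sample_with_chain(/*grammar_first=*/false);
    if (grammar_accepts(id)) {
        return id;  // common case: no grammar evaluation over the full vocabulary
    }
    return sample_with_chain(/*grammar_first=*/true);
}

int main() {
    printf("sampled token id: %d\n", sample_token());  // prints 8 with the stand-ins above
    return 0;
}

The payoff noted in the header holds here as well: grammar constraints are only evaluated over the full vocabulary when the unconstrained sample actually violates them.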
@@ -34,19 +34,19 @@ struct common_sampler; -// llama_sampler API overloads +// jarvis_sampler API overloads -struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params); +struct common_sampler * common_sampler_init(const struct jarvis_model * model, const struct common_sampler_params & params); void common_sampler_free(struct common_sampler * gsmpl); // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar -void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar); +void common_sampler_accept(struct common_sampler * gsmpl, jarvis_token token, bool accept_grammar); void common_sampler_reset (struct common_sampler * gsmpl); struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl); // arguments can be nullptr to skip printing -void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl); +void common_perf_print(const struct jarvis_context * ctx, const struct common_sampler * gsmpl); // extended sampling implementation: // @@ -58,23 +58,23 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam // if grammar_first is true, the grammar is applied before the samplers (slower) // useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar // -llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); +jarvis_token common_sampler_sample(struct common_sampler * gsmpl, struct jarvis_context * ctx, int idx, bool grammar_first = false); uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl); // helpers // access the internal list of current candidate tokens -llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl); +jarvis_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl); // get the last accepted token -llama_token common_sampler_last(const struct common_sampler * gsmpl); +jarvis_token common_sampler_last(const struct common_sampler * gsmpl); // print the sampler chain into a string std::string common_sampler_print(const struct common_sampler * gsmpl); // get a string representation of the last accepted tokens -std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n); +std::string common_sampler_prev_str(common_sampler * gsmpl, jarvis_context * ctx, int n); char common_sampler_type_to_chr(enum common_sampler_type cnstr); std::string common_sampler_type_to_str(enum common_sampler_type cnstr); diff --git a/common/train.cpp b/common/train.cpp index 661ad8382eab6..c913f6dbd8521 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -34,7 +34,7 @@ struct train_state * init_train_state() { state->opt = new struct ggml_opt_context; state->opt->ctx = NULL; state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); - state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; + state->opt->params.graph_size = JARVIS_TRAIN_MAX_NODES; state->opt->loss_after = 0.0f; return state; @@ -213,7 +213,7 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6 } int64_t get_example_targets_batch( - struct llama_context * lctx, + struct jarvis_context * lctx, struct ggml_tensor * tokens_input, struct ggml_tensor * target_probs, int64_t example_id, @@ -221,7 +221,7 @@ int64_t get_example_targets_batch( const size_t * samples_begin, const size_t * samples_size, 
size_t samples_count, - const llama_token * train_data, + const jarvis_token * train_data, size_t n_train_data, bool separate_with_eos, bool separate_with_bos, @@ -241,8 +241,8 @@ int64_t get_example_targets_batch( int64_t used_samples = 0; ggml_set_f32(target_probs, 0.0f); - llama_token bos = llama_token_bos(llama_get_model(lctx)); - llama_token eos = llama_token_eos(llama_get_model(lctx)); + jarvis_token bos = jarvis_token_bos(jarvis_get_model(lctx)); + jarvis_token eos = jarvis_token_eos(jarvis_get_model(lctx)); // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); for (int k=0; k= sample_size && fill_with_next_samples) { if (!sample_separation_eos) { // insert eos token to separate samples @@ -281,7 +281,7 @@ int64_t get_example_targets_batch( } // note: no else-if here if (sample_offs < sample_size) { - token = clamp(train_data[sample_begin+sample_offs], 0, (llama_token) (n_vocab - 1)); + token = clamp(train_data[sample_begin+sample_offs], 0, (jarvis_token) (n_vocab - 1)); ++sample_offs; } ggml_set_f32_nd(target_probs, token, (int) i, (int) k, 0, +1.0f); @@ -712,12 +712,12 @@ void save_train_state_gguf(struct gguf_context * fctx, struct train_state * trai } -struct llama_file { +struct jarvis_file { // use FILE * so we don't have to re-open the file to mmap FILE * fp; size_t size; - llama_file(const char * fname, const char * mode) { + jarvis_file(const char * fname, const char * mode) { fp = std::fopen(fname, mode); if (fp == NULL) { size = 0; @@ -788,7 +788,7 @@ struct llama_file { write_raw(&val, sizeof(val)); } - ~llama_file() { + ~jarvis_file() { if (fp) { std::fclose(fp); } @@ -823,16 +823,16 @@ static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nu } size_t tokenize_file( - struct llama_context * lctx, + struct jarvis_context * lctx, const char * filename, const std::string & sample_start, bool include_sample_start, bool overlapping_samples, unsigned context_length, - std::vector & out_tokens, + std::vector & out_tokens, std::vector & out_samples_begin, std::vector & out_samples_size) { - struct llama_file f(filename, "rb"); + struct jarvis_file f(filename, "rb"); if (f.size == 0) { out_tokens.clear(); @@ -844,7 +844,7 @@ size_t tokenize_file( } // account for possible leading whitespace that will be added by tokenizer - // e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12] + // e.g. '\t' will be tokenized by jarvis spm tokenizer to [29871, 12] const int n_max_tokens_overhead = 1; std::vector buf; @@ -862,8 +862,8 @@ size_t tokenize_file( // tokenize all data at once out_tokens.resize(buf.size() + n_max_tokens_overhead); - int n_tokens = llama_tokenize( - llama_get_model(lctx), + int n_tokens = jarvis_tokenize( + jarvis_get_model(lctx), buf.data(), (int) buf.size(), out_tokens.data(), @@ -871,8 +871,8 @@ size_t tokenize_file( false, false); if (n_tokens < 0) { out_tokens.resize(-n_tokens); - n_tokens = llama_tokenize( - llama_get_model(lctx), + n_tokens = jarvis_tokenize( + jarvis_get_model(lctx), buf.data(), (int) buf.size(), out_tokens.data(), @@ -915,7 +915,7 @@ size_t tokenize_file( out_samples_size.resize(out_samples_begin.size(), 0); std::vector buf_sample; - std::vector tok_sample; + std::vector tok_sample; const size_t sample_begin_offset = (include_sample_start ? 
0 : sample_start.size()); size_t found_too_big_sample = 0; @@ -925,11 +925,11 @@ size_t tokenize_file( size_t found_max_sample_size = 0; size_t max_token_text_size = 0; - int n_vocab = llama_n_vocab(llama_get_model(lctx)); - for (llama_token token=0; token < n_vocab; ++token) { + int n_vocab = jarvis_n_vocab(jarvis_get_model(lctx)); + for (jarvis_token token=0; token < n_vocab; ++token) { max_token_text_size = std::max( max_token_text_size, - strlen(llama_token_get_text(llama_get_model(lctx), token))); + strlen(jarvis_token_get_text(jarvis_get_model(lctx), token))); } // upper bound of context byte length. @@ -957,7 +957,7 @@ size_t tokenize_file( } if (sample_size > 0) { - // llama_tokenize expects zero terminated string, + // jarvis_tokenize expects zero terminated string, // copy sample into buffer and zero terminate it. buf_sample.resize(sample_size); memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size); @@ -966,7 +966,7 @@ size_t tokenize_file( // tokenize the sample tok_sample.resize(buf_sample.size() + n_max_tokens_overhead); - int n_tokens = llama_tokenize(llama_get_model(lctx), + int n_tokens = jarvis_tokenize(jarvis_get_model(lctx), buf_sample.data(), (int) buf_sample.size(), tok_sample.data(), @@ -974,7 +974,7 @@ size_t tokenize_file( false, false); if (n_tokens < 0) { tok_sample.resize(-n_tokens); - n_tokens = llama_tokenize(llama_get_model(lctx), + n_tokens = jarvis_tokenize(jarvis_get_model(lctx), buf_sample.data(), (int) buf_sample.size(), tok_sample.data(), @@ -1365,7 +1365,7 @@ bool consume_common_train_arg( *invalid_param = true; return true; } - if (llama_supports_gpu_offload()) { + if (jarvis_supports_gpu_offload()) { params->n_gpu_layers = std::stoi(argv[i]); } else { fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); diff --git a/common/train.h b/common/train.h index 263d940c04298..82c4a24c5d3ee 100644 --- a/common/train.h +++ b/common/train.h @@ -7,9 +7,9 @@ #include #include "ggml.h" -#include "llama.h" +#include "jarvis.h" -#define LLAMA_TRAIN_MAX_NODES 16384 +#define JARVIS_TRAIN_MAX_NODES 16384 typedef std::string mt19937_state; @@ -92,9 +92,9 @@ struct train_opt_callback_data { struct train_state * train; save_train_files_callback save_cb; void * save_data; - struct llama_context * lctx; + struct jarvis_context * lctx; int last_save_iter; - llama_token * tokens_data; + jarvis_token * tokens_data; size_t tokens_size; size_t * samples_begin; size_t * samples_size; @@ -146,18 +146,18 @@ void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6 void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3); size_t tokenize_file( - struct llama_context * lctx, + struct jarvis_context * lctx, const char * filename, const std::string & sample_start, bool include_sample_start, bool overlapping_samples, unsigned context_length, - std::vector & out_tokens, + std::vector & out_tokens, std::vector & out_samples_begin, std::vector & out_samples_size); int64_t get_example_targets_batch( - struct llama_context * lctx, + struct jarvis_context * lctx, struct ggml_tensor * tokens_input, struct ggml_tensor * target_probs, int64_t example_id, @@ -165,7 +165,7 @@ int64_t get_example_targets_batch( const size_t * samples_begin, const size_t * samples_size, size_t samples_count, - const llama_token * train_data, + const jarvis_token * train_data, size_t n_train_data, bool separate_with_eos, bool separate_with_bos, diff --git a/convert_hf_to_gguf.py 
b/convert_hf_to_gguf.py index a34dabe235a34..bc25aab73df1f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -49,7 +49,7 @@ class Model: _model_classes: dict[str, type[Model]] = {} dir_model: Path - ftype: gguf.LlamaFileType + ftype: gguf.JarvisFileType fname_out: Path is_big_endian: bool endianess: gguf.GGUFEndian @@ -69,7 +69,7 @@ class Model: # subclasses should define this! model_arch: gguf.MODEL_ARCH - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False, + def __init__(self, dir_model: Path, ftype: gguf.JarvisFileType, fname_out: Path, is_big_endian: bool = False, use_temp_file: bool = False, eager: bool = False, metadata_override: Path | None = None, model_name: str | None = None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): @@ -96,15 +96,15 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type - if self.ftype == gguf.LlamaFileType.GUESSED: + if self.ftype == gguf.JarvisFileType.GUESSED: # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. _, first_tensor = next(self.get_tensors()) if first_tensor.dtype == torch.float16: logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})") - self.ftype = gguf.LlamaFileType.MOSTLY_F16 + self.ftype = gguf.JarvisFileType.MOSTLY_F16 else: logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})") - self.ftype = gguf.LlamaFileType.MOSTLY_BF16 + self.ftype = gguf.JarvisFileType.MOSTLY_BF16 # Configure GGUF Writer self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, @@ -308,7 +308,7 @@ def prepare_tensors(self): if n_dims <= 1 or new_name.endswith("_norm.weight"): data_qtype = gguf.GGMLQuantizationType.F32 - # Conditions should closely match those in llama_model_quantize_internal in llama.cpp + # Conditions should closely match those in jarvis_model_quantize_internal in jarvis.cpp # Some tensor types are always in float32 if data_qtype is False and ( any( @@ -337,25 +337,25 @@ def prepare_tensors(self): ) ): if self.ftype in ( - gguf.LlamaFileType.MOSTLY_TQ1_0, - gguf.LlamaFileType.MOSTLY_TQ2_0, + gguf.JarvisFileType.MOSTLY_TQ1_0, + gguf.JarvisFileType.MOSTLY_TQ2_0, ): # TODO: use Q4_K and Q6_K data_qtype = gguf.GGMLQuantizationType.F16 # No override (data_qtype is False), or wants to be quantized (data_qtype is True) if isinstance(data_qtype, bool): - if self.ftype == gguf.LlamaFileType.ALL_F32: + if self.ftype == gguf.JarvisFileType.ALL_F32: data_qtype = gguf.GGMLQuantizationType.F32 - elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: + elif self.ftype == gguf.JarvisFileType.MOSTLY_F16: data_qtype = gguf.GGMLQuantizationType.F16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + elif self.ftype == gguf.JarvisFileType.MOSTLY_BF16: data_qtype = gguf.GGMLQuantizationType.BF16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: + elif self.ftype == gguf.JarvisFileType.MOSTLY_Q8_0: data_qtype = gguf.GGMLQuantizationType.Q8_0 - elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0: + elif self.ftype == gguf.JarvisFileType.MOSTLY_TQ1_0: data_qtype = gguf.GGMLQuantizationType.TQ1_0 - elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0: + elif 
self.ftype == gguf.JarvisFileType.MOSTLY_TQ2_0: data_qtype = gguf.GGMLQuantizationType.TQ2_0 else: raise ValueError(f"Unknown file type: {self.ftype.name}") @@ -394,7 +394,7 @@ def prepare_metadata(self, vocab_only: bool): if self.metadata.size_label is None and total_params > 0: self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) - # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' + # Extract the encoding scheme from the file type name. e.g. 'gguf.JarvisFileType.MOSTLY_Q8_0' --> 'Q8_0' output_type: str = self.ftype.name.partition("_")[2] # Filename Output @@ -537,13 +537,13 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: # NOTE: this function is generated by convert_hf_to_gguf_update.py # do not modify it manually! - # ref: https://github.com/ggerganov/llama.cpp/pull/6920 + # ref: https://github.com/ggerganov/jarvis.cpp/pull/6920 # Marker: Start get_vocab_base_pre def get_vocab_base_pre(self, tokenizer) -> str: # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that # is specific for the BPE pre-tokenizer used by the model # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can - # use in llama.cpp to implement the same pre-tokenizer + # use in jarvis.cpp to implement the same pre-tokenizer chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' @@ -559,8 +559,8 @@ def get_vocab_base_pre(self, tokenizer) -> str: # or pull the latest version of the model from Huggingface # don't edit the hashes manually! 
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": - # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B - res = "llama-bpe" + # ref: https://huggingface.co/meta-jarvis/Meta-Jarvis-3-8B + res = "jarvis-bpe" if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base res = "deepseek-llm" @@ -616,7 +616,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de res = "jina-v2-de" if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": - # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct + # ref: https://huggingface.co/abacusai/Smaug-Jarvis-3-70B-Instruct res = "smaug-bpe" if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360": # ref: https://huggingface.co/LumiOpen/Poro-34B-chat @@ -666,7 +666,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") logger.warning("** - the pre-tokenization config has changed upstream") logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") - logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + logger.warning("** ref: https://github.com/ggerganov/jarvis.cpp/pull/6920") logger.warning("**") logger.warning(f"** chkhsh: {chkhsh}") logger.warning("**************************************************************************************") @@ -746,7 +746,7 @@ def _set_vocab_qwen(self): def _set_vocab_sentencepiece(self, add_to_gguf=True): tokens, scores, toktypes = self._create_vocab_sentencepiece() - self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_model("jarvis") self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) @@ -835,8 +835,8 @@ def _create_vocab_sentencepiece(self): return tokens, scores, toktypes - def _set_vocab_llama_hf(self): - vocab = gguf.LlamaHfVocab(self.dir_model) + def _set_vocab_jarvis_hf(self): + vocab = gguf.JarvisHfVocab(self.dir_model) tokens = [] scores = [] toktypes = [] @@ -848,7 +848,7 @@ def _set_vocab_llama_hf(self): assert len(tokens) == vocab.vocab_size - self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_model("jarvis") self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) @@ -857,7 +857,7 @@ def _set_vocab_llama_hf(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int): + def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "jarvis-spm"], vocab_size: int): tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf" logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") vocab_reader = gguf.GGUFReader(tokenizer_path, "r") @@ -875,7 +875,7 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab assert field # token list self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) - if model_name == "llama-spm": + if model_name == "jarvis-spm": field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES) assert field # token scores 
self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) @@ -884,7 +884,7 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab assert field # token types self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) - if model_name != "llama-spm": + if model_name != "jarvis-spm": field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES) assert field # token merges self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) @@ -1226,7 +1226,7 @@ def set_vocab(self): tokens.append(token_text) toktypes.append(toktype) - self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_model("jarvis") self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) @@ -1515,21 +1515,21 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed norms: {norms}") -@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") -class LlamaModel(Model): - model_arch = gguf.MODEL_ARCH.LLAMA +@Model.register("JARVISForCausalLM", "JarvisForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +class JarvisModel(Model): + model_arch = gguf.MODEL_ARCH.JARVIS def set_vocab(self): try: self._set_vocab_sentencepiece() except FileNotFoundError: try: - self._set_vocab_llama_hf() + self._set_vocab_jarvis_hf() except (FileNotFoundError, TypeError): - # Llama 3 + # Jarvis 3 self._set_vocab_gpt2() - # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) + # Apply to CodeJarvis only (and ignore for Jarvis 3 with a vocab size of 128256) if self.hparams.get("vocab_size", 32000) == 32016: special_vocab = gguf.SpecialVocab( self.dir_model, load_merges=False, @@ -1583,9 +1583,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_kv_head = self.hparams.get("num_key_value_heads") if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) + data_torch = JarvisModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head) # process the experts separately if name.find("block_sparse_moe.experts") != -1: @@ -1625,7 +1625,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": + if rope_scaling.get("rope_type", '').lower() == "jarvis3": base = self.hparams.get("rope_theta", 10000.0) dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) @@ -1793,7 +1793,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose # original implementation expects (n_expert, n_ff, n_embd) for all experts weights - # But llama.cpp moe graph works differently + # But jarvis.cpp moe graph works differently # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor 
exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} @@ -1842,7 +1842,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def set_vocab(self): - self._set_vocab_llama_hf() + self._set_vocab_jarvis_hf() def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: @@ -2188,7 +2188,7 @@ def set_vocab(self): if foken_data.get("special"): toktypes[token_id] = SentencePieceTokenTypes.CONTROL - self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_model("jarvis") self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) @@ -2456,7 +2456,7 @@ def set_vocab(self): if foken_data.get("special"): toktypes[token_id] = SentencePieceTokenTypes.CONTROL - self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_model("jarvis") self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) @@ -2468,7 +2468,7 @@ def set_vocab(self): if chat_eos_token_id is not None: # For the chat model, we replace the eos with '<|im_end|>'. # TODO: this is a hack, should be fixed - # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 + # https://github.com/ggerganov/jarvis.cpp/pull/6745#issuecomment-2067687048 special_vocab.special_token_ids["eos"] = chat_eos_token_id logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" " in chat mode so that the conversation can end normally.") @@ -2505,8 +2505,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1] # The model weights of q and k equire additional reshape. - q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) - k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) + q = JarvisModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) + k = JarvisModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) v = v.reshape((-1, v.shape[-1])) return [ @@ -2769,7 +2769,7 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # lm_head is not used in jarvis.cpp, while autoawq will include this tensor in model # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") @@ -2816,7 +2816,7 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # lm_head is not used in jarvis.cpp, while autoawq will include this tensor in model # To prevent errors, skip loading lm_head.weight. 
if name == "lm_head.weight": logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") @@ -2894,7 +2894,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(intermediate_size) self.gguf_writer.add_file_type(self.ftype) - # required by llama.cpp, unused + # required by jarvis.cpp, unused self.gguf_writer.add_head_count(0) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: @@ -3024,7 +3024,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_clamp_kqv(clip_qkv) # Same as super class, but permuting q_proj, k_proj - # Copied from: LlamaModel + # Copied from: JarvisModel def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -3032,9 +3032,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_kv_head = self.hparams.get("num_key_value_heads") if name.endswith("q_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) + data_torch = JarvisModel.permute(data_torch, n_head, n_head) if name.endswith("k_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head) return [(self.map_tensor_name(name), data_torch)] @@ -3174,12 +3174,12 @@ def __init__(self, *args, **kwargs): assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int) - # Uses the tokenizer from meta-llama/Llama-2-7b-hf + # Uses the tokenizer from meta-jarvis/Jarvis-2-7b-hf def set_vocab(self): try: self._set_vocab_sentencepiece() except FileNotFoundError: - self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"]) + self._set_vocab_builtin("jarvis-spm", self.hparams["vocab_size"]) def set_gguf_parameters(self): n_embd = self._n_embd @@ -3300,7 +3300,7 @@ def set_vocab(self): toktypes[token_id] = token_type scores[token_id] = token_score - self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_model("jarvis") self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) @@ -3322,9 +3322,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_kv_head = self.hparams.get("num_key_value_heads") if name.endswith("q_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) + data_torch = JarvisModel.permute(data_torch, n_head, n_head) if name.endswith("k_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head) # process the experts separately if name.find("block_sparse_moe.experts") != -1: @@ -3882,7 +3882,7 @@ def set_vocab_chatglm3(self): scores.append(score) toktypes.append(toktype) - self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_model("jarvis") # glm3 needs prefix and suffix formatted as: # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>" self.gguf_writer.add_tokenizer_pre("chatglm-spm") @@ -4087,7 +4087,7 @@ def set_gguf_parameters(self): def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": + if rope_scaling.get("rope_type", '').lower() == "jarvis3": base = 
self.hparams.get("rope_theta", 10000.0) dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) @@ -4116,12 +4116,12 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: @Model.register("GraniteForCausalLM") -class GraniteModel(LlamaModel): +class GraniteModel(JarvisModel): """Conversion for IBM's GraniteForCausalLM""" model_arch = gguf.MODEL_ARCH.GRANITE def set_gguf_parameters(self): - """Granite uses standard llama parameters with the following differences: + """Granite uses standard jarvis parameters with the following differences: - No head_dim support - New multiplier params: @@ -4196,9 +4196,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter hidden_dim = self.hparams.get("hidden_size") if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) + data_torch = JarvisModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head) if name.endswith(("q_norm.weight", "q_norm.bias")): data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim) if name.endswith(("k_norm.weight", "k_norm.bias")): @@ -4379,14 +4379,14 @@ def main() -> None: logger.error(f'Error: {args.model} is not a directory') sys.exit(1) - ftype_map: dict[str, gguf.LlamaFileType] = { - "f32": gguf.LlamaFileType.ALL_F32, - "f16": gguf.LlamaFileType.MOSTLY_F16, - "bf16": gguf.LlamaFileType.MOSTLY_BF16, - "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, - "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0, - "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0, - "auto": gguf.LlamaFileType.GUESSED, + ftype_map: dict[str, gguf.JarvisFileType] = { + "f32": gguf.JarvisFileType.ALL_F32, + "f16": gguf.JarvisFileType.MOSTLY_F16, + "bf16": gguf.JarvisFileType.MOSTLY_BF16, + "q8_0": gguf.JarvisFileType.MOSTLY_Q8_0, + "tq1_0": gguf.JarvisFileType.MOSTLY_TQ1_0, + "tq2_0": gguf.JarvisFileType.MOSTLY_TQ2_0, + "auto": gguf.JarvisFileType.GUESSED, } is_split = args.split_max_tensors > 0 or args.split_max_size != "0" diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 28cd02e5a7f66..b4324a3cd1922 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -5,10 +5,10 @@ # generates the get_vocab_base_pre() function for convert_hf_to_gguf.py # # This is necessary in order to analyze the type of pre-tokenizer used by the model and -# provide the necessary information to llama.cpp via the GGUF header in order to implement +# provide the necessary information to jarvis.cpp via the GGUF header in order to implement # the same pre-tokenizer. 
# -# ref: https://github.com/ggerganov/llama.cpp/pull/6920 +# ref: https://github.com/ggerganov/jarvis.cpp/pull/6920 # # Instructions: # @@ -18,9 +18,9 @@ # python3 convert_hf_to_gguf_update.py # # - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py -# - Update llama.cpp with the new pre-tokenizer if necessary +# - Update jarvis.cpp with the new pre-tokenizer if necessary # -# TODO: generate tokenizer tests for llama.cpp +# TODO: generate tokenizer tests for jarvis.cpp # import logging @@ -65,8 +65,8 @@ class TOKENIZER_TYPE(IntEnum): # TODO: add models here, base models preferred models = [ - {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", }, - {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", }, + {"name": "jarvis-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-jarvis/Jarvis-2-7b-hf", }, + {"name": "jarvis-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-jarvis/Meta-Jarvis-3-8B", }, {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", }, {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", }, {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", }, @@ -86,7 +86,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM! {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", }, {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, - {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, + {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Jarvis-3-70B-Instruct", }, {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", }, {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B @@ -215,7 +215,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that # is specific for the BPE pre-tokenizer used by the model # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can - # use in llama.cpp to implement the same pre-tokenizer + # use in jarvis.cpp to implement the same pre-tokenizer chktxt = {repr(CHK_TXT)} @@ -239,7 +239,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") logger.warning("** - the pre-tokenization config has changed upstream") logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") - logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + logger.warning("** ref: https://github.com/ggerganov/jarvis.cpp/pull/6920") logger.warning("**") logger.warning(f"** chkhsh: {{chkhsh}}") 
logger.warning("**************************************************************************************") @@ -311,7 +311,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: "3333333", "33333333", "333333333", - "Cửa Việt", # llama-bpe fails on this + "Cửa Việt", # jarvis-bpe fails on this " discards", CHK_TXT, ] diff --git a/convert_llama_ggml_to_gguf.py b/convert_jarvis_ggml_to_gguf.py old mode 100755 new mode 100644 similarity index 96% rename from convert_llama_ggml_to_gguf.py rename to convert_jarvis_ggml_to_gguf.py index 29b14e98dd237..788a595cc8549 --- a/convert_llama_ggml_to_gguf.py +++ b/convert_jarvis_ggml_to_gguf.py @@ -223,13 +223,13 @@ def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param" logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}') self.n_kv_head = n_kv_head - self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer) + self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.JARVIS, ggml_model.hyperparameters.n_layer) def save(self): logger.info('* Preparing to save GGUF file') gguf_writer = gguf.GGUFWriter( self.cfg.output, - gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], + gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.JARVIS], use_temp_file = False) self.add_params(gguf_writer) self.add_vocab(gguf_writer) @@ -286,7 +286,7 @@ def add_params(self, gguf_writer): def add_vocab(self, gguf_writer): hp = self.model.hyperparameters - gguf_writer.add_tokenizer_model('llama') + gguf_writer.add_tokenizer_model('jarvis') gguf_writer.add_tokenizer_pre('default') tokens = [] scores = [] @@ -358,7 +358,7 @@ def add_tensors(self, gguf_writer): def handle_metadata(cfg, hp): - import examples.convert_legacy_llama as convert + import examples.convert_legacy_jarvis as convert assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory' hf_config_path = cfg.model_metadata_dir / "config.json" @@ -396,11 +396,11 @@ def handle_args(): parser.add_argument('--desc', help = 'Set model description') parser.add_argument('--gqa', type = int, default = 1, - help = 'grouped-query attention factor (use 8 for LLaMA2 70B)') + help = 'grouped-query attention factor (use 8 for JARVIS2 70B)') parser.add_argument('--eps', default = '5.0e-06', - help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2') + help = 'RMS norm eps: Use 1e-6 for JARVIS1 and OpenJARVIS, use 1e-5 for JARVIS2') parser.add_argument('--context-length', '-c', type=int, default = 2048, - help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096') + help = 'Default max context length: JARVIS1 is typically 2048, JARVIS2 is typically 4096') parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory') parser.add_argument("--vocab-dir", type=Path, @@ -417,7 +417,7 @@ def main(): logger.info(f'* Using config: {cfg}') logger.warning('=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===') if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'): - logger.info('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".') + logger.info('- Note: If converting JARVIS2, specifying "--eps 1e-5" is required. 
70B models also need "--gqa 8".') data = np.memmap(cfg.input, mode = 'r') model = GGMLModel() logger.info('* Scanning GGML input file') diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index bc68f68afb768..f0eabf62bf2a3 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -271,12 +271,12 @@ def parse_args() -> argparse.Namespace: args = parse_args() logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) - ftype_map: dict[str, gguf.LlamaFileType] = { - "f32": gguf.LlamaFileType.ALL_F32, - "f16": gguf.LlamaFileType.MOSTLY_F16, - "bf16": gguf.LlamaFileType.MOSTLY_BF16, - "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, - "auto": gguf.LlamaFileType.GUESSED, + ftype_map: dict[str, gguf.JarvisFileType] = { + "f32": gguf.JarvisFileType.ALL_F32, + "f16": gguf.JarvisFileType.MOSTLY_F16, + "bf16": gguf.JarvisFileType.MOSTLY_BF16, + "q8_0": gguf.JarvisFileType.MOSTLY_Q8_0, + "auto": gguf.JarvisFileType.GUESSED, } ftype = ftype_map[args.outtype] @@ -372,9 +372,9 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: dest = list(super().modify_tensors(data_torch, name, bid)) # some archs may have the same tensor for lm_head and output (tie word embeddings) - # in this case, adapters targeting lm_head will fail when using llama-export-lora + # in this case, adapters targeting lm_head will fail when using jarvis-export-lora # therefore, we ignore them for now - # see: https://github.com/ggerganov/llama.cpp/issues/9065 + # see: https://github.com/ggerganov/jarvis.cpp/issues/9065 if name == "lm_head.weight" and len(dest) == 0: raise ValueError("lm_head is present in adapter, but is ignored in base model") for dest_name, dest_data in dest: diff --git a/docs/android.md b/docs/android.md index 320b62240382f..e4a071396921d 100644 --- a/docs/android.md +++ b/docs/android.md @@ -5,14 +5,14 @@ [Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid. -With Termux, you can install and run `llama.cpp` as if the environment were Linux. Once in the Termux shell: +With Termux, you can install and run `jarvis.cpp` as if the environment were Linux. Once in the Termux shell: ``` $ apt update && apt upgrade -y $ apt install git cmake ``` -Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake. +Then, follow the [build instructions](https://github.com/ggerganov/jarvis.cpp/blob/master/docs/build.md), specifically for CMake. Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance: @@ -20,22 +20,22 @@ Once the binaries are built, download your model of choice (e.g., from Hugging F $ curl -L {model-url} -o ~/{model}.gguf ``` -Then, if you are not already in the repo directory, `cd` into `llama.cpp` and: +Then, if you are not already in the repo directory, `cd` into `jarvis.cpp` and: ``` -$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}" +$ ./build/bin/jarvis-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}" ``` -Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. 
Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal. +Here, we show `jarvis-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal. To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone: https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4 ## Cross-compile using Android NDK -It's possible to build `llama.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.) +It's possible to build `jarvis.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.) -Once you're ready and have cloned `llama.cpp`, invoke the following in the project directory: +Once you're ready and have cloned `jarvis.cpp`, invoke the following in the project directory: ``` $ cmake \ @@ -45,15 +45,15 @@ $ cmake \ -DCMAKE_C_FLAGS="-march=armv8.7a" \ -DCMAKE_CXX_FLAGS="-march=armv8.7a" \ -DGGML_OPENMP=OFF \ - -DGGML_LLAMAFILE=OFF \ + -DGGML_JARVISFILE=OFF \ -B build-android ``` Notes: - While later versions of Android NDK ship with OpenMP, it must still be installed by CMake as a dependency, which is not supported at this time - - `llamafile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/llamafile/issues/325) + - `jarvisfile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/jarvisfile/issues/325) -The above command should configure `llama.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `llama.cpp` includes runtime checks for available CPU features it can use. +The above command should configure `jarvis.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `jarvis.cpp` includes runtime checks for available CPU features it can use. Feel free to adjust the Android ABI for your target. Once the project is configured: @@ -65,17 +65,17 @@ $ cmake --install build-android --prefix {install-dir} --config Release After installing, go ahead and download the model of your choice to your host system. 
Then: ``` -$ adb shell "mkdir /data/local/tmp/llama.cpp" -$ adb push {install-dir} /data/local/tmp/llama.cpp/ -$ adb push {model}.gguf /data/local/tmp/llama.cpp/ +$ adb shell "mkdir /data/local/tmp/jarvis.cpp" +$ adb push {install-dir} /data/local/tmp/jarvis.cpp/ +$ adb push {model}.gguf /data/local/tmp/jarvis.cpp/ $ adb shell ``` In the `adb shell`: ``` -$ cd /data/local/tmp/llama.cpp -$ LD_LIBRARY_PATH=lib ./bin/llama-simple -m {model}.gguf -c {context-size} -p "{your-prompt}" +$ cd /data/local/tmp/jarvis.cpp +$ LD_LIBRARY_PATH=lib ./bin/jarvis-simple -m {model}.gguf -c {context-size} -p "{your-prompt}" ``` That's it! diff --git a/docs/backend/BLIS.md b/docs/backend/BLIS.md index 35d06bd0f303d..7e9048135a2de 100644 --- a/docs/backend/BLIS.md +++ b/docs/backend/BLIS.md @@ -25,13 +25,13 @@ sudo make install We recommend using openmp since it's easier to modify the cores being used. -### llama.cpp compilation +### jarvis.cpp compilation Makefile: ```bash make GGML_BLIS=1 -j -# make GGML_BLIS=1 llama-benchmark-matmult +# make GGML_BLIS=1 jarvis-benchmark-matmult ``` CMake: @@ -43,7 +43,7 @@ cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME .. make -j ``` -### llama.cpp execution +### jarvis.cpp execution According to the BLIS documentation, we could set the following environment variables to modify the behavior of openmp: diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md index 6bdd9d2daab90..ee92299473de0 100644 --- a/docs/backend/CANN.md +++ b/docs/backend/CANN.md @@ -1,4 +1,4 @@ -# llama.cpp for CANN +# jarvis.cpp for CANN - [Background](#background) - [News](#news) @@ -17,9 +17,9 @@ **CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform. -**Llama.cpp + CANN** +**Jarvis.cpp + CANN** -The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly. +The jarvis.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly. ## News @@ -78,11 +78,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi | GritLM-7B | √ | √ | √ | | internlm2_5-7b-chat | √ | √ | √ | | koala-7B-HF | √ | √ | √ | -| Llama-2-7b-chat-hf | √ | √ | √ | -| Llama-3-Smaug-8B | √ | √ | √ | -| Llama2-Chinese-7b-Chat | √ | √ | √ | -| Llama3-8B | √ | √ | √ | -| Llama3-8b-chinese | √ | √ | √ | +| Jarvis-2-7b-chat-hf | √ | √ | √ | +| Jarvis-3-Smaug-8B | √ | √ | √ | +| Jarvis2-Chinese-7b-Chat | √ | √ | √ | +| Jarvis3-8B | √ | √ | √ | +| Jarvis3-8b-chinese | √ | √ | √ | | mamba-130m-hf | √ | √ | √ | | Mistral-7B-Instruct-v0.2 | √ | √ | √ | | Mixtral-8x7B-Instruct-v0.1 | x | √ | √ | @@ -120,9 +120,9 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi ## Docker ### Build Images -You can get a image with llama.cpp in one command. +You can get a image with jarvis.cpp in one command. 
```sh -docker build -t llama-cpp-cann -f .devops/llama-cli-cann.Dockerfile . +docker build -t jarvis-cpp-cann -f .devops/jarvis-cli-cann.Dockerfile . ``` ### Run container @@ -133,7 +133,7 @@ npu-smi info # Select the cards that you want to use, make sure these cards are not used by someone. # Following using cards of device0. -docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:" +docker run --name jarviscpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it jarvis-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:" ``` *Notes:* @@ -208,7 +208,7 @@ docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager Upon a successful installation, CANN is enabled for the available ascend devices. -### II. Build llama.cpp +### II. Build jarvis.cpp ```sh cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release @@ -242,13 +242,13 @@ cmake --build build --config release - Use device 0: ```sh - ./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 + ./build/bin/jarvis-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 ``` - Use multiple devices: ```sh - ./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer + ./build/bin/jarvis-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer ``` ### **GitHub contribution**: diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index ea34182e41a4c..541fe043b23cb 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -1,4 +1,4 @@ -# llama.cpp for SYCL +# jarvis.cpp for SYCL - [Background](#background) - [Recommended Release](#recommended-release) @@ -24,9 +24,9 @@ - **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs. - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets. -### Llama.cpp + SYCL +### Jarvis.cpp + SYCL -The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it also supports other vendor GPUs: Nvidia and AMD. +The jarvis.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it also supports other vendor GPUs: Nvidia and AMD. 
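Both the CANN and SYCL docs above drive device selection through the same `jarvis-cli` flags: `-sm none -mg N` pins a run to one device, while `-sm layer` splits the model across all visible devices. For scripted comparisons of the two modes, a small wrapper can assemble that command line. The sketch below is a minimal illustration, not part of this patch; it assumes a `jarvis-cli` binary at the relative path used in the examples above, and the model path in the usage comment is hypothetical.

```python
import subprocess

def run_jarvis_cli(model_path, prompt, n_predict=400, n_gpu_layers=33, main_gpu=None):
    """Run jarvis-cli pinned to one device (-sm none -mg N) or split by layer (-sm layer)."""
    cmd = ["./build/bin/jarvis-cli", "-m", model_path, "-p", prompt,
           "-n", str(n_predict), "-e", "-ngl", str(n_gpu_layers)]
    if main_gpu is None:
        cmd += ["-sm", "layer"]                       # spread layers over all visible devices
    else:
        cmd += ["-sm", "none", "-mg", str(main_gpu)]  # use a single device
    return subprocess.run(cmd, capture_output=True, text=True, check=True).stdout

# Example usage (hypothetical model path):
# print(run_jarvis_cli("models/jarvis-2-7b.Q4_0.gguf",
#                      "Building a website can be done in 10 simple steps:", main_gpu=0))
```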
## Recommended Release @@ -36,7 +36,7 @@ The following release is verified with good quality: |Commit ID|Tag|Release|Verified Platform| |-|-|-|-| -|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1| +|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[jarvis-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/jarvis.cpp/releases/download/b3038/jarvis-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1| ## News @@ -46,7 +46,7 @@ The following release is verified with good quality: - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs. - 2024.5 - - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770. + - Performance is increased: 34 -> 37 tokens/s of jarvis-2-7b.Q4_0 on Arc770. - Arch Linux is verified successfully. - 2024.4 @@ -54,8 +54,8 @@ The following release is verified with good quality: - 2024.3 - Release binary files of Windows. - - A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd). - - New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437). + - A blog is published: **Run LLM on all Intel GPUs Using jarvis.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-jarvis-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-jarvis-cpp-fd2e2dcbd9bd). + - New base line is ready: [tag b2437](https://github.com/ggerganov/jarvis.cpp/tree/b2437). - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing. - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE. - Support detecting all GPUs with level-zero and same top **Max compute units**. @@ -100,9 +100,9 @@ SYCL backend supports Intel GPU Family: *Notes:* - **Memory** - - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`. + - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/jarvis-cli`. - - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU. + - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *jarvis-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU. - **Execution Unit (EU)** - If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use. @@ -130,14 +130,14 @@ The docker build option is currently limited to *intel GPU* targets. ### Build image ```sh # Using FP16 -docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile . +docker build -t jarvis-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/jarvis-cli-intel.Dockerfile . ``` *Notes*: To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command. -You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative. +You can also use the `.devops/jarvis-server-intel.Dockerfile`, which builds the *"server"* alternative. ### Run container @@ -145,7 +145,7 @@ You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the * # First, find all the DRI cards ls -la /dev/dri # Then, pick the card that you want to use (here for e.g. /dev/dri/card1). 
-docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 +docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 jarvis-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 ``` *Notes:* @@ -276,7 +276,7 @@ For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]: [hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9] ``` -### II. Build llama.cpp +### II. Build jarvis.cpp #### Intel GPU @@ -309,7 +309,7 @@ export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR -# Build LLAMA with Nvidia BLAS acceleration through SYCL +# Build JARVIS with Nvidia BLAS acceleration through SYCL # Option 1: Use FP32 (recommended for better performance in most cases) cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx @@ -329,7 +329,7 @@ export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR -# Build LLAMA with rocBLAS acceleration through SYCL +# Build JARVIS with rocBLAS acceleration through SYCL ## AMD # Use FP32, FP16 is not supported @@ -344,7 +344,7 @@ cmake --build build --config Release -j -v #### Retrieve and prepare model -You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. +You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [jarvis-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Jarvis-2-7B-GGUF/blob/main/jarvis-2-7b.Q4_0.gguf) model as example. ##### Check device @@ -359,7 +359,7 @@ source /opt/intel/oneapi/setvars.sh Similar to the native `sycl-ls`, available SYCL devices can be queried as follow: ```sh -./build/bin/llama-ls-sycl-device +./build/bin/jarvis-ls-sycl-device ``` This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following: @@ -390,12 +390,12 @@ Choose one of following methods to run. - Use device 0: ```sh -./examples/sycl/run-llama2.sh 0 +./examples/sycl/run-jarvis2.sh 0 ``` - Use multiple devices: ```sh -./examples/sycl/run-llama2.sh +./examples/sycl/run-jarvis2.sh ``` 2. 
Command line @@ -418,13 +418,13 @@ Examples: - Use device 0: ```sh -ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 +ZES_ENABLE_SYSMAN=1 ./build/bin/jarvis-cli -m models/jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 ``` - Use multiple devices: ```sh -ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer +ZES_ENABLE_SYSMAN=1 ./build/bin/jarvis-cli -m models/jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer ``` *Notes:* @@ -492,7 +492,7 @@ a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/) -### II. Build llama.cpp +### II. Build jarvis.cpp You could download the release package for Windows directly, which including binary files and depended oneAPI dll files. @@ -506,7 +506,7 @@ Choose one of following methods to build from source code. 2. CMake -On the oneAPI command line window, step into the llama.cpp main directory and run the following: +On the oneAPI command line window, step into the jarvis.cpp main directory and run the following: ``` @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force @@ -524,34 +524,34 @@ Or, use CMake presets to build: ```sh cmake --preset x64-windows-sycl-release -cmake --build build-x64-windows-sycl-release -j --target llama-cli +cmake --build build-x64-windows-sycl-release -j --target jarvis-cli cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release -cmake --build build-x64-windows-sycl-release -j --target llama-cli +cmake --build build-x64-windows-sycl-release -j --target jarvis-cli cmake --preset x64-windows-sycl-debug -cmake --build build-x64-windows-sycl-debug -j --target llama-cli +cmake --build build-x64-windows-sycl-debug -j --target jarvis-cli ``` 3. Visual Studio -You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project. +You can use Visual Studio to open jarvis.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project. *Notes:* -- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`. +- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target jarvis-cli`. ### III. Run the inference #### Retrieve and prepare model -You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. +You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [jarvis-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Jarvis-2-7B-GGUF/blob/main/jarvis-2-7b.Q4_0.gguf) model as example. ##### Check device 1. 
Enable oneAPI running environment -On the oneAPI command line window, run the following and step into the llama.cpp directory: +On the oneAPI command line window, run the following and step into the jarvis.cpp directory: ``` "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 ``` @@ -561,7 +561,7 @@ On the oneAPI command line window, run the following and step into the llama.cpp Similar to the native `sycl-ls`, available SYCL devices can be queried as follow: ``` -build\bin\llama-ls-sycl-device.exe +build\bin\jarvis-ls-sycl-device.exe ``` This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following: @@ -589,7 +589,7 @@ Choose one of following methods to run. 1. Script ``` -examples\sycl\win-run-llama2.bat +examples\sycl\win-run-jarvis2.bat ``` 2. Command line @@ -613,13 +613,13 @@ Examples: - Use device 0: ``` -build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0 +build\bin\jarvis-cli.exe -m models\jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0 ``` - Use multiple devices: ``` -build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer +build\bin\jarvis-cli.exe -m models\jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer ``` @@ -682,13 +682,13 @@ use 1 SYCL GPUs: [0] with Max compute units:512 ``` Otherwise, please double-check the GPU driver installation steps. -- Can I report Ollama issue on Intel GPU to llama.cpp SYCL backend? +- Can I report an Ollama issue on Intel GPU to the jarvis.cpp SYCL backend? - No. We can't support Ollama issue directly, because we aren't familiar with Ollama. + No. We can't support Ollama issues directly, because we aren't familiar with Ollama. - Sugguest reproducing on llama.cpp and report similar issue to llama.cpp. We will surpport it. + We suggest reproducing the issue on jarvis.cpp and reporting a similar issue to jarvis.cpp. We will support it. - It's same for other projects including llama.cpp SYCL backend. + The same applies to other projects that include the jarvis.cpp SYCL backend. - Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer` diff --git a/docs/build.md b/docs/build.md index 4e362ebc78fa3..5fd1e051a1a01 100644 --- a/docs/build.md +++ b/docs/build.md @@ -1,13 +1,13 @@ -# Build llama.cpp locally +# Build jarvis.cpp locally **To get the Code:** ```bash -git clone https://github.com/ggerganov/llama.cpp -cd llama.cpp +git clone https://github.com/ggerganov/jarvis.cpp +cd jarvis.cpp ``` -In order to build llama.cpp you have four different options. +In order to build jarvis.cpp, you have four different options. - Using `make`: - On Linux or MacOS: @@ -21,17 +21,17 @@ In order to build llama.cpp you have four different options. 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). 2. Extract `w64devkit` on your pc. 3. Run `w64devkit.exe`. - 4. Use the `cd` command to reach the `llama.cpp` folder. + 4. Use the `cd` command to reach the `jarvis.cpp` folder. 5. 
From here you can run: ```bash make ``` - Notes: - - For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`. + - For `Q4_0_4_4` quantization type build, add the `GGML_NO_JARVISFILE=1` flag. For example, use `make GGML_NO_JARVISFILE=1`. - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel. - For faster repeated compilation, install [ccache](https://ccache.dev/). - - For debug builds, run `make LLAMA_DEBUG=1` + - For debug builds, run `make JARVIS_DEBUG=1` - Using `CMake`: @@ -42,7 +42,7 @@ In order to build llama.cpp you have four different options. **Notes**: - - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`. + - For `Q4_0_4_4` quantization type build, add the `-DGGML_JARVISFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_JARVISFILE=OFF`. - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel. - For faster repeated compilation, install [ccache](https://ccache.dev/). - For debug builds, there are two cases: @@ -118,7 +118,7 @@ This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS i 4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`. 5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`. 6. Run `w64devkit.exe`. - 7. Use the `cd` command to reach the `llama.cpp` folder. + 7. Use the `cd` command to reach the `jarvis.cpp` folder. 8. From here you can run: ```bash @@ -140,13 +140,13 @@ Check [BLIS.md](./backend/BLIS.md) for more information. SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators. -llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU). +jarvis.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU). -For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md). +For detailed info, please refer to [jarvis.cpp for SYCL](./backend/SYCL.md). ### Intel oneMKL -Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md). +Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [jarvis.cpp for SYCL](./backend/SYCL.md). - Using manual oneAPI installation: By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. 
Otherwise please install oneAPI and follow the below steps: @@ -159,7 +159,7 @@ Building through oneAPI compilers will make avx_vnni instruction set available f - Using oneAPI docker image: If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above. -Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information. +Check [Optimizing and Running JARVIS2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-jarvis2-on-intel-cpu.html) for more information. ### CUDA @@ -300,7 +300,7 @@ Libs: -lvulkan-1 EOF ``` -Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`. +Switch into the `jarvis.cpp` directory and run `make GGML_VULKAN=1`. #### MSYS2 Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies. @@ -311,7 +311,7 @@ Install [MSYS2](https://www.msys2.org/) and then run the following commands in a mingw-w64-ucrt-x86_64-vulkan-devel \ mingw-w64-ucrt-x86_64-shaderc ``` -Switch into `llama.cpp` directory and build using CMake. +Switch into `jarvis.cpp` directory and build using CMake. ```sh cmake -B build -DGGML_VULKAN=ON cmake --build build --config Release @@ -323,10 +323,10 @@ You don't need to install Vulkan SDK. It will be installed inside the container. ```sh # Build the image -docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile . +docker build -t jarvis-cpp-vulkan -f .devops/jarvis-cli-vulkan.Dockerfile . # Then, use it: -docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 +docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 jarvis-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 ``` **Without docker**: @@ -348,13 +348,13 @@ Alternatively your package manager might be able to provide the appropriate libr For example for Ubuntu 22.04 you can install `libvulkan-dev` instead. For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages. -Then, build llama.cpp using the cmake command below: +Then, build jarvis.cpp using the cmake command below: ```bash cmake -B build -DGGML_VULKAN=1 cmake --build build --config Release # Test the output binary (with "-ngl 33" to offload all layers to GPU) -./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4 +./bin/jarvis-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4 # You should see in the output, ggml_vulkan detected your GPU. For example: # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32 @@ -367,7 +367,7 @@ For more information about Ascend NPU in [Ascend Community](https://www.hiascend Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann) -Go to `llama.cpp` directory and build using CMake. +Go to `jarvis.cpp` directory and build using CMake. 
```bash cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release cmake --build build --config release @@ -375,15 +375,15 @@ cmake --build build --config release You can test with: -`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32` +`./build/jarvis-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32` -If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`: +If the following info is output on screen, you are using `jarvis.cpp by CANN backend`: ```bash llm_load_tensors: CANN buffer size = 13313.00 MiB -llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB +jarvis_new_context_with_model: CANN compute buffer size = 1260.81 MiB ``` -For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md). +For detailed info, such as supported models/devices and CANN installation, please refer to [jarvis.cpp for CANN](./backend/CANN.md). ### Android @@ -391,6 +391,6 @@ To read documentation for how to build on Android, [click here](./android.md) ### Arm CPU optimized mulmat kernels -Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats. +Jarvis.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats. -To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`). +To support `Q4_0_4_4`, you must build with `GGML_NO_JARVISFILE=1` (`make`) or `-DGGML_JARVISFILE=OFF` (`cmake`). diff --git a/docs/development/HOWTO-add-model.md b/docs/development/HOWTO-add-model.md index 04c5ccbbe60c3..d72c70b30e5e0 100644 --- a/docs/development/HOWTO-add-model.md +++ b/docs/development/HOWTO-add-model.md @@ -1,9 +1,9 @@ -# Add a new model architecture to `llama.cpp` +# Add a new model architecture to `jarvis.cpp` Adding a model requires few steps: 1. Convert the model to GGUF -2. Define the model architecture in `llama.cpp` +2. Define the model architecture in `jarvis.cpp` 3. Build the GGML graph implementation After following these steps, you can open PR. 
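For step 1, a new Hugging Face architecture usually only needs a converter class registered in convert_hf_to_gguf.py, following the same pattern as `GraniteModel` earlier in this patch. The skeleton below is a minimal sketch, not part of this patch: `MyNewForCausalLM`, `MyNewModel` and the `intermediate_size` lookup are illustrative assumptions, and the snippet presumes the imports and base classes already defined in convert_hf_to_gguf.py.

```python
# Hypothetical converter skeleton for convert_hf_to_gguf.py (illustrative only).
# Assumes Model, JarvisModel, gguf, Tensor and Iterable are already in scope there.

@Model.register("MyNewForCausalLM")          # HF architecture string from config.json (made up here)
class MyNewModel(JarvisModel):               # reuse the JARVIS tensor layout and graph
    model_arch = gguf.MODEL_ARCH.JARVIS      # a genuinely new layout needs its own enum (step 2)

    def set_gguf_parameters(self):
        super().set_gguf_parameters()        # standard hparams handled by the base class
        # write any extra, architecture-specific metadata
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # map HF tensor names to GGUF names; permute q/k like JarvisModel where needed
        return [(self.map_tensor_name(name), data_torch)]
```

Reusing `gguf.MODEL_ARCH.JARVIS` is only valid when the tensor layout actually matches; otherwise the `llm_arch` additions described in step 2 below are also required.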
@@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M ### 1. Convert the model to GGUF This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library. -Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format). +Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_jarvis.py](/examples/convert_legacy_jarvis.py) (for `jarvis/jarvis2` models in `.pth` format). The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors. @@ -81,26 +81,26 @@ Depending on the model configuration, tokenizer, code and tensors layout, you wi NOTE: Tensor names must end with `.weight` suffix, that is the convention and several tools like `quantize` expect this to proceed the weights. -### 2. Define the model architecture in `llama.cpp` +### 2. Define the model architecture in `jarvis.cpp` -The model params and tensors layout must be defined in `llama.cpp`: +The model params and tensors layout must be defined in `jarvis.cpp`: 1. Define a new `llm_arch` 2. Define the tensors layout in `LLM_TENSOR_NAMES` 3. Add any non standard metadata in `llm_load_hparams` 4. Create the tensors for inference in `llm_load_tensors` -5. If the model has a RoPE operation, add the rope type in `llama_rope_type` +5. If the model has a RoPE operation, add the rope type in `jarvis_rope_type` NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions. ### 3. Build the GGML graph implementation -This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`. +This is the funniest part: you have to provide the inference graph implementation of the new model architecture in `jarvis_build_graph`. -Have a look at existing implementation like `build_llama`, `build_dbrx` or `build_bert`. +Have a look at existing implementations like `build_jarvis`, `build_dbrx` or `build_bert`. When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR. -Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/). +Note: to debug the inference graph, you can use [jarvis-eval-callback](/examples/eval-callback/). 
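The reversed dimension order noted above is a common source of confusion when mapping checkpoints. The toy check below is illustrative only (not part of this patch, with arbitrary small sizes); it restates the expert-tensor comment from convert_hf_to_gguf.py earlier in this patch in runnable form.

```python
import torch

# PyTorch shape order, as used by the HF checkpoints: (n_expert, n_ff, n_embd).
# Sizes here are arbitrary placeholders.
n_expert, n_ff, n_embd = 4, 16, 8
experts = torch.zeros(n_expert, n_ff, n_embd)

# ggml lists dimensions in ne[] starting from the fastest-varying one, so the
# same tensor is described as {n_embd, n_ff, n_expert}: the PyTorch shape reversed.
ggml_ne = tuple(reversed(experts.shape))
assert ggml_ne == (n_embd, n_ff, n_expert)
```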
## GGUF specification @@ -108,12 +108,12 @@ https://github.com/ggerganov/ggml/blob/master/docs/gguf.md ## Resources -- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268 -- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009 -- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283 -- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406 -- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423 -- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204 -- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491 -- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515 -- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948 +- YaRN RoPE scaling https://github.com/ggerganov/jarvis.cpp/pull/2268 +- support Baichuan serial models https://github.com/ggerganov/jarvis.cpp/pull/3009 +- support attention bias https://github.com/ggerganov/jarvis.cpp/pull/4283 +- Mixtral support https://github.com/ggerganov/jarvis.cpp/pull/4406 +- BERT embeddings https://github.com/ggerganov/jarvis.cpp/pull/5423 +- Grok-1 support https://github.com/ggerganov/jarvis.cpp/pull/6204 +- Command R Plus support https://github.com/ggerganov/jarvis.cpp/pull/6491 +- support arch DBRX https://github.com/ggerganov/jarvis.cpp/pull/6515 +- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/jarvis.cpp/discussions/2948 diff --git a/docs/development/debugging-tests.md b/docs/development/debugging-tests.md index 18407f688f9db..38b6767622c85 100644 --- a/docs/development/debugging-tests.md +++ b/docs/development/debugging-tests.md @@ -51,7 +51,7 @@ rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug Setup and trigger a build under debug mode. You may adapt the arguments as needed, but in this case these are sane defaults. ```bash -cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON .. +cmake -DCMAKE_BUILD_TYPE=Debug -DJARVIS_CUDA=1 -DJARVIS_FATAL_WARNINGS=ON .. make -j ``` @@ -71,12 +71,12 @@ This may return output similar to below (focusing on key lines to pay attention ```bash ... -1: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf" +1: Test command: ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf" 1: Working Directory: . Labels: main - Test #1: test-tokenizer-0-llama-spm + Test #1: test-tokenizer-0-jarvis-spm ... -4: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-falcon.gguf" +4: Test command: ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-falcon.gguf" 4: Working Directory: . 
Labels: main Test #4: test-tokenizer-0-falcon @@ -86,8 +86,8 @@ Labels: main #### Step 4: Identify Test Command for Debugging So for test #1 above we can tell these two pieces of relevant information: -* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0` -* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf` +* Test Binary: `~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0` +* Test GGUF Model: `~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf` #### Step 5: Run GDB on test command @@ -100,5 +100,5 @@ gdb --args ${Test Binary} ${Test GGUF Model} Example: ```bash -gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf" +gdb --args ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf" ``` diff --git a/docs/development/llama-star/idea-arch.key b/docs/development/jarvis-star/idea-arch.key old mode 100755 new mode 100644 similarity index 100% rename from docs/development/llama-star/idea-arch.key rename to docs/development/jarvis-star/idea-arch.key diff --git a/docs/development/llama-star/idea-arch.pdf b/docs/development/jarvis-star/idea-arch.pdf similarity index 100% rename from docs/development/llama-star/idea-arch.pdf rename to docs/development/jarvis-star/idea-arch.pdf diff --git a/docs/development/token_generation_performance_tips.md b/docs/development/token_generation_performance_tips.md index 41b7232c976b3..62aeb11789fdb 100644 --- a/docs/development/token_generation_performance_tips.md +++ b/docs/development/token_generation_performance_tips.md @@ -1,23 +1,23 @@ # Token generation performance troubleshooting ## Verifying that the model is running on the GPU with CUDA -Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: +Make sure you compiled jarvis with the correct env variables according to [this guide](/docs/build.md#cuda), so that jarvis accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running jarvis, you may configure `N` to be very large, and jarvis will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: ```shell -./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " +./jarvis-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " ``` -When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines: +When running jarvis, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines: ```shell -llama_model_load_internal: [cublas] offloading 60 layers to GPU -llama_model_load_internal: [cublas] offloading output layer to GPU -llama_model_load_internal: [cublas] total VRAM used: 17223 MB +jarvis_model_load_internal: [cublas] offloading 60 layers to GPU +jarvis_model_load_internal: [cublas] offloading output layer to GPU +jarvis_model_load_internal: [cublas] total VRAM used: 17223 MB ... rest of inference ``` If you see these lines, then the GPU is being used. 
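A quick way to check for these lines without reading the whole log is to filter the diagnostic output, for example (a sketch; the exact log prefixes may differ between builds):

```bash
# Run a single-token generation and keep only the offload/VRAM diagnostics
./jarvis-cli -m "path/to/model.gguf" -ngl 200000 -p "test" -n 1 2>&1 | grep -iE "offloading|VRAM"
```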
## Verifying that the CPU is not oversaturated -llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down. +jarvis accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down. # Example of runtime flags effect on inference speed benchmark These runs were tested on the following machine: @@ -27,7 +27,7 @@ RAM: 32GB Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML) -Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]` +Run command: `./jarvis-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]` Result: diff --git a/docs/docker.md b/docs/docker.md index 8d90e6ded5738..4015000245953 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -2,26 +2,26 @@ ## Prerequisites * Docker must be installed and running on your system. -* Create a folder to store big models & intermediate files (ex. /llama/models) +* Create a folder to store big models & intermediate files (ex. /jarvis/models) ## Images We have three Docker images available for this project: -1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`) -2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`) -3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`) +1. `ghcr.io/ggerganov/jarvis.cpp:full`: This image includes both the main executable file and the tools to convert JARVIS models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`) +2. `ghcr.io/ggerganov/jarvis.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`) +3. `ghcr.io/ggerganov/jarvis.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`) Additionally, there the following images, similar to the above: -- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. 
(platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) -- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) -- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) -- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/jarvis.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/jarvis.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/jarvis.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/jarvis.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggerganov/jarvis.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggerganov/jarvis.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggerganov/jarvis.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/jarvis.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/jarvis.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`) The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now). @@ -32,25 +32,25 @@ The easiest way to download the models, convert them to ggml and optimize them i Replace `/path/to/models` below with the actual path where you downloaded the models. ```bash -docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B +docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:full --all-in-one "/models/" 7B ``` On completion, you are ready to play! 
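Before moving on to the run commands below, it can be worth confirming that the all-in-one step actually produced the quantized file those commands expect (the file name here follows the examples in this document; your output names may differ):

```bash
# The run examples below reference the 4-bit file produced by the all-in-one step
ls -lh /path/to/models/7B/ggml-model-q4_0.gguf
```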
```bash -docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 +docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 ``` or with a light image: ```bash -docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 +docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 ``` or with a server image: ```bash -docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 +docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/jarvis.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 ``` ## Docker With CUDA @@ -60,9 +60,9 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia ## Building Docker locally ```bash -docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile . -docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile . -docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile . +docker build -t local/jarvis.cpp:full-cuda -f .devops/full-cuda.Dockerfile . +docker build -t local/jarvis.cpp:light-cuda -f .devops/jarvis-cli-cuda.Dockerfile . +docker build -t local/jarvis.cpp:server-cuda -f .devops/jarvis-server-cuda.Dockerfile . ``` You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture. @@ -74,18 +74,18 @@ The defaults are: The resulting images, are essentially the same as the non-CUDA images: -1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. -2. `local/llama.cpp:light-cuda`: This image only includes the main executable file. -3. `local/llama.cpp:server-cuda`: This image only includes the server executable file. +1. `local/jarvis.cpp:full-cuda`: This image includes both the main executable file and the tools to convert JARVIS models into ggml and convert into 4-bit quantization. +2. `local/jarvis.cpp:light-cuda`: This image only includes the main executable file. +3. `local/jarvis.cpp:server-cuda`: This image only includes the server executable file. ## Usage After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag. 
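Before launching the containers shown below, you may want to confirm that Docker can see the GPU at all. This is a generic nvidia-container-toolkit check, not something defined by this project, and the CUDA image tag is only an example; pick one that matches your driver:

```bash
# Should print the nvidia-smi table from inside a container if the toolkit is set up correctly
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
```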
```bash -docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 -docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 -docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 +docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 ``` ## Docker With MUSA @@ -95,9 +95,9 @@ Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/ ## Building Docker locally ```bash -docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile . -docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile . -docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile . +docker build -t local/jarvis.cpp:full-musa -f .devops/full-musa.Dockerfile . +docker build -t local/jarvis.cpp:light-musa -f .devops/jarvis-cli-musa.Dockerfile . +docker build -t local/jarvis.cpp:server-musa -f .devops/jarvis-server-musa.Dockerfile . ``` You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture. @@ -108,16 +108,16 @@ The defaults are: The resulting images, are essentially the same as the non-MUSA images: -1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. -2. `local/llama.cpp:light-musa`: This image only includes the main executable file. -3. `local/llama.cpp:server-musa`: This image only includes the server executable file. +1. `local/jarvis.cpp:full-musa`: This image includes both the main executable file and the tools to convert JARVIS models into ggml and convert into 4-bit quantization. +2. `local/jarvis.cpp:light-musa`: This image only includes the main executable file. +3. `local/jarvis.cpp:server-musa`: This image only includes the server executable file. ## Usage After building locally, Usage is similar to the non-MUSA examples, but you'll need to set `mthreads` as default Docker runtime. This can be done by executing `(cd /usr/bin/musa && sudo ./docker setup $PWD)` and verifying the changes by executing `docker info | grep mthreads` on the host machine. You will also want to use the `--n-gpu-layers` flag. 
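Similarly, before running the MUSA containers below you can double-check that the `mthreads` runtime was registered and set as the default, following the verification command already mentioned above (the `--format` query is an assumption about your Docker version):

```bash
docker info | grep mthreads                   # the runtime should be listed
docker info --format '{{.DefaultRuntime}}'    # ideally prints: mthreads
```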
```bash -docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 -docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 -docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 +docker run -v /path/to/models:/models local/jarvis.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run -v /path/to/models:/models local/jarvis.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run -v /path/to/models:/models local/jarvis.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 ``` diff --git a/docs/install.md b/docs/install.md index 10a568506835b..e5baee4a7f495 100644 --- a/docs/install.md +++ b/docs/install.md @@ -1,39 +1,39 @@ -# Install pre-built version of llama.cpp +# Install pre-built version of jarvis.cpp ## Homebrew On Mac and Linux, the homebrew package manager can be used via ```sh -brew install llama.cpp +brew install jarvis.cpp ``` -The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668 +The formula is automatically updated with new `jarvis.cpp` releases. More info: https://github.com/ggerganov/jarvis.cpp/discussions/7668 ## Nix On Mac and Linux, the Nix package manager can be used via ```sh -nix profile install nixpkgs#llama-cpp +nix profile install nixpkgs#jarvis-cpp ``` For flake enabled installs. Or ```sh -nix-env --file '' --install --attr llama-cpp +nix-env --file '' --install --attr jarvis-cpp ``` For non-flake enabled installs. -This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164). +This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/jarvis-cpp/package.nix#L164). ## Flox -On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via +On Mac and Linux, Flox can be used to install jarvis.cpp within a Flox environment via ```sh -flox install llama-cpp +flox install jarvis-cpp ``` -Flox follows the nixpkgs build of llama.cpp. +Flox follows the nixpkgs build of jarvis.cpp. 
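Whichever package manager you choose, a simple sanity check that the installed binaries are reachable might look like this; the binary names follow the renaming in this PR, and the exact `--version` output format is not guaranteed:

```bash
# Confirm the CLI and server binaries are on PATH and report a version
command -v jarvis-cli jarvis-server
jarvis-cli --version
```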
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index ead630661c8e2..5755f879a45d0 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -13,10 +13,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() add_subdirectory(cvector-generator) - add_subdirectory(baby-llama) + add_subdirectory(baby-jarvis) add_subdirectory(batched-bench) add_subdirectory(batched) - add_subdirectory(convert-llama2c-to-ggml) + add_subdirectory(convert-jarvis2c-to-ggml) add_subdirectory(embedding) add_subdirectory(eval-callback) add_subdirectory(export-lora) @@ -27,7 +27,7 @@ else() add_subdirectory(gritlm) add_subdirectory(imatrix) add_subdirectory(infill) - add_subdirectory(llama-bench) + add_subdirectory(jarvis-bench) add_subdirectory(llava) add_subdirectory(lookahead) add_subdirectory(lookup) @@ -41,7 +41,7 @@ else() if (GGML_RPC) add_subdirectory(rpc) endif() - if (LLAMA_BUILD_SERVER) + if (JARVIS_BUILD_SERVER) add_subdirectory(server) endif() if (GGML_SYCL) diff --git a/examples/Miku.sh b/examples/Miku.sh index 0f6c8c8787107..1725dbf0099aa 100755 --- a/examples/Miku.sh +++ b/examples/Miku.sh @@ -2,7 +2,7 @@ set -e AI_NAME="${AI_NAME:-Miku}" -MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}" +MODEL="${MODEL:-./models/jarvis-2-7b-chat.ggmlv3.q4_K_M.bin}" USER_NAME="${USER_NAME:-Anon}" # Uncomment and adjust to the number of CPU cores you want to use. @@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then GEN_OPTIONS+=(--threads "$N_THREAD") fi -./llama-cli "${GEN_OPTIONS[@]}" \ +./jarvis-cli "${GEN_OPTIONS[@]}" \ --model "$MODEL" \ --in-prefix " " \ --in-suffix "${AI_NAME}:" \ diff --git a/examples/baby-jarvis/CMakeLists.txt b/examples/baby-jarvis/CMakeLists.txt new file mode 100644 index 0000000000000..a0703600b3d7a --- /dev/null +++ b/examples/baby-jarvis/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET jarvis-baby-jarvis) +add_executable(${TARGET} baby-jarvis.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-jarvis/baby-jarvis.cpp similarity index 96% rename from examples/baby-llama/baby-llama.cpp rename to examples/baby-jarvis/baby-jarvis.cpp index 3ce91070b4ed7..03f22bac8461c 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-jarvis/baby-jarvis.cpp @@ -11,8 +11,8 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -#ifdef LLAMA_DEFAULT_RMS_EPS -constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; +#ifdef JARVIS_DEFAULT_RMS_EPS +constexpr float rms_norm_eps = JARVIS_DEFAULT_RMS_EPS; #else constexpr float rms_norm_eps = 5e-6f; #endif @@ -71,7 +71,7 @@ static struct ggml_tensor * randomize_tensor( return tensor; } -struct llama_hparams { +struct jarvis_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; // this is provided as user input? 
uint32_t n_embd = 4096; @@ -80,17 +80,17 @@ struct llama_hparams { uint32_t n_layer = 32; uint32_t n_rot = 64; - bool operator!=(const llama_hparams & other) const { - return memcmp(this, &other, sizeof(llama_hparams)); + bool operator!=(const jarvis_hparams & other) const { + return memcmp(this, &other, sizeof(jarvis_hparams)); } }; -static uint32_t get_n_ff(const struct llama_hparams* hparams) { +static uint32_t get_n_ff(const struct jarvis_hparams* hparams) { const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; return n_ff; } -struct llama_hparams_lora { +struct jarvis_hparams_lora { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; // this is provided as user input? uint32_t n_embd = 4096; @@ -100,12 +100,12 @@ struct llama_hparams_lora { uint32_t n_rot = 64; uint32_t n_lora = 64; - bool operator!=(const llama_hparams_lora & other) const { - return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0; + bool operator!=(const jarvis_hparams_lora & other) const { + return memcmp(this, &other, sizeof(jarvis_hparams_lora)) != 0; } }; -struct llama_layer { +struct jarvis_layer { // normalization struct ggml_tensor * attention_norm; @@ -124,7 +124,7 @@ struct llama_layer { struct ggml_tensor * w3; }; -struct llama_layer_lora { +struct jarvis_layer_lora { // normalization struct ggml_tensor * attention_norm; @@ -148,34 +148,34 @@ struct llama_layer_lora { }; -struct llama_kv_cache { +struct jarvis_kv_cache { struct ggml_context * ctx = NULL; struct ggml_tensor * k; struct ggml_tensor * v; - // llama_ctx_buffer buf; + // jarvis_ctx_buffer buf; int n; // number of tokens currently in the cache }; -struct llama_model { +struct jarvis_model { struct ggml_context * ctx = NULL; - llama_hparams hparams; + jarvis_hparams hparams; struct ggml_tensor * tok_embeddings; struct ggml_tensor * norm; struct ggml_tensor * output; - std::vector layers; + std::vector layers; }; -struct llama_model_lora { +struct jarvis_model_lora { struct ggml_context * ctx = NULL; - llama_hparams_lora hparams; + jarvis_hparams_lora hparams; struct ggml_tensor * tok_embeddings; @@ -183,10 +183,10 @@ struct llama_model_lora { struct ggml_tensor * outputa; struct ggml_tensor * outputb; - std::vector layers; + std::vector layers; }; -static void init_model(struct llama_model * model) { +static void init_model(struct jarvis_model * model) { const auto & hparams = model->hparams; const uint32_t n_embd = hparams.n_embd; @@ -223,7 +223,7 @@ static void init_model(struct llama_model * model) { } -static void init_model_lora(struct llama_model_lora * model) { +static void init_model_lora(struct jarvis_model_lora * model) { const auto & hparams = model->hparams; const uint32_t n_embd = hparams.n_embd; @@ -266,7 +266,7 @@ static void init_model_lora(struct llama_model_lora * model) { } } -static void set_param_model(struct llama_model * model) { +static void set_param_model(struct jarvis_model * model) { const auto& hparams = model->hparams; const uint32_t n_layer = hparams.n_layer; @@ -292,7 +292,7 @@ static void set_param_model(struct llama_model * model) { } } -static void set_param_model_lora(struct llama_model_lora * model) { +static void set_param_model_lora(struct jarvis_model_lora * model) { const auto& hparams = model->hparams; const uint32_t n_layer = hparams.n_layer; @@ -323,7 +323,7 @@ static void set_param_model_lora(struct llama_model_lora * model) { } } -static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) { +static 
void randomize_model(struct jarvis_model * model, int seed, float mean, float std, float min, float max) { const auto & hparams = model->hparams; const uint32_t n_layer = hparams.n_layer; @@ -355,7 +355,7 @@ static void randomize_model(struct llama_model * model, int seed, float mean, fl static void randomize_model_lora( - struct llama_model_lora * model, int seed, float mean, float std, float min, float max + struct jarvis_model_lora * model, int seed, float mean, float std, float min, float max ) { const auto & hparams = model->hparams; @@ -391,7 +391,7 @@ static void randomize_model_lora( free_random_normal_distribution(rnd); } -static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) { +static void init_kv_cache(struct jarvis_kv_cache* cache, struct jarvis_model * model, int n_batch) { const auto & hparams = model->hparams; const uint32_t n_ctx = hparams.n_ctx; @@ -425,7 +425,7 @@ static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * mod cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); } -static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) { +static bool init_kv_cache_lora(struct jarvis_kv_cache* cache, struct jarvis_model_lora * model, int n_batch) { const auto & hparams = model->hparams; const uint32_t n_ctx = hparams.n_ctx; @@ -462,8 +462,8 @@ static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_ } static struct ggml_tensor * forward( - struct llama_model * model, - struct llama_kv_cache * cache, + struct jarvis_model * model, + struct jarvis_kv_cache * cache, struct ggml_context * ctx0, struct ggml_cgraph * gf, struct ggml_tensor * tokens_input, @@ -472,7 +472,7 @@ static struct ggml_tensor * forward( ) { const int N = n_tokens; - struct llama_kv_cache& kv_self = *cache; + struct jarvis_kv_cache& kv_self = *cache; const auto & hparams = model->hparams; const int n_ctx = hparams.n_ctx; const int n_embd = hparams.n_embd; @@ -692,8 +692,8 @@ static struct ggml_tensor * forward( } static struct ggml_tensor * forward_batch( - struct llama_model * model, - struct llama_kv_cache * cache, + struct jarvis_model * model, + struct jarvis_kv_cache * cache, struct ggml_context * ctx0, struct ggml_cgraph * gf, struct ggml_tensor * tokens_input, @@ -703,7 +703,7 @@ static struct ggml_tensor * forward_batch( ) { const int N = n_tokens; - struct llama_kv_cache& kv_self = *cache; + struct jarvis_kv_cache& kv_self = *cache; const auto & hparams = model->hparams; const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; @@ -989,8 +989,8 @@ static struct ggml_tensor * forward_batch( } static struct ggml_tensor * forward_lora( - struct llama_model_lora * model, - struct llama_kv_cache * cache, + struct jarvis_model_lora * model, + struct jarvis_kv_cache * cache, struct ggml_context * ctx0, struct ggml_cgraph * gf, struct ggml_tensor * tokens_input, @@ -999,7 +999,7 @@ static struct ggml_tensor * forward_lora( ) { const int N = n_tokens; - struct llama_kv_cache& kv_self = *cache; + struct jarvis_kv_cache& kv_self = *cache; const auto & hparams = model->hparams; const int n_ctx = hparams.n_ctx; @@ -1444,7 +1444,7 @@ int main(int argc, char ** argv) { lcparams.mem_buffer = NULL; lcparams.no_alloc = false; - struct llama_model model; + struct jarvis_model model; model.hparams.n_vocab = 8; model.hparams.n_ctx = 8; model.hparams.n_embd = 32; @@ -1467,7 +1467,7 @@ int main(int argc, char ** argv) { randomize_model(&model, 1337, 0.0f, 
1.0f, -1.0f, +1.0f); /* - struct llama_model_lora model_lora; + struct jarvis_model_lora model_lora; // model.hparams.n_vocab = 6; // model.hparams.n_ctx = 64; // model.hparams.n_embd = 128; @@ -1501,7 +1501,7 @@ int main(int argc, char ** argv) { */ int n_batch = 8; // key + value cache for the self attention - struct llama_kv_cache kv_self; + struct jarvis_kv_cache kv_self; printf("init_kv_cache\n"); kv_self.ctx = model.ctx; init_kv_cache(&kv_self, &model, n_batch); @@ -1533,7 +1533,7 @@ int main(int argc, char ** argv) { int n_past = 0; struct ggml_cgraph * gf = NULL; - gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true); + gf = ggml_new_graph_custom(ctx0, JARVIS_TRAIN_MAX_NODES, true); get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets); @@ -1601,7 +1601,7 @@ int main(int argc, char ** argv) { struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph * gf = NULL; - gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true); + gf = ggml_new_graph_custom(ctx0, JARVIS_TRAIN_MAX_NODES, true); int n_past = 0; struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past); diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt deleted file mode 100644 index 71b82105c8863..0000000000000 --- a/examples/baby-llama/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-baby-llama) -add_executable(${TARGET} baby-llama.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/base-translate.sh b/examples/base-translate.sh index 103a52f55e6db..1db10dfd59036 100755 --- a/examples/base-translate.sh +++ b/examples/base-translate.sh @@ -5,7 +5,7 @@ # # Usage: # -# cd llama.cpp +# cd jarvis.cpp # make -j # # ./examples/base-translate.sh "" [extra-main-args] @@ -21,7 +21,7 @@ if [ $# -gt 2 ]; then eargs="${@:3}" fi -ftmp="__llama.cpp_example_tmp__.txt" +ftmp="__jarvis.cpp_example_tmp__.txt" trap "rm -f $ftmp" EXIT echo "Translate from English to French: @@ -58,4 +58,4 @@ echo "$2 model=$1 # generate the most likely continuation until the string "===" is found -./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs +./jarvis-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs diff --git a/examples/batched-bench/CMakeLists.txt b/examples/batched-bench/CMakeLists.txt index 959acaeeebc38..f84e368f22422 100644 --- a/examples/batched-bench/CMakeLists.txt +++ b/examples/batched-bench/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-batched-bench) +set(TARGET jarvis-batched-bench) add_executable(${TARGET} batched-bench.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md index df67c47e378cf..b8d3152666d0d 100644 --- a/examples/batched-bench/README.md +++ b/examples/batched-bench/README.md @@ -1,6 +1,6 @@ -# llama.cpp/example/batched-bench +# jarvis.cpp/example/batched-bench -Benchmark the batched decoding performance of `llama.cpp` +Benchmark the batched decoding performance of `jarvis.cpp` ## Usage @@ -10,16 +10,16 @@ There are 2 modes of operation: - `prompt is shared` - there is a common prompt of 
size `PP` used by all batches (i.e. `N_KV = PP + B*TG`) ```bash -./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps] +./jarvis-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps] -# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared -./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99 +# JARVIS 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared +./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared -./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps +# JARVIS 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared +./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps # custom set of batches -./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 +./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 ``` ## Sample results diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index a3b21ad6bce44..349f16aade71e 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -1,7 +1,7 @@ #include "arg.h" #include "common.h" #include "log.h" -#include "llama.h" +#include "jarvis.h" #include #include @@ -17,7 +17,7 @@ static void print_usage(int, char ** argv) { int main(int argc, char ** argv) { common_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_BENCH, print_usage)) { return 1; } @@ -31,42 +31,42 @@ int main(int argc, char ** argv) { // init LLM - llama_backend_init(); - llama_numa_init(params.numa); + jarvis_backend_init(); + jarvis_numa_init(params.numa); // initialize the model - llama_model_params model_params = common_model_params_to_llama(params); + jarvis_model_params model_params = common_model_params_to_jarvis(params); - llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); + jarvis_model * model = jarvis_load_model_from_file(params.model.c_str(), model_params); if (model == NULL) { fprintf(stderr , "%s: error: unable to load model\n" , __func__); return 1; } - llama_context_params ctx_params = common_context_params_to_llama(params); + jarvis_context_params ctx_params = common_context_params_to_jarvis(params); // ensure enough sequences are available ctx_params.n_seq_max = n_pl.empty() ? 
1 : *std::max_element(n_pl.begin(), n_pl.end()); - llama_context * ctx = llama_new_context_with_model(model, ctx_params); + jarvis_context * ctx = jarvis_new_context_with_model(model, ctx_params); if (ctx == NULL) { - fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + fprintf(stderr , "%s: error: failed to create the jarvis_context\n" , __func__); return 1; } - const int32_t n_kv_max = llama_n_ctx(ctx); + const int32_t n_kv_max = jarvis_n_ctx(ctx); - llama_batch batch = llama_batch_init(n_kv_max, 0, 1); + jarvis_batch batch = jarvis_batch_init(n_kv_max, 0, 1); // decode in batches of ctx_params.n_batch tokens - auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) { + auto decode_helper = [](jarvis_context * ctx, jarvis_batch & batch, int32_t n_batch) { for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); - llama_batch batch_view = { + jarvis_batch batch_view = { n_tokens, batch.token + i, nullptr, @@ -76,13 +76,13 @@ int main(int argc, char ** argv) { batch.logits + i, }; - const int ret = llama_decode(ctx, batch_view); + const int ret = jarvis_decode(ctx, batch_view); if (ret != 0) { LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); return false; } - llama_synchronize(ctx); + jarvis_synchronize(ctx); } return true; @@ -95,7 +95,7 @@ int main(int argc, char ** argv) { } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: jarvis_decode() failed\n", __func__); return 1; } } @@ -132,16 +132,16 @@ int main(int argc, char ** argv) { const auto t_pp_start = ggml_time_us(); - llama_kv_cache_clear(ctx); + jarvis_kv_cache_clear(ctx); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: jarvis_decode() failed\n", __func__); return 1; } if (is_pp_shared) { for (int32_t i = 1; i < pl; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + jarvis_kv_cache_seq_cp(ctx, 0, i, -1, -1); } } @@ -157,7 +157,7 @@ int main(int argc, char ** argv) { } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: jarvis_decode() failed\n", __func__); return 1; } } @@ -189,14 +189,14 @@ int main(int argc, char ** argv) { } LOG("\n"); - llama_perf_context_print(ctx); + jarvis_perf_context_print(ctx); - llama_batch_free(batch); + jarvis_batch_free(batch); - llama_free(ctx); - llama_free_model(model); + jarvis_free(ctx); + jarvis_free_model(model); - llama_backend_free(); + jarvis_backend_free(); LOG("\n\n"); diff --git a/examples/batched.swift/Makefile b/examples/batched.swift/Makefile index 1f9156e583fdd..f6efa6ed62536 100755 --- a/examples/batched.swift/Makefile +++ b/examples/batched.swift/Makefile @@ -1,6 +1,6 @@ .PHONY: build build: - xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build - rm -f ./llama-batched-swift - ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift + xcodebuild -scheme jarvis-batched-swift -destination "generic/platform=macOS" -derivedDataPath build + rm -f ./jarvis-batched-swift + ln -s ./build/Build/Products/Debug/jarvis-batched-swift ./jarvis-batched-swift diff --git a/examples/batched.swift/Package.swift b/examples/batched.swift/Package.swift index 7e8afd0843c5b..8130a77e66ebd 100644 --- a/examples/batched.swift/Package.swift +++ 
b/examples/batched.swift/Package.swift @@ -4,17 +4,17 @@ import PackageDescription let package = Package( - name: "llama-batched-swift", + name: "jarvis-batched-swift", platforms: [.macOS(.v12)], dependencies: [ - .package(name: "llama", path: "../../"), + .package(name: "jarvis", path: "../../"), ], targets: [ // Targets are the basic building blocks of a package, defining a module or a test suite. // Targets can depend on other targets in this package and products from dependencies. .executableTarget( - name: "llama-batched-swift", - dependencies: ["llama"], + name: "jarvis-batched-swift", + dependencies: ["jarvis"], path: "Sources", linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")] ), diff --git a/examples/batched.swift/README.md b/examples/batched.swift/README.md index 7f2e2fcdcf4a7..03ec340ab0522 100644 --- a/examples/batched.swift/README.md +++ b/examples/batched.swift/README.md @@ -1,4 +1,4 @@ This is a swift clone of `examples/batched`. $ `make` -$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]` +$ `./jarvis-batched-swift MODEL_PATH [PROMPT] [PARALLEL]` diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 10f2e7fd117a1..92eedbac7f6e8 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -1,5 +1,5 @@ import Foundation -import llama +import jarvis let arguments = CommandLine.arguments @@ -17,56 +17,56 @@ let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(argu let n_len: Int = 32 // init LLM -llama_backend_init() +jarvis_backend_init() defer { - llama_backend_free() + jarvis_backend_free() } -let model_params = llama_model_default_params() -guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else { +let model_params = jarvis_model_default_params() +guard let model = jarvis_load_model_from_file(modelPath.cString(using: .utf8), model_params) else { print("Failed to load model") exit(1) } defer { - llama_free_model(model) + jarvis_free_model(model) } var tokens = tokenize(text: prompt, add_bos: true) let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel) -var context_params = llama_context_default_params() +var context_params = jarvis_context_default_params() context_params.n_ctx = n_kv_req context_params.n_batch = UInt32(max(n_len, n_parallel)) context_params.n_threads = 8 context_params.n_threads_batch = 8 -let context = llama_new_context_with_model(model, context_params) +let context = jarvis_new_context_with_model(model, context_params) guard context != nil else { print("Failed to initialize context") exit(1) } defer { - llama_free(context) + jarvis_free(context) } -var sparams = llama_sampler_chain_default_params() +var sparams = jarvis_sampler_chain_default_params() -let smpl = llama_sampler_chain_init(sparams) +let smpl = jarvis_sampler_chain_init(sparams) guard smpl != nil else { print("Failed to initialize sampling") exit(1) } defer { - llama_sampler_free(smpl) + jarvis_sampler_free(smpl) } -llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40)); -llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1)); -llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4)); -llama_sampler_chain_add(smpl, llama_sampler_init_dist (1234)); +jarvis_sampler_chain_add(smpl, jarvis_sampler_init_top_k(40)); +jarvis_sampler_chain_add(smpl, jarvis_sampler_init_top_p(0.9, 1)); +jarvis_sampler_chain_add(smpl, jarvis_sampler_init_temp (0.4)); 
+jarvis_sampler_chain_add(smpl, jarvis_sampler_init_dist (1234)); -let n_ctx = llama_n_ctx(context) +let n_ctx = jarvis_n_ctx(context) print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n") @@ -76,15 +76,15 @@ if n_kv_req > n_ctx { } var buffer: [CChar] = [] -for id: llama_token in tokens { +for id: jarvis_token in tokens { print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "") } print("\n") -var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0, 1) +var batch = jarvis_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0, 1) defer { - llama_batch_free(batch) + jarvis_batch_free(batch) } // evaluate the initial prompt @@ -102,16 +102,16 @@ for (i, token) in tokens.enumerated() { batch.logits[i] = 0 } -// llama_decode will output logits only for the last token of the prompt +// jarvis_decode will output logits only for the last token of the prompt batch.logits[Int(batch.n_tokens) - 1] = 1 -if llama_decode(context, batch) != 0 { - print("llama_decode() failed") +if jarvis_decode(context, batch) != 0 { + print("jarvis_decode() failed") exit(1) } for i in 1 ..< n_parallel { - llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens) + jarvis_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens) } if n_parallel > 1 { @@ -138,10 +138,10 @@ while n_cur <= n_len { continue } - let new_token_id = llama_sampler_sample(smpl, context, i_batch[i]) + let new_token_id = jarvis_sampler_sample(smpl, context, i_batch[i]) // is it an end of stream? -> mark the stream as finished - if llama_token_is_eog(model, new_token_id) || n_cur == n_len { + if jarvis_token_is_eog(model, new_token_id) || n_cur == n_len { i_batch[i] = -1 // print("") if n_parallel > 1 { @@ -183,8 +183,8 @@ while n_cur <= n_len { n_cur += 1 // evaluate the current batch with the transformer model - if llama_decode(context, batch) != 0 { - print("llama_decode() failed") + if jarvis_decode(context, batch) != 0 { + print("jarvis_decode() failed") exit(1) } } @@ -200,15 +200,15 @@ let t_main_end = ggml_time_us() print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n") -llama_perf_sampler_print(smpl) -llama_perf_context_print(context) +jarvis_perf_sampler_print(smpl) +jarvis_perf_context_print(context) -private func tokenize(text: String, add_bos: Bool) -> [llama_token] { +private func tokenize(text: String, add_bos: Bool) -> [jarvis_token] { let utf8Count = text.utf8.count let n_tokens = utf8Count + (add_bos ? 1 : 0) - let tokens = UnsafeMutablePointer.allocate(capacity: n_tokens) - let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false) - var swiftTokens: [llama_token] = [] + let tokens = UnsafeMutablePointer.allocate(capacity: n_tokens) + let tokenCount = jarvis_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false) + var swiftTokens: [jarvis_token] = [] for i in 0 ..< tokenCount { swiftTokens.append(tokens[Int(i)]) } @@ -216,13 +216,13 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] { return swiftTokens } -private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? { +private func token_to_piece(token: jarvis_token, buffer: inout [CChar]) -> String? 
{ var result = [CChar](repeating: 0, count: 8) - let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false) + let nTokens = jarvis_token_to_piece(model, token, &result, Int32(result.count), 0, false) if nTokens < 0 { let actualTokensCount = -Int(nTokens) result = .init(repeating: 0, count: actualTokensCount) - let check = llama_token_to_piece( + let check = jarvis_token_to_piece( model, token, &result, diff --git a/examples/batched/CMakeLists.txt b/examples/batched/CMakeLists.txt index 77e33343b6673..9c78d7f13544d 100644 --- a/examples/batched/CMakeLists.txt +++ b/examples/batched/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-batched) +set(TARGET jarvis-batched) add_executable(${TARGET} batched.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/batched/README.md b/examples/batched/README.md index 6013aab01fddc..ebc3ebdab319d 100644 --- a/examples/batched/README.md +++ b/examples/batched/README.md @@ -1,9 +1,9 @@ -# llama.cpp/example/batched +# jarvis.cpp/example/batched The example demonstrates batched generation from a given prompt ```bash -./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 +./jarvis-batched -m ./models/jarvis-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 ... @@ -36,9 +36,9 @@ Hello my name is Cody. I am a 3 year old neutered male. I am a very friendly cat main: decoded 108 tokens in 3.57 s, speed: 30.26 t/s -llama_print_timings: load time = 587.00 ms -llama_print_timings: sample time = 2.56 ms / 112 runs ( 0.02 ms per token, 43664.72 tokens per second) -llama_print_timings: prompt eval time = 4089.11 ms / 118 tokens ( 34.65 ms per token, 28.86 tokens per second) -llama_print_timings: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) -llama_print_timings: total time = 4156.04 ms +jarvis_print_timings: load time = 587.00 ms +jarvis_print_timings: sample time = 2.56 ms / 112 runs ( 0.02 ms per token, 43664.72 tokens per second) +jarvis_print_timings: prompt eval time = 4089.11 ms / 118 tokens ( 34.65 ms per token, 28.86 tokens per second) +jarvis_print_timings: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +jarvis_print_timings: total time = 4156.04 ms ``` diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 3b554033e7ee4..d651730b2c582 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -1,7 +1,7 @@ #include "arg.h" #include "common.h" #include "log.h" -#include "llama.h" +#include "jarvis.h" #include #include @@ -20,7 +20,7 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; params.n_predict = 32; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_COMMON, print_usage)) { return 1; } @@ -34,14 +34,14 @@ int main(int argc, char ** argv) { // init LLM - llama_backend_init(); - llama_numa_init(params.numa); + jarvis_backend_init(); + jarvis_numa_init(params.numa); // initialize the model - llama_model_params model_params = common_model_params_to_llama(params); + jarvis_model_params model_params = common_model_params_to_jarvis(params); - llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); + jarvis_model * model = 
jarvis_load_model_from_file(params.model.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: error: unable to load model\n" , __func__); @@ -50,35 +50,35 @@ int main(int argc, char ** argv) { // tokenize the prompt - std::vector tokens_list; + std::vector tokens_list; tokens_list = common_tokenize(model, params.prompt, true); const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel; // initialize the context - llama_context_params ctx_params = common_context_params_to_llama(params); + jarvis_context_params ctx_params = common_context_params_to_jarvis(params); ctx_params.n_ctx = n_kv_req; ctx_params.n_batch = std::max(n_predict, n_parallel); - llama_context * ctx = llama_new_context_with_model(model, ctx_params); + jarvis_context * ctx = jarvis_new_context_with_model(model, ctx_params); - auto sparams = llama_sampler_chain_default_params(); + auto sparams = jarvis_sampler_chain_default_params(); - llama_sampler * smpl = llama_sampler_chain_init(sparams); + jarvis_sampler * smpl = jarvis_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k)); - llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep)); - llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp)); - llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed)); + jarvis_sampler_chain_add(smpl, jarvis_sampler_init_top_k(params.sparams.top_k)); + jarvis_sampler_chain_add(smpl, jarvis_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep)); + jarvis_sampler_chain_add(smpl, jarvis_sampler_init_temp (params.sparams.temp)); + jarvis_sampler_chain_add(smpl, jarvis_sampler_init_dist (params.sparams.seed)); if (ctx == NULL) { - LOG_ERR("%s: error: failed to create the llama_context\n" , __func__); + LOG_ERR("%s: error: failed to create the jarvis_context\n" , __func__); return 1; } - const int n_ctx = llama_n_ctx(ctx); + const int n_ctx = jarvis_n_ctx(ctx); LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); @@ -97,11 +97,11 @@ int main(int argc, char ** argv) { LOG("%s", common_token_to_piece(ctx, id).c_str()); } - // create a llama_batch + // create a jarvis_batch // we use this object to submit token data for decoding - llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel); + jarvis_batch batch = jarvis_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel); - std::vector seq_ids(n_parallel, 0); + std::vector seq_ids(n_parallel, 0); for (int32_t i = 0; i < n_parallel; ++i) { seq_ids[i] = i; } @@ -112,33 +112,33 @@ int main(int argc, char ** argv) { } GGML_ASSERT(batch.n_tokens == (int) tokens_list.size()); - if (llama_model_has_encoder(model)) { - if (llama_encode(ctx, batch)) { + if (jarvis_model_has_encoder(model)) { + if (jarvis_encode(ctx, batch)) { LOG_ERR("%s : failed to eval\n", __func__); return 1; } - llama_token decoder_start_token_id = llama_model_decoder_start_token(model); + jarvis_token decoder_start_token_id = jarvis_model_decoder_start_token(model); if (decoder_start_token_id == -1) { - decoder_start_token_id = llama_token_bos(model); + decoder_start_token_id = jarvis_token_bos(model); } common_batch_clear(batch); common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false); } - // llama_decode will output logits only for the last token of the prompt + // 
jarvis_decode will output logits only for the last token of the prompt batch.logits[batch.n_tokens - 1] = true; - if (llama_decode(ctx, batch) != 0) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + if (jarvis_decode(ctx, batch) != 0) { + LOG_ERR("%s: jarvis_decode() failed\n", __func__); return 1; } //// assign the system KV cache to all parallel sequences //// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them //for (int32_t i = 1; i < n_parallel; ++i) { - // llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + // jarvis_kv_cache_seq_cp(ctx, 0, i, -1, -1); //} if (n_parallel > 1) { @@ -170,10 +170,10 @@ int main(int argc, char ** argv) { continue; } - const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]); + const jarvis_token new_token_id = jarvis_sampler_sample(smpl, ctx, i_batch[i]); // is it an end of generation? -> mark the stream as finished - if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { + if (jarvis_token_is_eog(model, new_token_id) || n_cur == n_predict) { i_batch[i] = -1; LOG("\n"); if (n_parallel > 1) { @@ -206,7 +206,7 @@ int main(int argc, char ** argv) { n_cur += 1; // evaluate the current batch with the transformer model - if (llama_decode(ctx, batch)) { + if (jarvis_decode(ctx, batch)) { LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); return 1; } @@ -226,18 +226,18 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG("\n"); - llama_perf_sampler_print(smpl); - llama_perf_context_print(ctx); + jarvis_perf_sampler_print(smpl); + jarvis_perf_context_print(ctx); fprintf(stderr, "\n"); - llama_batch_free(batch); + jarvis_batch_free(batch); - llama_sampler_free(smpl); - llama_free(ctx); - llama_free_model(model); + jarvis_sampler_free(smpl); + jarvis_free(ctx); + jarvis_free_model(model); - llama_backend_free(); + jarvis_backend_free(); return 0; } diff --git a/examples/chat-13B.bat b/examples/chat-13B.bat index c5c8ac6efa81a..e398912f0f69b 100644 --- a/examples/chat-13B.bat +++ b/examples/chat-13B.bat @@ -10,7 +10,7 @@ if not "%errorlevel%"=="0" ( if not defined MODEL set "MODEL=models\13B\ggml-model-q4_0.bin" if not defined USER_NAME set "USER_NAME=User" -if not defined AI_NAME set "AI_NAME=ChatLLaMa" +if not defined AI_NAME set "AI_NAME=ChatJarvis" rem Adjust to the number of CPU cores you want to use. rem if not defined N_THREAD set "N_THREAD=8" rem Number of tokens to predict (made it larger than default because we want a long interaction) diff --git a/examples/chat-13B.sh b/examples/chat-13B.sh index 1828903c31670..96785bd4b2ccc 100755 --- a/examples/chat-13B.sh +++ b/examples/chat-13B.sh @@ -7,7 +7,7 @@ cd "$(dirname "$0")/.." || exit MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}" PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt} USER_NAME="${USER_NAME:-USER}" -AI_NAME="${AI_NAME:-ChatLLaMa}" +AI_NAME="${AI_NAME:-ChatJarvis}" # Adjust to the number of CPU cores you want to use. 
N_THREAD="${N_THREAD:-8}" @@ -15,13 +15,13 @@ N_THREAD="${N_THREAD:-8}" N_PREDICTS="${N_PREDICTS:-2048}" # Note: you can also override the generation options by specifying them on the command line: -# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024 +# For example, override the context size by doing: ./chatJarvis --ctx_size 1024 GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}" DATE_TIME=$(date +%H:%M) DATE_YEAR=$(date +%Y) -PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt) +PROMPT_FILE=$(mktemp -t jarviscpp_prompt.XXXXXXX.txt) sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \ -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \ @@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \ $PROMPT_TEMPLATE > $PROMPT_FILE # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS -./llama-cli $GEN_OPTIONS \ +./jarvis-cli $GEN_OPTIONS \ --model "$MODEL" \ --threads "$N_THREAD" \ --n_predict "$N_PREDICTS" \ diff --git a/examples/chat-persistent.sh b/examples/chat-persistent.sh index d9cab9836482e..016e6d06f58e0 100755 --- a/examples/chat-persistent.sh +++ b/examples/chat-persistent.sh @@ -9,10 +9,10 @@ if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then exit 1 fi -MODEL="${MODEL:-./models/llama-13b/ggml-model-q4_0.gguf}" +MODEL="${MODEL:-./models/jarvis-13b/ggml-model-q4_0.gguf}" PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}" USER_NAME="${USER_NAME:-User}" -AI_NAME="${AI_NAME:-ChatLLaMa}" +AI_NAME="${AI_NAME:-ChatJarvis}" DATE_TIME="$(date +%H:%M)" DATE_YEAR="$(date +%Y)" @@ -62,7 +62,7 @@ fi if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then echo 'Prompt cache does not exist, building...' # Default batch_size to 64 here for better user feedback during initial prompt processing - ./llama-cli 2>>"$LOG" \ + ./jarvis-cli 2>>"$LOG" \ --batch_size 64 \ "${OPTS[@]}" \ --prompt-cache "$PROMPT_CACHE_FILE" \ @@ -109,13 +109,13 @@ while read -e line; do printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE" - ./llama-cli 2>>"$LOG" "${OPTS[@]}" \ + ./jarvis-cli 2>>"$LOG" "${OPTS[@]}" \ --prompt-cache "$CUR_PROMPT_CACHE" \ --prompt-cache-all \ --file "$CUR_PROMPT_FILE" \ --reverse-prompt "${USER_NAME}:" \ --n_predict "$n_predict" | - skip_bytes 1 | # skip BOS token added by ./llama-cli + skip_bytes 1 | # skip BOS token added by ./jarvis-cli tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file skip_bytes "$n_prompt_len_pre" # print generation @@ -133,7 +133,7 @@ while read -e line; do # TODO get both messages in one go if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" || ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then - echo >&2 "Couldn't get number of tokens from ./llama-cli output!" + echo >&2 "Couldn't get number of tokens from ./jarvis-cli output!" 
exit 1 fi @@ -144,7 +144,7 @@ while read -e line; do fi # Update cache for next prompt in background, ideally during user input - ./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \ + ./jarvis-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \ --prompt-cache "$NEXT_PROMPT_CACHE" \ --file "$NEXT_PROMPT_FILE" \ --n_predict 1 & diff --git a/examples/chat-vicuna.sh b/examples/chat-vicuna.sh index ffdd200849503..2d059adac0338 100755 --- a/examples/chat-vicuna.sh +++ b/examples/chat-vicuna.sh @@ -15,13 +15,13 @@ N_THREAD="${N_THREAD:-8}" N_PREDICTS="${N_PREDICTS:-2048}" # Note: you can also override the generation options by specifying them on the command line: -# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024 +# For example, override the context size by doing: ./chatJarvis --ctx_size 1024 GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}" DATE_TIME=$(date +%H:%M) DATE_YEAR=$(date +%Y) -PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt) +PROMPT_FILE=$(mktemp -t jarviscpp_prompt.XXXXXXX.txt) sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \ -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \ @@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \ $PROMPT_TEMPLATE > $PROMPT_FILE # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS -./bin/llama-cli $GEN_OPTIONS \ +./bin/jarvis-cli $GEN_OPTIONS \ --model "$MODEL" \ --threads "$N_THREAD" \ --n_predict "$N_PREDICTS" \ diff --git a/examples/chat.sh b/examples/chat.sh index 9f85d1e265d00..0eb4b2e21bbce 100755 --- a/examples/chat.sh +++ b/examples/chat.sh @@ -11,6 +11,6 @@ cd .. # # "--keep 48" is based on the contents of prompts/chat-with-bob.txt # -./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \ +./jarvis-cli -m ./models/jarvis-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \ --repeat_penalty 1.0 --color -i \ -r "User:" -f prompts/chat-with-bob.txt diff --git a/examples/convert-jarvis2c-to-ggml/CMakeLists.txt b/examples/convert-jarvis2c-to-ggml/CMakeLists.txt new file mode 100644 index 0000000000000..f88ca32c7d617 --- /dev/null +++ b/examples/convert-jarvis2c-to-ggml/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET jarvis-convert-jarvis2c-to-ggml) +add_executable(${TARGET} convert-jarvis2c-to-ggml.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/convert-jarvis2c-to-ggml/README.md b/examples/convert-jarvis2c-to-ggml/README.md new file mode 100644 index 0000000000000..0cb1cbbe7cebb --- /dev/null +++ b/examples/convert-jarvis2c-to-ggml/README.md @@ -0,0 +1,28 @@ +## Convert jarvis2.c model to ggml + +This example reads weights from project [jarvis2.c](https://github.com/karpathy/jarvis2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default. 
+ +To convert the model first download the models from the [jarvis2.c](https://github.com/karpathy/jarvis2.c) repository: + +`$ make -j` + +After successful compilation, following usage options are available: +``` +usage: ./jarvis-convert-jarvis2c-to-ggml [options] + +options: + -h, --help show this help message and exit + --copy-vocab-from-model FNAME path of gguf jarvis model or jarvis2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf') + --jarvis2c-model FNAME [REQUIRED] model path from which to load Karpathy's jarvis2.c model + --jarvis2c-output-model FNAME model path to save the converted jarvis2.c model (default ak_jarvis_model.bin') +``` + +An example command using a model from [karpathy/tinyjarviss](https://huggingface.co/karpathy/tinyjarviss) is as follows: + +`$ ./jarvis-convert-jarvis2c-to-ggml --copy-vocab-from-model jarvis-2-7b-chat.gguf.q2_K.bin --jarvis2c-model stories42M.bin --jarvis2c-output-model stories42M.gguf.bin` + +Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyjarviss/stories260K](https://huggingface.co/karpathy/tinyjarviss/tree/main/stories260K). + +Now you can use the model with a command like: + +`$ ./jarvis-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256` diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-jarvis2c-to-ggml/convert-jarvis2c-to-ggml.cpp similarity index 87% rename from examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp rename to examples/convert-jarvis2c-to-ggml/convert-jarvis2c-to-ggml.cpp index 988a584c99a25..6eb760a0939e3 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-jarvis2c-to-ggml/convert-jarvis2c-to-ggml.cpp @@ -1,5 +1,5 @@ #include "ggml.h" -#include "llama.h" +#include "jarvis.h" #include "common.h" #include "log.h" @@ -33,14 +33,14 @@ #define KV_TOKENIZER_PAD_ID "tokenizer.ggml.padding_token_id" #define KV_TOKENIZER_HF_JSON "tokenizer.huggingface.json" -#define KV_CONTEXT_LENGTH "llama.context_length" -#define KV_EMBEDDING_LENGTH "llama.embedding_length" -#define KV_BLOCK_COUNT "llama.block_count" -#define KV_FEED_FORWARD_LENGTH "llama.feed_forward_length" -#define KV_ATTENTION_HEAD_COUNT "llama.attention.head_count" -#define KV_ATTENTION_HEAD_COUNT_KV "llama.attention.head_count_kv" -#define KV_ATTENTION_LAYERNORM_RMS_EPS "llama.attention.layer_norm_rms_epsilon" -#define KV_ROPE_DIMENSION_COUNT "llama.rope.dimension_count" +#define KV_CONTEXT_LENGTH "jarvis.context_length" +#define KV_EMBEDDING_LENGTH "jarvis.embedding_length" +#define KV_BLOCK_COUNT "jarvis.block_count" +#define KV_FEED_FORWARD_LENGTH "jarvis.feed_forward_length" +#define KV_ATTENTION_HEAD_COUNT "jarvis.attention.head_count" +#define KV_ATTENTION_HEAD_COUNT_KV "jarvis.attention.head_count_kv" +#define KV_ATTENTION_LAYERNORM_RMS_EPS "jarvis.attention.layer_norm_rms_epsilon" +#define KV_ROPE_DIMENSION_COUNT "jarvis.rope.dimension_count" #define TN_TOKEN_EMBD "token_embd.weight" #define TN_OUTPUT_NORM "output_norm.weight" @@ -59,15 +59,15 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' -#define LLAMA_FILE_VERSION_GGJT_V3 3 +#define JARVIS_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' +#define JARVIS_FILE_VERSION_GGJT_V3 3 -#define TOKENIZER_NAME "llama" +#define TOKENIZER_NAME "jarvis" #define UNKNOWN_TOKEN_ID 0 #define BOS_TOKEN_ID 1 #define EOS_TOKEN_ID 2 
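The `KV_*` macros above are ordinary GGUF metadata keys; further down in this file the converter writes them into the output header with `gguf_set_val_str` / `gguf_set_val_u32` before serializing the tensors. The sketch below shows only that header-writing mechanism in isolation, assuming the `gguf_*` API declared in `ggml.h`; key strings whose macro definitions are not visible in this hunk follow the usual GGUF naming, and the output filename is illustrative.

```cpp
// Sketch: how the KV_* metadata keys end up in a GGUF header, in isolation.
// Assumes the gguf_* API from ggml.h; key strings not shown in the hunk above
// follow standard GGUF naming, and "demo.gguf" is an illustrative filename.
#include "ggml.h"

int main() {
    struct gguf_context * ctx = gguf_init_empty();

    gguf_set_val_str(ctx, "general.architecture", "jarvis");   // KV_GENERAL_ARCHITECTURE
    gguf_set_val_str(ctx, "tokenizer.ggml.model",  "jarvis");  // KV_TOKENIZER_MODEL = TOKENIZER_NAME

    // the special-token ids defined right above
    gguf_set_val_u32(ctx, "tokenizer.ggml.unknown_token_id", 0); // UNKNOWN_TOKEN_ID
    gguf_set_val_u32(ctx, "tokenizer.ggml.bos_token_id",     1); // BOS_TOKEN_ID
    gguf_set_val_u32(ctx, "tokenizer.ggml.eos_token_id",     2); // EOS_TOKEN_ID

    gguf_write_to_file(ctx, "demo.gguf", /*only_meta =*/ true);
    gguf_free(ctx);

    return 0;
}
```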
-//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc. +//////////////////////////////////////// jarvis2.c model structs and functions to load models, alloc memory etc. typedef struct { int dim; // transformer dimension int hidden_dim; // for ffn layers @@ -201,10 +201,10 @@ static void print_sample_weights(TransformerWeights *w){ //////////////////////////////////////// ggml structs and functions required to load models, configs and save the model. -struct my_llama_vocab { +struct my_jarvis_vocab { using id = int32_t; using token = std::string; - using ttype = llama_token_type; + using ttype = jarvis_token_type; struct token_data { token text; @@ -216,7 +216,7 @@ struct my_llama_vocab { std::vector id_to_token; }; -struct my_llama_hparams { +struct my_jarvis_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; // this is provided as user input? uint32_t n_embd = 4096; @@ -227,12 +227,12 @@ struct my_llama_hparams { uint32_t n_layer = 32; uint32_t n_rot = 64; - bool operator!=(const my_llama_hparams& other) const { - return memcmp(this, &other, sizeof(my_llama_hparams)); + bool operator!=(const my_jarvis_hparams& other) const { + return memcmp(this, &other, sizeof(my_jarvis_hparams)); } }; -struct my_llama_layer { +struct my_jarvis_layer { // normalization struct ggml_tensor * attention_norm; @@ -251,19 +251,19 @@ struct my_llama_layer { struct ggml_tensor * w3; }; -struct my_llama_model { +struct my_jarvis_model { struct ggml_context * ctx = NULL; std::string name; - my_llama_hparams hparams; + my_jarvis_hparams hparams; struct ggml_tensor * tok_embeddings; struct ggml_tensor * norm; struct ggml_tensor * output; - std::vector layers; + std::vector layers; uint32_t train_its = 0; uint32_t train_samples = 0; @@ -272,8 +272,8 @@ struct my_llama_model { struct train_params { const char * fn_vocab_model; - const char * fn_llama2c_model; - const char * fn_llama2c_output_model; + const char * fn_jarvis2c_model; + const char * fn_jarvis2c_output_model; const char * fn_train_data; const char * fn_checkpoint_in; const char * fn_checkpoint_out; @@ -318,7 +318,7 @@ struct train_params { int mem_compute1_gb; }; -static void print_params(struct my_llama_hparams * params) { +static void print_params(struct my_jarvis_hparams * params) { LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab); LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx); LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd); @@ -345,7 +345,7 @@ static void print_tensor_info(const struct ggml_context * ctx) { } } -static void init_model(struct my_llama_model * model) { +static void init_model(struct my_jarvis_model * model) { const auto & hparams = model->hparams; const uint32_t n_embd = hparams.n_embd; @@ -434,12 +434,12 @@ static void print_matrix(struct ggml_tensor * probs) { } } -struct llama_file { +struct jarvis_file { // use FILE * so we don't have to re-open the file to mmap FILE * fp; size_t size; - llama_file(const char * fname, const char * mode) { + jarvis_file(const char * fname, const char * mode) { fp = std::fopen(fname, mode); if (fp == NULL) { size = 0; @@ -500,7 +500,7 @@ struct llama_file { return std::string(chars.data(), len); } - ~llama_file() { + ~jarvis_file() { if (fp) { std::fclose(fp); } @@ -508,7 +508,7 @@ struct llama_file { }; static bool is_ggml_file(const char * filename) { - llama_file file(filename, "rb"); + jarvis_file file(filename, "rb"); if (file.size < 4) { return false; } @@ -516,7 +516,7 @@ static bool is_ggml_file(const char * 
filename) { return magic == GGUF_MAGIC; } -static std::string llama_escape_whitespaces(const std::string & text) { +static std::string jarvis_escape_whitespaces(const std::string & text) { std::ostringstream out; for (char c : text) { if (c == ' ') out << "\xe2\x96\x81"; @@ -525,7 +525,7 @@ static std::string llama_escape_whitespaces(const std::string & text) { return out.str(); } -static void load_vocab(const char * filename, const Config * config, struct my_llama_vocab * vocab) { +static void load_vocab(const char * filename, const Config * config, struct my_jarvis_vocab * vocab) { if (is_ggml_file(filename)) { LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename); struct ggml_context * ctx_data = NULL; @@ -556,7 +556,7 @@ static void load_vocab(const char * filename, const Config * config, struct my_l const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx); if (n_vocab != static_cast(config->vocab_size)) { - die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size); + die_fmt("vocab size mismatch: (gguf) %u != (jarvis2c) %d", n_vocab, config->vocab_size); } vocab->id_to_token.resize(n_vocab); @@ -569,45 +569,45 @@ static void load_vocab(const char * filename, const Config * config, struct my_l auto & token_data = vocab->id_to_token[i]; token_data.text = std::move(word); token_data.score = scores[i]; - token_data.type = (llama_token_type) toktypes[i]; + token_data.type = (jarvis_token_type) toktypes[i]; } ggml_free(ctx_data); gguf_free(ctx); } else { - // assume llama2.c vocabulary - LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); - llama_file file(filename, "rb"); + // assume jarvis2.c vocabulary + LOG_INF("%s: Assuming jarvis2.c vocabulary since %s is not a gguf file\n", __func__, filename); + jarvis_file file(filename, "rb"); if (!file.fp) { die_fmt("%s: %s", strerror(errno), filename); } const int n_vocab = config->vocab_size; /* uint32_t max_token_length = */ file.read_u32(); // unused vocab->id_to_token.resize(n_vocab); - for (my_llama_vocab::id id=0; id", &byte_val) == 1) { // Text of byte tokens is already in the expected format. - type = LLAMA_TOKEN_TYPE_BYTE; + type = JARVIS_TOKEN_TYPE_BYTE; } else { - type = LLAMA_TOKEN_TYPE_NORMAL; + type = JARVIS_TOKEN_TYPE_NORMAL; } - text = llama_escape_whitespaces(text); + text = jarvis_escape_whitespaces(text); vocab->id_to_token[id].text = text; vocab->id_to_token[id].score = score; @@ -630,8 +630,8 @@ static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const floa } } -static void save_as_llama_model( - struct my_llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename +static void save_as_jarvis_model( + struct my_jarvis_vocab * vocab, struct my_jarvis_model * model, TransformerWeights* w, const char * filename ) { // convert AK weights into GG weights one by one. 
// w->token_embedding_table -> model->tok_embeddings @@ -670,8 +670,8 @@ static void save_as_llama_model( std::vector tokens; std::vector scores; - std::vector token_types; - for (const my_llama_vocab::token_data & token_data : vocab->id_to_token) { + std::vector token_types; + for (const my_jarvis_vocab::token_data & token_data : vocab->id_to_token) { tokens.push_back(token_data.text.c_str()); scores.push_back(token_data.score); token_types.push_back(token_data.type); @@ -682,8 +682,8 @@ static void save_as_llama_model( gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME); - gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama"); - gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama"); + gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "jarvis"); + gguf_set_val_str(ctx, KV_GENERAL_NAME, "jarvis"); // special tokens gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID); @@ -750,7 +750,7 @@ static void save_as_llama_model( static struct train_params get_default_train_params() { struct train_params params; params.fn_vocab_model = "models/7B/ggml-model-f16.gguf"; - params.fn_llama2c_output_model = "ak_llama_model.bin"; + params.fn_jarvis2c_output_model = "ak_jarvis_model.bin"; params.fn_train_data = "shakespeare.txt"; params.fn_checkpoint_in = "checkpoint.bin"; params.fn_checkpoint_out = "checkpoint.bin"; @@ -802,9 +802,9 @@ static void print_usage(int /*argc*/, char ** argv, const struct train_params * fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model); - fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n"); - fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model); + fprintf(stderr, " --copy-vocab-from-model FNAME path of gguf jarvis model or jarvis2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model); + fprintf(stderr, " --jarvis2c-model FNAME [REQUIRED] model path from which to load Karpathy's jarvis2.c model\n"); + fprintf(stderr, " --jarvis2c-output-model FNAME model path to save the converted jarvis2.c model (default %s')\n", params->fn_jarvis2c_output_model); fprintf(stderr, "\n"); } @@ -827,19 +827,19 @@ static bool params_parse(int argc, char ** argv, struct train_params * params) { break; } params->fn_vocab_model = argv[i]; - } else if (arg == "--llama2c-model") { + } else if (arg == "--jarvis2c-model") { if (++i >= argc) { invalid_param = true; break; } reqd_param_found = true; - params->fn_llama2c_model = argv[i]; - } else if (arg == "--llama2c-output-model") { + params->fn_jarvis2c_model = argv[i]; + } else if (arg == "--jarvis2c-output-model") { if (++i >= argc) { invalid_param = true; break; } - params->fn_llama2c_output_model = argv[i]; + params->fn_jarvis2c_output_model = argv[i]; } else if (arg == "-h" || arg == "--help") { print_usage(argc, argv, &default_params); exit(0); @@ -855,7 +855,7 @@ static bool params_parse(int argc, char ** argv, struct train_params * params) { exit(1); } if (!reqd_param_found){ - fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n"); + fprintf(stderr, "error: please specify a jarvis2.c .bin file to be converted with argument --jarvis2c-model\n"); print_usage(argc, argv, 
&default_params); exit(1); } @@ -882,15 +882,15 @@ int main(int argc, char ** argv) { Config config; TransformerWeights weights = {}; { - LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); - FILE * file = fopen(params.fn_llama2c_model, "rb"); + LOG_INF("%s: Loading jarvis2c model from %s\n", __func__, params.fn_jarvis2c_model); + FILE * file = fopen(params.fn_jarvis2c_model, "rb"); if (!file) { - LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); + LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_jarvis2c_model); return 1; } // read in the config header if (fread(&config, sizeof(Config), 1, file) != 1) { - LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); + LOG_ERR("%s: Unable to read jarvis2c config from %s!\n",__func__,params.fn_jarvis2c_model); return 1; } auto shared_weights = config.vocab_size > 0; @@ -899,17 +899,17 @@ int main(int argc, char ** argv) { // read in the Transformer weights alloc_weights(&weights, &config, shared_weights); if (checkpoint_init_weights(&weights, &config, file, shared_weights)) { - LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); + LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_jarvis2c_model); return 1; } fclose(file); } - struct my_llama_vocab vocab; + struct my_jarvis_vocab vocab; load_vocab(params.fn_vocab_model, &config, &vocab); - struct my_llama_model model; - model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); + struct my_jarvis_model model; + model.hparams.n_vocab = config.vocab_size; //jarvis_n_vocab(lctx); model.hparams.n_ctx = params.n_ctx; model.hparams.n_embd = config.dim; //params.n_embd; model.hparams.n_ff = config.hidden_dim; @@ -929,10 +929,10 @@ int main(int argc, char ** argv) { model.ctx = ggml_init(lcparams); init_model(&model); - model.name = basename(params.fn_llama2c_model); - save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model); + model.name = basename(params.fn_jarvis2c_model); + save_as_jarvis_model(&vocab, &model, &weights, params.fn_jarvis2c_output_model); - LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); + LOG_INF("%s: Saving jarvis.c model file %s in ggml format at %s\n", __func__, params.fn_jarvis2c_model, params.fn_jarvis2c_output_model); ggml_free(model.ctx); return 0; diff --git a/examples/convert-llama2c-to-ggml/CMakeLists.txt b/examples/convert-llama2c-to-ggml/CMakeLists.txt deleted file mode 100644 index a6790e617217e..0000000000000 --- a/examples/convert-llama2c-to-ggml/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-convert-llama2c-to-ggml) -add_executable(${TARGET} convert-llama2c-to-ggml.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md deleted file mode 100644 index 5774ac83c32c8..0000000000000 --- a/examples/convert-llama2c-to-ggml/README.md +++ /dev/null @@ -1,28 +0,0 @@ -## Convert llama2.c model to ggml - -This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default. 
- -To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository: - -`$ make -j` - -After successful compilation, following usage options are available: -``` -usage: ./llama-convert-llama2c-to-ggml [options] - -options: - -h, --help show this help message and exit - --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf') - --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model - --llama2c-output-model FNAME model path to save the converted llama2.c model (default ak_llama_model.bin') -``` - -An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows: - -`$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin` - -Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K). - -Now you can use the model with a command like: - -`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256` diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py index 9ab9ab06edf8f..df8508790211a 100755 --- a/examples/convert_legacy_llama.py +++ b/examples/convert_legacy_llama.py @@ -33,7 +33,7 @@ sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py')) import gguf -from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab +from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, JarvisHfVocab if TYPE_CHECKING: from typing_extensions import Self, TypeAlias @@ -45,7 +45,7 @@ NDArray: TypeAlias = 'np.ndarray[Any, Any]' -ARCH = gguf.MODEL_ARCH.LLAMA +ARCH = gguf.MODEL_ARCH.JARVIS DEFAULT_CONCURRENCY = 8 @@ -130,8 +130,8 @@ def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]: 'I32': DT_I32, } -# TODO: match this with `llama_ftype` -# TODO: rename to LLAMAFileType +# TODO: match this with `jarvis_ftype` +# TODO: rename to JARVISFileType # TODO: move to `gguf.py` @@ -276,7 +276,7 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: rope_finetuned = rope_finetuned, ) - # LLaMA v2 70B params.json + # JARVIS v2 70B params.json # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} @staticmethod def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: @@ -288,18 +288,18 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: f_rope_freq_base = None n_ff = None - # hack to determine LLaMA v1 vs v2 vs CodeLlama + # hack to determine JARVIS v1 vs v2 vs CodeJarvis if config.get("moe"): # Mixtral n_ctx = 32768 elif config.get("rope_theta") == 1000000: - # CodeLlama + # CodeJarvis n_ctx = 16384 elif config["norm_eps"] == 1e-05: - # LLaMA v2 + # JARVIS v2 n_ctx = 4096 else: - # LLaMA v1 + # JARVIS v1 n_ctx = 2048 if "layers.0.feed_forward.w1.weight" in model: @@ -467,7 +467,7 @@ class ModelPlus: def merge_sharded(models: list[LazyModel]) -> LazyModel: - # Original LLaMA models have each file contain one part of each tensor. + # Original JARVIS models have each file contain one part of each tensor. # Use a dict instead of a set to preserve order. 
names = {name: None for model in models for name in model} @@ -772,14 +772,14 @@ def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian. def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None: # Metadata About The Model And Its Provenence - name = "LLaMA" + name = "JARVIS" if metadata is not None and metadata.name is not None: name = metadata.name elif params.path_model is not None: name = params.path_model.name elif params.n_ctx == 4096: - # Heuristic detection of LLaMA v2 model - name = "LLaMA v2" + # Heuristic detection of JARVIS v2 model + name = "JARVIS v2" self.gguf.add_name(name) @@ -1199,7 +1199,7 @@ def load_some_model(path: Path) -> ModelPlus: class VocabFactory: - _VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab] + _VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, JarvisHfVocab] def __init__(self, path: Path): self.path = path @@ -1289,7 +1289,7 @@ def main(args_in: list[str] | None = None) -> None: if np.uint32(1) == np.uint32(1).newbyteorder("<"): # We currently only support Q8_0 output on little endian systems. output_choices.append("q8_0") - parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file") + parser = argparse.ArgumentParser(description="Convert a JARVIS model to a GGML compatible file") parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") @@ -1366,8 +1366,8 @@ def main(args_in: list[str] | None = None) -> None: msg = """\ The model doesn't have a context size, and you didn't specify one with --ctx Please specify one with --ctx: - - LLaMA v1: --ctx 2048 - - LLaMA v2: --ctx 4096""" + - JARVIS v1: --ctx 2048 + - JARVIS v2: --ctx 4096""" parser.error(textwrap.dedent(msg)) params.n_ctx = args.ctx diff --git a/examples/cvector-generator/CMakeLists.txt b/examples/cvector-generator/CMakeLists.txt index 0a559d60c2a6d..ed3bb6abba599 100644 --- a/examples/cvector-generator/CMakeLists.txt +++ b/examples/cvector-generator/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-cvector-generator) +set(TARGET jarvis-cvector-generator) add_executable(${TARGET} cvector-generator.cpp pca.hpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/cvector-generator/README.md b/examples/cvector-generator/README.md index be4dd5250f15f..e7a4f734761e6 100644 --- a/examples/cvector-generator/README.md +++ b/examples/cvector-generator/README.md @@ -3,24 +3,24 @@ This example demonstrates how to generate a control vector using gguf models. 
Related PRs: -- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970) -- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880) -- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514) +- [Add support for control vectors](https://github.com/ggerganov/jarvis.cpp/pull/5970) +- (Issue) [Generate control vector using jarvis.cpp](https://github.com/ggerganov/jarvis.cpp/issues/6880) +- [Add cvector-generator example](https://github.com/ggerganov/jarvis.cpp/pull/7514) ## Examples ```sh # CPU only -./cvector-generator -m ./llama-3.Q4_K_M.gguf +./cvector-generator -m ./jarvis-3.Q4_K_M.gguf # With GPU -./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 +./cvector-generator -m ./jarvis-3.Q4_K_M.gguf -ngl 99 # With advanced options -./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100 +./cvector-generator -m ./jarvis-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100 # Using mean value instead of PCA -./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean +./cvector-generator -m ./jarvis-3.Q4_K_M.gguf --method mean # To see help message ./cvector-generator -h @@ -36,10 +36,10 @@ If you have multiple lines per prompt, you can escape the newline character (cha <|im_start|>system\nYou are in a very good mood today<|im_end|> ``` -Example to use output file with `llama-cli`: +Example to use output file with `jarvis-cli`: (Tips: The control vector works better when apply to layers higher than 10) ```sh -./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31 +./jarvis-cli -m ./jarvis-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31 ``` diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index d1731bba64e1b..e09304aed1058 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -1,6 +1,6 @@ #include "arg.h" #include "common.h" -#include "llama.h" +#include "jarvis.h" #include "ggml.h" #include "pca.hpp" #include "mean.hpp" @@ -28,7 +28,7 @@ // utils template -static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { +static std::string tokens_to_str(jarvis_context * ctx, Iter begin, Iter end) { std::string ret; for (; begin != end; ++begin) { ret += common_token_to_piece(ctx, *begin); @@ -39,10 +39,10 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { static void print_usage(int, char ** argv) { printf("\nexample usage:\n"); - printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]); - printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]); - printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]); - printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]); + printf("\n CPU only: %s -m ./jarvis-3.Q4_K_M.gguf\n", argv[0]); + printf("\n with GPU: %s -m 
./jarvis-3.Q4_K_M.gguf -ngl 99\n", argv[0]); + printf("\n advanced: %s -m ./jarvis-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]); + printf("\n using mean: %s -m ./jarvis-3.Q4_K_M.gguf --method mean\n", argv[0]); printf("\n"); } @@ -266,12 +266,12 @@ struct train_context { }; struct tokenized_prompt { - std::vector tokens_pos; - std::vector tokens_neg; + std::vector tokens_pos; + std::vector tokens_neg; size_t max_seq_len; - tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { - const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); + tokenized_prompt(jarvis_context * ctx, std::string pos, std::string neg) { + const bool add_bos = jarvis_add_bos_token(jarvis_get_model(ctx)); tokens_pos = common_tokenize(ctx, pos, add_bos, true); tokens_neg = common_tokenize(ctx, neg, add_bos, true); max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); @@ -279,10 +279,10 @@ struct tokenized_prompt { padding_seq(ctx, tokens_neg, max_seq_len); } - void padding_seq(llama_context * ctx, std::vector & tokens, size_t len) { + void padding_seq(jarvis_context * ctx, std::vector & tokens, size_t len) { // TODO: customize padding token - std::vector pad_tokens = common_tokenize(ctx, " ", false); - llama_token pad_tok = pad_tokens.back(); + std::vector pad_tokens = common_tokenize(ctx, " ", false); + jarvis_token pad_tok = pad_tokens.back(); while (tokens.size() < len) { tokens.push_back(pad_tok); } @@ -337,9 +337,9 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { return true; } -static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { - llama_kv_cache_clear(ctx); - if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { +static bool get_hidden_layers(jarvis_context * ctx, std::vector & tokens) { + jarvis_kv_cache_clear(ctx); + if (jarvis_decode(ctx, jarvis_batch_get_one(tokens.data(), tokens.size()))) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; } @@ -390,7 +390,7 @@ static int prepare_entries(common_params & params, train_context & ctx_train) { int main(int argc, char ** argv) { common_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { return 1; } @@ -409,21 +409,21 @@ int main(int argc, char ** argv) { params.warmup = false; print_build_info(); - llama_backend_init(); - llama_numa_init(params.numa); + jarvis_backend_init(); + jarvis_numa_init(params.numa); // load the model to get hparams - common_init_result llama_init = common_init_from_params(params); + common_init_result jarvis_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + jarvis_model * model = jarvis_init.model; + jarvis_context * ctx = jarvis_init.context; - // int n_ctx = llama_n_ctx(ctx); - int n_layers = llama_n_layer(model); - int n_embd = llama_n_embd(model); + // int n_ctx = jarvis_n_ctx(ctx); + int n_layers = jarvis_n_layer(model); + int n_embd = jarvis_n_embd(model); // get model hint param (a.k.a model arch name) char model_hint[128]; - llama_model_meta_val_str(model, "general.architecture", model_hint, 128); + jarvis_model_meta_val_str(model, "general.architecture", model_hint, 128); // init train_context train_context ctx_train(n_embd, n_layers); @@ -474,8 +474,8 @@ int main(int argc, char ** argv) { // done with the model, we can now free it to make gain some memory 
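// (at this point get_hidden_layers() has decoded every positive/negative prompt once
//  and the cb_eval callback has copied the per-layer hidden states into ctx_train, so
//  the model and context are not needed for the PCA / mean reduction that follows;
//  freeing them here lowers peak memory before that step)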
printf("Done evaluate prompts, unload model...\n"); - llama_free(ctx); - llama_free_model(model); + jarvis_free(ctx); + jarvis_free_model(model); bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA; @@ -497,7 +497,7 @@ int main(int argc, char ** argv) { // write output vectors to gguf export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint); - llama_backend_free(); + jarvis_backend_free(); return 0; } diff --git a/examples/cvector-generator/mean.hpp b/examples/cvector-generator/mean.hpp index 16be5ce3eecf1..f95fb2dcce6cf 100644 --- a/examples/cvector-generator/mean.hpp +++ b/examples/cvector-generator/mean.hpp @@ -1,5 +1,5 @@ #include "common.h" -#include "llama.h" +#include "jarvis.h" #include "ggml.h" #include diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index f6e307fbc4970..3ea5dc4738570 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -1,5 +1,5 @@ #include "common.h" -#include "llama.h" +#include "jarvis.h" #include "ggml.h" #ifdef GGML_USE_CUDA @@ -290,7 +290,7 @@ static void power_iteration( ggml_gallocr_free(allocr); // TODO @ngxson : The output vector is randomly inverted - // Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171 + // Solution: https://github.com/ggerganov/jarvis.cpp/pull/8069#issuecomment-2185328171 } static void run_pca( diff --git a/examples/deprecation-warning/README.md b/examples/deprecation-warning/README.md index 59918ec2bbf72..2790c72fb7052 100644 --- a/examples/deprecation-warning/README.md +++ b/examples/deprecation-warning/README.md @@ -1,7 +1,7 @@ # Migration notice for binary filenames > [!IMPORTANT] -[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809) +[2024 Jun 12] Binaries have been renamed w/ a `jarvis-` prefix. `main` is now `jarvis-cli`, `server` is `jarvis-server`, etc (https://github.com/ggerganov/jarvis.cpp/pull/7809) This migration was important, but it is a breaking change that may not always be immediately obvious to users. @@ -9,41 +9,41 @@ Please update all scripts and workflows to use the new binary names. 
| Old Filename | New Filename | | ---- | ---- | -| main | llama-cli | -| server | llama-server | -| llama-bench | llama-bench | -| embedding | llama-embedding | -| quantize | llama-quantize | -| tokenize | llama-tokenize | -| export-lora | llama-export-lora | +| main | jarvis-cli | +| server | jarvis-server | +| jarvis-bench | jarvis-bench | +| embedding | jarvis-embedding | +| quantize | jarvis-quantize | +| tokenize | jarvis-tokenize | +| export-lora | jarvis-export-lora | | libllava.a | libllava.a | -| baby-llama | llama-baby-llama | -| batched | llama-batched | -| batched-bench | llama-batched-bench | -| benchmark-matmult | llama-benchmark-matmult | -| convert-llama2c-to-ggml | llama-convert-llama2c-to-ggml | -| eval-callback | llama-eval-callback | -| gbnf-validator | llama-gbnf-validator | -| gguf | llama-gguf | -| gguf-split | llama-gguf-split | -| gritlm | llama-gritlm | -| imatrix | llama-imatrix | -| infill | llama-infill | -| llava-cli | llama-llava-cli | -| lookahead | llama-lookahead | -| lookup | llama-lookup | -| lookup-create | llama-lookup-create | -| lookup-merge | llama-lookup-merge | -| lookup-stats | llama-lookup-stats | -| parallel | llama-parallel | -| passkey | llama-passkey | -| perplexity | llama-perplexity | -| q8dot | llama-q8dot | -| quantize-stats | llama-quantize-stats | -| retrieval | llama-retrieval | -| save-load-state | llama-save-load-state | -| simple | llama-simple | -| speculative | llama-speculative | -| vdot | llama-vdot | +| baby-jarvis | jarvis-baby-jarvis | +| batched | jarvis-batched | +| batched-bench | jarvis-batched-bench | +| benchmark-matmult | jarvis-benchmark-matmult | +| convert-jarvis2c-to-ggml | jarvis-convert-jarvis2c-to-ggml | +| eval-callback | jarvis-eval-callback | +| gbnf-validator | jarvis-gbnf-validator | +| gguf | jarvis-gguf | +| gguf-split | jarvis-gguf-split | +| gritlm | jarvis-gritlm | +| imatrix | jarvis-imatrix | +| infill | jarvis-infill | +| llava-cli | jarvis-llava-cli | +| lookahead | jarvis-lookahead | +| lookup | jarvis-lookup | +| lookup-create | jarvis-lookup-create | +| lookup-merge | jarvis-lookup-merge | +| lookup-stats | jarvis-lookup-stats | +| parallel | jarvis-parallel | +| passkey | jarvis-passkey | +| perplexity | jarvis-perplexity | +| q8dot | jarvis-q8dot | +| quantize-stats | jarvis-quantize-stats | +| retrieval | jarvis-retrieval | +| save-load-state | jarvis-save-load-state | +| simple | jarvis-simple | +| speculative | jarvis-speculative | +| vdot | jarvis-vdot | | tests/test-c.o | tests/test-c.o | diff --git a/examples/deprecation-warning/deprecation-warning.cpp b/examples/deprecation-warning/deprecation-warning.cpp index 11b35d2c22500..088364cd4105c 100644 --- a/examples/deprecation-warning/deprecation-warning.cpp +++ b/examples/deprecation-warning/deprecation-warning.cpp @@ -17,18 +17,18 @@ int main(int argc, char** argv) { filename = filename.substr(pos+1); } - // Append "llama-" to the beginning of filename to get the replacemnt filename - auto replacement_filename = "llama-" + filename; + // Append "jarvis-" to the beginning of filename to get the replacemnt filename + auto replacement_filename = "jarvis-" + filename; - // The exception is if the filename is "main", then our replacement filename is "llama-cli" + // The exception is if the filename is "main", then our replacement filename is "jarvis-cli" if (filename == "main") { - replacement_filename = "llama-cli"; + replacement_filename = "jarvis-cli"; } fprintf(stdout, "\n"); fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", 
filename.c_str()); fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str()); - fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n"); + fprintf(stdout, " See https://github.com/ggerganov/jarvis.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n"); fprintf(stdout, "\n"); return EXIT_FAILURE; diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 8256e789ad33a..3c43d82e38f4f 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-embedding) +set(TARGET jarvis-embedding) add_executable(${TARGET} embedding.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/embedding/README.md b/examples/embedding/README.md index 12b372bf1df42..40589f6ce4f81 100644 --- a/examples/embedding/README.md +++ b/examples/embedding/README.md @@ -1,6 +1,6 @@ -# llama.cpp/example/embedding +# jarvis.cpp/example/embedding -This example demonstrates generate high-dimensional embedding vector of a given text with llama.cpp. +This example demonstrates generate high-dimensional embedding vector of a given text with jarvis.cpp. ## Quick Start @@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor ### Unix-based systems (Linux, macOS, etc.): ```bash -./llama-embedding -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>/dev/null +./jarvis-embedding -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>/dev/null ``` ### Windows: ```powershell -llama-embedding.exe -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>$null +jarvis-embedding.exe -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>$null ``` The above command will output space-separated float values. @@ -50,11 +50,11 @@ The above command will output space-separated float values. 
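For readers following the `embedding.cpp` changes further down rather than the CLI: after decoding, the example pulls either one embedding per token or one pooled embedding per sequence, depending on the pooling mode. Below is a rough sketch of that retrieval step, assuming the renamed functions exactly as they appear in the `batch_decode` hunk below; batch setup, normalization and error handling are omitted.

```cpp
// Sketch of the embedding-retrieval step from batch_decode() in embedding.cpp
// below. Assumes ctx and batch were prepared as in that file; normalization
// and error handling are omitted.
#include "jarvis.h"

static const float * get_embedding(jarvis_context * ctx, const jarvis_batch & batch, int i) {
    const enum jarvis_pooling_type pooling_type = jarvis_pooling_type(ctx);

    if (pooling_type == JARVIS_POOLING_TYPE_NONE) {
        // no pooling: one embedding vector per token position
        return jarvis_get_embeddings_ith(ctx, i);
    }

    // pooled (mean, cls, rank, ...): one embedding vector per sequence
    return jarvis_get_embeddings_seq(ctx, batch.seq_id[i][0]);
}
```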
### Unix-based systems (Linux, macOS, etc.): ```bash -./llama-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null +./jarvis-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null ``` ### Windows: ```powershell -llama-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null +jarvis-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null ``` diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 3f18fc6a70878..77dafad011a79 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,7 +1,7 @@ #include "arg.h" #include "common.h" #include "log.h" -#include "llama.h" +#include "jarvis.h" #include @@ -25,30 +25,30 @@ static std::vector split_lines(const std::string & s, const std::st return lines; } -static void batch_add_seq(llama_batch & batch, const std::vector & tokens, llama_seq_id seq_id) { +static void batch_add_seq(jarvis_batch & batch, const std::vector & tokens, jarvis_seq_id seq_id) { size_t n_tokens = tokens.size(); for (size_t i = 0; i < n_tokens; i++) { common_batch_add(batch, tokens[i], i, { seq_id }, true); } } -static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) { - const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); - const struct llama_model * model = llama_get_model(ctx); +static void batch_decode(jarvis_context * ctx, jarvis_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) { + const enum jarvis_pooling_type pooling_type = jarvis_pooling_type(ctx); + const struct jarvis_model * model = jarvis_get_model(ctx); // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); + jarvis_kv_cache_clear(ctx); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); - if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) { + if (jarvis_model_has_encoder(model) && !jarvis_model_has_decoder(model)) { // encoder-only model - if (llama_encode(ctx, batch) < 0) { + if (jarvis_encode(ctx, batch) < 0) { LOG_ERR("%s : failed to encode\n", __func__); } - } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) { + } else if (!jarvis_model_has_encoder(model) && jarvis_model_has_decoder(model)) { // decoder-only model - if (llama_decode(ctx, batch) < 0) { + if (jarvis_decode(ctx, batch) < 0) { LOG_ERR("%s : failed to decode\n", __func__); } } @@ -61,14 +61,14 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu const float * embd = nullptr; int embd_pos = 0; - if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + if (pooling_type == JARVIS_POOLING_TYPE_NONE) { // try to get token embeddings - embd = llama_get_embeddings_ith(ctx, i); + embd = jarvis_get_embeddings_ith(ctx, i); embd_pos = i; GGML_ASSERT(embd != NULL && "failed to get token embeddings"); } else { // try to get 
sequence embeddings - supported only when pooling_type is not NONE - embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + embd = jarvis_get_embeddings_seq(ctx, batch.seq_id[i][0]); embd_pos = batch.seq_id[i][0]; GGML_ASSERT(embd != NULL && "failed to get sequence embeddings"); } @@ -81,7 +81,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu int main(int argc, char ** argv) { common_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_EMBEDDING)) { return 1; } @@ -91,25 +91,25 @@ int main(int argc, char ** argv) { // For non-causal models, batch size must be equal to ubatch size params.n_ubatch = params.n_batch; - llama_backend_init(); - llama_numa_init(params.numa); + jarvis_backend_init(); + jarvis_numa_init(params.numa); // load the model - common_init_result llama_init = common_init_from_params(params); + common_init_result jarvis_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + jarvis_model * model = jarvis_init.model; + jarvis_context * ctx = jarvis_init.context; if (model == NULL) { LOG_ERR("%s: unable to load model\n", __func__); return 1; } - const int n_ctx_train = llama_n_ctx_train(model); - const int n_ctx = llama_n_ctx(ctx); + const int n_ctx_train = jarvis_n_ctx_train(model); + const int n_ctx = jarvis_n_ctx(ctx); - const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); + const enum jarvis_pooling_type pooling_type = jarvis_pooling_type(ctx); - if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) { + if (jarvis_model_has_encoder(model) && jarvis_model_has_decoder(model)) { LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__); return 1; } @@ -147,7 +147,7 @@ int main(int argc, char ** argv) { // check if the last token is SEP // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true' for (auto & inp : inputs) { - if (inp.empty() || inp.back() != llama_token_sep(model)) { + if (inp.empty() || inp.back() != jarvis_token_sep(model)) { LOG_WRN("%s: last token in the prompt is not SEP\n", __func__); LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); } @@ -167,11 +167,11 @@ int main(int argc, char ** argv) { // initialize batch const int n_prompts = prompts.size(); - struct llama_batch batch = llama_batch_init(n_batch, 0, 1); + struct jarvis_batch batch = jarvis_batch_init(n_batch, 0, 1); // count number of embeddings int n_embd_count = 0; - if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + if (pooling_type == JARVIS_POOLING_TYPE_NONE) { for (int k = 0; k < n_prompts; k++) { n_embd_count += inputs[k].size(); } @@ -180,7 +180,7 @@ int main(int argc, char ** argv) { } // allocate output - const int n_embd = llama_n_embd(model); + const int n_embd = jarvis_n_embd(model); std::vector embeddings(n_embd_count * n_embd, 0); float * emb = embeddings.data(); @@ -197,7 +197,7 @@ int main(int argc, char ** argv) { if (batch.n_tokens + n_toks > n_batch) { float * out = emb + e * n_embd; batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); - e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s; + e += pooling_type == JARVIS_POOLING_TYPE_NONE ? 
batch.n_tokens : s; s = 0; common_batch_clear(batch); } @@ -214,7 +214,7 @@ int main(int argc, char ** argv) { if (params.embd_out.empty()) { LOG("\n"); - if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + if (pooling_type == JARVIS_POOLING_TYPE_NONE) { for (int j = 0; j < n_embd_count; j++) { LOG("embedding %d: ", j); for (int i = 0; i < std::min(3, n_embd); i++) { @@ -234,7 +234,7 @@ int main(int argc, char ** argv) { } LOG("\n"); } - } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) { + } else if (pooling_type == JARVIS_POOLING_TYPE_RANK) { for (int j = 0; j < n_embd_count; j++) { // NOTE: if you change this log - update the tests in ci/run.sh LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]); @@ -312,13 +312,13 @@ int main(int argc, char ** argv) { } LOG("\n"); - llama_perf_context_print(ctx); + jarvis_perf_context_print(ctx); // clean up - llama_batch_free(batch); - llama_free(ctx); - llama_free_model(model); - llama_backend_free(); + jarvis_batch_free(batch); + jarvis_free(ctx); + jarvis_free_model(model); + jarvis_backend_free(); return 0; } diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt index a48753d38e16e..46b47b90b94ba 100644 --- a/examples/eval-callback/CMakeLists.txt +++ b/examples/eval-callback/CMakeLists.txt @@ -1,9 +1,9 @@ -set(TARGET llama-eval-callback) +set(TARGET jarvis-eval-callback) add_executable(${TARGET} eval-callback.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) set(TEST_TARGET test-eval-callback) -add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) +add_test(NAME ${TEST_TARGET} COMMAND jarvis-eval-callback --hf-repo ggml-org/models --hf-file tinyjarviss/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) diff --git a/examples/eval-callback/README.md b/examples/eval-callback/README.md index 63a57ad6b68e5..df7946f3abc3a 100644 --- a/examples/eval-callback/README.md +++ b/examples/eval-callback/README.md @@ -1,4 +1,4 @@ -# llama.cpp/examples/eval-callback +# jarvis.cpp/examples/eval-callback A simple example which demonstrates how to use callback during the inference. It simply prints to the console all operations and tensor data. @@ -6,7 +6,7 @@ It simply prints to the console all operations and tensor data. Usage: ```shell -llama-eval-callback \ +jarvis-eval-callback \ --hf-repo ggml-org/models \ --hf-file phi-2/ggml-model-q4_0.gguf \ --model phi-2-q4_0.gguf \ @@ -20,12 +20,12 @@ Will print: ```shell llm_load_tensors: offloaded 33/33 layers to GPU ... -llama_new_context_with_model: n_ctx = 512 +jarvis_new_context_with_model: n_ctx = 512 ... 
-llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB -llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB -llama_new_context_with_model: graph nodes = 1225 -llama_new_context_with_model: graph splits = 2 +jarvis_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB +jarvis_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB +jarvis_new_context_with_model: graph nodes = 1225 +jarvis_new_context_with_model: graph splits = 2 ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1} [ [ diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index c08e3e5f675ed..a4cb2d6131438 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -1,7 +1,7 @@ #include "arg.h" #include "common.h" #include "log.h" -#include "llama.h" +#include "jarvis.h" #include "ggml.h" #include @@ -126,12 +126,12 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { return true; } -static bool run(llama_context * ctx, const common_params & params) { - const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); +static bool run(jarvis_context * ctx, const common_params & params) { + const bool add_bos = jarvis_add_bos_token(jarvis_get_model(ctx)); - std::vector tokens = common_tokenize(ctx, params.prompt, add_bos); + std::vector tokens = common_tokenize(ctx, params.prompt, add_bos); - if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { + if (jarvis_decode(ctx, jarvis_batch_get_one(tokens.data(), tokens.size()))) { LOG_ERR("%s : failed to eval\n", __func__); return false; } @@ -144,14 +144,14 @@ int main(int argc, char ** argv) { common_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_COMMON)) { return 1; } common_init(); - llama_backend_init(); - llama_numa_init(params.numa); + jarvis_backend_init(); + jarvis_numa_init(params.numa); // pass the callback to the backend scheduler // it will be executed for each node during the graph computation @@ -160,10 +160,10 @@ int main(int argc, char ** argv) { params.warmup = false; // init - common_init_result llama_init = common_init_from_params(params); + common_init_result jarvis_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + jarvis_model * model = jarvis_init.model; + jarvis_context * ctx = jarvis_init.context; if (model == nullptr || ctx == nullptr) { LOG_ERR("%s : failed to init\n", __func__); return 1; @@ -182,12 +182,12 @@ int main(int argc, char ** argv) { } LOG("\n"); - llama_perf_context_print(ctx); + jarvis_perf_context_print(ctx); - llama_free(ctx); - llama_free_model(model); + jarvis_free(ctx); + jarvis_free_model(model); - llama_backend_free(); + jarvis_backend_free(); return 0; } diff --git a/examples/export-lora/CMakeLists.txt b/examples/export-lora/CMakeLists.txt index 1cef6e71694e2..babb850e94ede 100644 --- a/examples/export-lora/CMakeLists.txt +++ b/examples/export-lora/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-export-lora) +set(TARGET jarvis-export-lora) add_executable(${TARGET} export-lora.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} 
PRIVATE cxx_std_11) diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md index 7dce99c9a9e61..7df4426e973d2 100644 --- a/examples/export-lora/README.md +++ b/examples/export-lora/README.md @@ -3,7 +3,7 @@ Apply LORA adapters to base model and export the resulting model. ``` -usage: llama-export-lora [options] +usage: jarvis-export-lora [options] options: -m, --model model path from which to load base model (default '') @@ -16,16 +16,16 @@ options: For example: ```bash -./bin/llama-export-lora \ - -m open-llama-3b-v2.gguf \ - -o open-llama-3b-v2-english2tokipona-chat.gguf \ - --lora lora-open-llama-3b-v2-english2tokipona-chat-LATEST.gguf +./bin/jarvis-export-lora \ + -m open-jarvis-3b-v2.gguf \ + -o open-jarvis-3b-v2-english2tokipona-chat.gguf \ + --lora lora-open-jarvis-3b-v2-english2tokipona-chat-LATEST.gguf ``` Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters: ```bash -./bin/llama-export-lora \ +./bin/jarvis-export-lora \ -m your_base_model.gguf \ -o your_merged_model.gguf \ --lora-scaled lora_task_A.gguf 0.5 \ diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 67662313d075c..d024a7e85d574 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -186,10 +186,10 @@ struct lora_merge_ctx { // prepare metadata gguf_set_kv(ctx_out, base_model.ctx_gguf); // output is forced to f16 for now - gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16); + gguf_set_val_u32(ctx_out, "general.file_type", JARVIS_FTYPE_MOSTLY_F16); // check if all lora adapters have the same tensors - // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777 + // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/jarvis.cpp/pull/8607#discussion_r1686027777 static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. 
Please merge the adapter one-by-one instead of merging all at once."; if (adapters.size() > 1) { for (size_t i = 1; i < adapters.size(); ++i) { @@ -402,7 +402,7 @@ static void print_usage(int, char ** argv) { int main(int argc, char ** argv) { common_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_EXPORT_LORA, print_usage)) { return 1; } diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt index 4edd6ec7394c5..870d93220a544 100644 --- a/examples/gbnf-validator/CMakeLists.txt +++ b/examples/gbnf-validator/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-gbnf-validator) +set(TARGET jarvis-gbnf-validator) add_executable(${TARGET} gbnf-validator.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp index 7493af9d3aec3..bc4e028e3342d 100644 --- a/examples/gbnf-validator/gbnf-validator.cpp +++ b/examples/gbnf-validator/gbnf-validator.cpp @@ -1,5 +1,5 @@ #include "unicode.h" -#include "llama-grammar.h" +#include "jarvis-grammar.h" #include #include @@ -8,17 +8,17 @@ #include #include -static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) { +static bool jarvis_grammar_validate(struct jarvis_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) { const auto cpts = unicode_cpts_from_utf8(input_str); - const llama_grammar_rules & rules = llama_grammar_get_rules (grammar); - llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar); + const jarvis_grammar_rules & rules = jarvis_grammar_get_rules (grammar); + jarvis_grammar_stacks & stacks_cur = jarvis_grammar_get_stacks(grammar); size_t pos = 0; for (const auto & cpt : cpts) { - const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy + const jarvis_grammar_stacks stacks_prev = jarvis_grammar_get_stacks(grammar); // copy - llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur); + jarvis_grammar_accept(rules, stacks_prev, cpt, stacks_cur); if (stacks_cur.empty()) { error_pos = pos; @@ -80,9 +80,9 @@ int main(int argc, char** argv) { grammar_str = buffer.str(); } - llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root"); + jarvis_grammar * grammar = jarvis_grammar_init_impl(nullptr, grammar_str.c_str(), "root"); if (grammar == nullptr) { - throw std::runtime_error("Failed to initialize llama_grammar"); + throw std::runtime_error("Failed to initialize jarvis_grammar"); } // Read the input file std::string input_str; @@ -97,7 +97,7 @@ int main(int argc, char** argv) { // Validate the input string against the grammar size_t error_pos; std::string error_msg; - bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg); + bool is_valid = jarvis_grammar_validate(grammar, input_str, error_pos, error_msg); if (is_valid) { fprintf(stdout, "Input string is valid according to the grammar.\n"); @@ -106,7 +106,7 @@ int main(int argc, char** argv) { } // Clean up - llama_grammar_free_impl(grammar); + jarvis_grammar_free_impl(grammar); return 0; } diff --git a/examples/gen-docs/CMakeLists.txt 
b/examples/gen-docs/CMakeLists.txt index c94cda7764341..45c2a215c43c1 100644 --- a/examples/gen-docs/CMakeLists.txt +++ b/examples/gen-docs/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-gen-docs) +set(TARGET jarvis-gen-docs) add_executable(${TARGET} gen-docs.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp index 77c59a836e50a..b02918844f690 100644 --- a/examples/gen-docs/gen-docs.cpp +++ b/examples/gen-docs/gen-docs.cpp @@ -47,7 +47,7 @@ static void write_table(std::ofstream & file, std::vector & opts) } } -static void export_md(std::string fname, llama_example ex) { +static void export_md(std::string fname, jarvis_example ex) { std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc); common_params params; @@ -57,7 +57,7 @@ static void export_md(std::string fname, llama_example ex) { std::vector sparam_options; std::vector specific_options; for (auto & opt : ctx_arg.options) { - // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example + // in case multiple JARVIS_EXAMPLE_* are set, we prioritize the JARVIS_EXAMPLE_* matching current example if (opt.is_sparam) { sparam_options.push_back(&opt); } else if (opt.in_example(ctx_arg.ex)) { @@ -76,8 +76,8 @@ static void export_md(std::string fname, llama_example ex) { } int main(int, char **) { - export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN); - export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER); + export_md("autogen-main.md", JARVIS_EXAMPLE_MAIN); + export_md("autogen-server.md", JARVIS_EXAMPLE_SERVER); return 0; } diff --git a/examples/gguf-hash/CMakeLists.txt b/examples/gguf-hash/CMakeLists.txt index 633f4553594bb..c51249495fccf 100644 --- a/examples/gguf-hash/CMakeLists.txt +++ b/examples/gguf-hash/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET llama-gguf-hash) +set(TARGET jarvis-gguf-hash) add_executable(${TARGET} gguf-hash.cpp) install(TARGETS ${TARGET} RUNTIME) diff --git a/examples/gguf-hash/README.md b/examples/gguf-hash/README.md index 9871651e38ba8..a9ceb24af3183 100644 --- a/examples/gguf-hash/README.md +++ b/examples/gguf-hash/README.md @@ -1,5 +1,5 @@ -# llama-gguf-hash +# jarvis-gguf-hash CLI to hash GGUF files to detect difference on a per model and per tensor level. 
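As a rough illustration of the per-tensor idea behind this tool (one hash line per tensor, which is what the manifests further below contain), here is a minimal, self-contained C++ sketch. It uses FNV-1a purely as a stand-in for the xxh64/sha1/sha256 hashes the real tool supports, and the `tensor_view` struct and hard-coded tensor data are hypothetical placeholders rather than part of the gguf/jarvis API.

```cpp
// Minimal sketch of per-tensor hashing (illustrative only).
// FNV-1a stands in for the real xxh64/sha1/sha256 hashes; `tensor_view`
// and the hard-coded data are hypothetical, not the actual GGUF API.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

struct tensor_view {
    std::string name;            // e.g. "test.gguf:tensor_0"
    std::vector<uint8_t> data;   // raw tensor bytes as stored in the file
};

static uint64_t fnv1a64(const uint8_t * p, size_t n) {
    uint64_t h = 14695981039346656037ull;   // FNV offset basis
    for (size_t i = 0; i < n; ++i) {
        h ^= p[i];
        h *= 1099511628211ull;               // FNV prime
    }
    return h;
}

int main() {
    // hypothetical tensors; the real tool reads them from the GGUF file
    std::vector<tensor_view> tensors = {
        {"test.gguf:tensor_0", {0x01, 0x02, 0x03}},
        {"test.gguf:tensor_1", {0x04, 0x05, 0x06}},
    };

    // one hash line per tensor -> a manifest that can later be re-checked
    for (const auto & t : tensors) {
        printf("fnv64 %016llx %s\n",
               (unsigned long long) fnv1a64(t.data.data(), t.data.size()),
               t.name.c_str());
    }
    return 0;
}
```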
@@ -38,8 +38,8 @@ For Maintainers: For Model Creators: - Optional consistent UUID generation based on model tensor content - This is served by UUIDv5 which is useful for databases keys - - llama.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5` - - Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Llama.cpp` + - jarvis.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5` + - Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Jarvis.cpp` For Model Users: - Assurance of tensor layer integrity even if metadata was updated @@ -57,14 +57,14 @@ For Model Users: ## Compile Example ```bash -cmake -B build -DCMAKE_BUILD_TYPE=Debug -DLLAMA_FATAL_WARNINGS=ON +cmake -B build -DCMAKE_BUILD_TYPE=Debug -DJARVIS_FATAL_WARNINGS=ON make -C build clean -make -C build llama-gguf-hash VERBOSE=1 -./build/bin/llama-gguf-hash test.gguf -./build/bin/llama-gguf-hash --xxh64 test.gguf -./build/bin/llama-gguf-hash --sha1 test.gguf -./build/bin/llama-gguf-hash --uuid test.gguf -./build/bin/llama-gguf-hash --sha256 test.gguf +make -C build jarvis-gguf-hash VERBOSE=1 +./build/bin/jarvis-gguf-hash test.gguf +./build/bin/jarvis-gguf-hash --xxh64 test.gguf +./build/bin/jarvis-gguf-hash --sha1 test.gguf +./build/bin/jarvis-gguf-hash --uuid test.gguf +./build/bin/jarvis-gguf-hash --sha256 test.gguf ``` ## Generation and Verification Example @@ -72,7 +72,7 @@ make -C build llama-gguf-hash VERBOSE=1 To generate we may use this command ```bash -./llama-gguf-hash --all test.gguf > test.gguf.manifest +./jarvis-gguf-hash --all test.gguf > test.gguf.manifest ``` Which would generate a manifest that looks like below, which contains multiple hash type and per tensor layer hashes as well @@ -117,7 +117,7 @@ sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test We can then use the normal check command which will by default check for the highest security strength hash and verify against that: ```bash -$ ./llama-gguf-hash --check test.gguf.manifest test.gguf +$ ./jarvis-gguf-hash --check test.gguf.manifest test.gguf manifest test.gguf.manifest sha256 sha1 xxh64 sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok @@ -137,7 +137,7 @@ Verification results for test.gguf.manifest - Success Or we may explicitly ask for a faster hash like: ```bash -$ ./llama-gguf-hash --check test.gguf.manifest --xxh64 test.gguf +$ ./jarvis-gguf-hash --check test.gguf.manifest --xxh64 test.gguf manifest test.gguf.manifest sha256 sha1 xxh64 xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok @@ -157,7 +157,7 @@ Verification results for test.gguf.manifest - Success Or maybe we want to just check that all the hash is valid: ```bash -$./llama-gguf-hash --check test.gguf.manifest --all test.gguf.manifest +$./jarvis-gguf-hash --check test.gguf.manifest --all test.gguf.manifest manifest test.gguf.manifest sha256 sha1 xxh64 xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0 - Ok diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp index e96c75117f533..e7e3cd576c3da 100644 --- a/examples/gguf-hash/gguf-hash.cpp +++ b/examples/gguf-hash/gguf-hash.cpp @@ -24,9 +24,9 @@ extern "C" { #endif -// uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp') -#define UUID_NAMESPACE_LLAMA_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5" -#define 
UUID_NAMESPACE_LLAMA_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5 +// uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Jarvis.cpp') +#define UUID_NAMESPACE_JARVIS_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5" +#define UUID_NAMESPACE_JARVIS_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5 #define HASH_TYPE_SHA256_STR "sha256" @@ -320,7 +320,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) { // sha1 for uuid init SHA1_CTX sha1_for_uuid_ctx; if (hash_params.uuid) { - unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX}; + unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_JARVIS_CPP_HEX}; SHA1Init(&sha1_for_uuid_ctx); SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace)); } diff --git a/examples/gguf-split/CMakeLists.txt b/examples/gguf-split/CMakeLists.txt index f63887da7dfca..e1ed69f8df477 100644 --- a/examples/gguf-split/CMakeLists.txt +++ b/examples/gguf-split/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-gguf-split) +set(TARGET jarvis-gguf-split) add_executable(${TARGET} gguf-split.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 7e62657e118a4..e44fc83f1bed9 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -1,4 +1,4 @@ -#include "llama.h" +#include "jarvis.h" #include "common.h" #include @@ -99,8 +99,8 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p split_print_usage(argv[0]); exit(0); } else if (arg == "--version") { - fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + fprintf(stderr, "version: %d (%s)\n", JARVIS_BUILD_NUMBER, JARVIS_COMMIT); + fprintf(stderr, "built with %s for %s\n", JARVIS_COMPILER, JARVIS_BUILD_TARGET); exit(0); } else if (arg == "--dry-run") { arg_found = true; @@ -308,7 +308,7 @@ struct split_strategy { for (auto & ctx_out : ctx_outs) { // construct file path char split_path[PATH_MAX] = {0}; - llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split); + jarvis_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split); // open the output file printf("Writing file %s ... 
", split_path); @@ -430,7 +430,7 @@ static void gguf_merge(const split_params & split_params) { }; if (i_split > 0) { - llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); + jarvis_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); } fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path); @@ -470,7 +470,7 @@ static void gguf_merge(const split_params & split_params) { } // Verify the file naming and extract split_prefix - if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) { + if (!jarvis_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) { fprintf(stderr, "\n%s: unexpected input file name: %s" " i_split=%d" " n_split=%d\n", __func__, @@ -508,7 +508,7 @@ static void gguf_merge(const split_params & split_params) { // Write tensors data for (int i_split = 0; i_split < n_split; i_split++) { - llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); + jarvis_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); std::ifstream f_input(split_path, std::ios::binary); if (!f_input.is_open()) { fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path); diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh index d5a92d6051063..246e9a3573ec6 100755 --- a/examples/gguf-split/tests.sh +++ b/examples/gguf-split/tests.sh @@ -18,8 +18,8 @@ fi set -x -SPLIT=$1/llama-gguf-split -MAIN=$1/llama-cli +SPLIT=$1/jarvis-gguf-split +MAIN=$1/jarvis-cli WORK_PATH=$TMP_DIR/gguf-split ROOT_DIR=$(realpath $(dirname $0)/../../) diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt index a9569b411956b..3cb82c8919c3b 100644 --- a/examples/gguf/CMakeLists.txt +++ b/examples/gguf/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET llama-gguf) +set(TARGET jarvis-gguf) add_executable(${TARGET} gguf.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/gritlm/CMakeLists.txt b/examples/gritlm/CMakeLists.txt index 86dfddca346fe..0039c26030fcf 100644 --- a/examples/gritlm/CMakeLists.txt +++ b/examples/gritlm/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-gritlm) +set(TARGET jarvis-gritlm) add_executable(${TARGET} gritlm.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gritlm/README.md b/examples/gritlm/README.md index 786ba57363def..88fde2e28aafc 100644 --- a/examples/gritlm/README.md +++ b/examples/gritlm/README.md @@ -26,7 +26,7 @@ $ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --ou Run the example using the downloaded model: ```console -$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf +$ ./jarvis-gritlm -m models/gritlm-7b_q4_1.gguf Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103 diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 6e42fa0734ecb..58df109196ff9 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -1,39 +1,39 @@ #include "arg.h" #include "common.h" -#include "llama.h" 
+#include "jarvis.h" #include #include // #define GRIT_DEBUG -static std::vector> encode(llama_context * ctx, const std::vector & sentences, const std::string & instruction) { +static std::vector> encode(jarvis_context * ctx, const std::vector & sentences, const std::string & instruction) { std::vector> result; - const llama_model * model = llama_get_model(ctx); + const jarvis_model * model = jarvis_get_model(ctx); - llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); + jarvis_batch batch = jarvis_batch_init(jarvis_n_batch(ctx), 0, 1); for (uint64_t i = 0; i < sentences.size(); i++) { common_batch_clear(batch); const std::string input_string = instruction + sentences[i]; - std::vector inputs = common_tokenize(model, input_string, true, false); + std::vector inputs = common_tokenize(model, input_string, true, false); const int32_t n_toks = inputs.size(); // GritLM seems to have EOS = "" // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18 - // inputs.push_back(llama_token_eos(model)); + // inputs.push_back(jarvis_token_eos(model)); // we want to ignore instruction tokens for mean pooling const int32_t n_inst = common_tokenize(model, instruction, true, false).size(); #ifdef GRIT_DEBUG // debug tokens - should be matching as referenced in the GritLM sample - std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) { - std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str()); + std::for_each(inputs.begin(), inputs.end(), [&ctx](jarvis_token t) { + std::printf("[%u:%s]", t, jarvis_token_to_piece(ctx, t).c_str()); }); std::printf("\n"); #endif @@ -44,22 +44,22 @@ static std::vector> encode(llama_context * ctx, const std::ve } // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); - llama_set_embeddings(ctx, true); - llama_set_causal_attn(ctx, false); + jarvis_kv_cache_clear(ctx); + jarvis_set_embeddings(ctx, true); + jarvis_set_causal_attn(ctx, false); // run model - llama_decode(ctx, batch); + jarvis_decode(ctx, batch); // get embedding dimensions - uint64_t n_embd = llama_n_embd(model); + uint64_t n_embd = jarvis_n_embd(model); // allocate embedding output std::vector emb_unorm(n_embd, 0.0f); // sum up all token embeddings for (int32_t k = n_inst; k < n_toks; k++) { - float * emb = llama_get_embeddings_ith(ctx, k); + float * emb = jarvis_get_embeddings_ith(ctx, k); for (uint64_t j = 0; j < n_embd; j++) { emb_unorm[j] += emb[j]; } @@ -88,24 +88,24 @@ static std::vector> encode(llama_context * ctx, const std::ve #endif } - llama_batch_free(batch); + jarvis_batch_free(batch); return result; } -static std::string generate(llama_context * ctx, llama_sampler * smpl, const std::string & prompt, bool stream) { +static std::string generate(jarvis_context * ctx, jarvis_sampler * smpl, const std::string & prompt, bool stream) { std::string result; - const llama_model * model = llama_get_model(ctx); - llama_token eos_token = llama_token_eos(model); + const jarvis_model * model = jarvis_get_model(ctx); + jarvis_token eos_token = jarvis_token_eos(model); - llama_kv_cache_clear(ctx); - llama_set_embeddings(ctx, false); - llama_set_causal_attn(ctx, true); + jarvis_kv_cache_clear(ctx); + jarvis_set_embeddings(ctx, false); + jarvis_set_causal_attn(ctx, true); - llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); + jarvis_batch bat = jarvis_batch_init(jarvis_n_batch(ctx), 0, 1); - std::vector inputs = common_tokenize(model, prompt, false, true); + std::vector inputs = 
common_tokenize(model, prompt, false, true); int32_t i_current_token = 0; while (true) { @@ -119,9 +119,9 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std } inputs.clear(); - llama_decode(ctx, bat); + jarvis_decode(ctx, bat); - llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1); + jarvis_token token = jarvis_sampler_sample(smpl, ctx, bat.n_tokens - 1); if (token == eos_token) { break; @@ -142,7 +142,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std std::printf("\n"); } - llama_batch_free(bat); + jarvis_batch_free(bat); return result; } @@ -154,29 +154,29 @@ static std::string gritlm_instruction(const std::string & instruction) { int main(int argc, char * argv[]) { common_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_COMMON)) { return 1; } common_init(); - llama_model_params mparams = common_model_params_to_llama(params); - llama_context_params cparams = common_context_params_to_llama(params); + jarvis_model_params mparams = common_model_params_to_jarvis(params); + jarvis_context_params cparams = common_context_params_to_jarvis(params); - llama_backend_init(); + jarvis_backend_init(); - llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); + jarvis_model * model = jarvis_load_model_from_file(params.model.c_str(), mparams); // create generation context - llama_context * ctx = llama_new_context_with_model(model, cparams); + jarvis_context * ctx = jarvis_new_context_with_model(model, cparams); - auto sparams = llama_sampler_chain_default_params(); + auto sparams = jarvis_sampler_chain_default_params(); sparams.no_perf = false; - llama_sampler * smpl = llama_sampler_chain_init(sparams); + jarvis_sampler * smpl = jarvis_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl, llama_sampler_init_greedy()); + jarvis_sampler_chain_add(smpl, jarvis_sampler_init_greedy()); // ### Embedding/Representation ### // samples taken from: https://github.com/ContextualAI/gritlm#basic @@ -197,7 +197,7 @@ int main(int argc, char * argv[]) { const std::vector> d_rep = encode(ctx, documents, gritlm_instruction("")); const std::vector> q_rep = encode(ctx, queries, gritlm_instruction(instruction)); - const int n_embd = llama_n_embd(model); + const int n_embd = jarvis_n_embd(model); const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); @@ -217,10 +217,10 @@ int main(int argc, char * argv[]) { std::string response = generate(ctx, smpl, prompt, true); } - llama_sampler_free(smpl); - llama_free(ctx); - llama_free_model(model); - llama_backend_free(); + jarvis_sampler_free(smpl); + jarvis_free(ctx); + jarvis_free_model(model); + jarvis_backend_free(); return 0; } diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt index d4c8265bdb9d2..c03c64826c129 100644 --- a/examples/imatrix/CMakeLists.txt +++ b/examples/imatrix/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-imatrix) +set(TARGET jarvis-imatrix) add_executable(${TARGET} imatrix.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/imatrix/README.md 
b/examples/imatrix/README.md index bb5faec94c20a..2781dce75e951 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -1,12 +1,12 @@ -# llama.cpp/examples/imatrix +# jarvis.cpp/examples/imatrix Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantized models. -More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861 +More information is available here: https://github.com/ggerganov/jarvis.cpp/pull/4861 ## Usage ``` -./llama-imatrix \ +./jarvis-imatrix \ -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \ [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \ [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...] @@ -28,8 +28,8 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument GGML_CUDA=1 make -j # generate importance matrix (imatrix.dat) -./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 +./jarvis-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 # use the imatrix to perform a Q4_K_M quantization -./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m +./jarvis-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m ``` diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 70ff47768c02b..437651a750227 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -1,7 +1,7 @@ #include "arg.h" #include "common.h" #include "log.h" -#include "llama.h" +#include "jarvis.h" #include #include @@ -100,7 +100,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * const float * data = is_host ? (const float *) src1->data : m_src1_data.data(); // this has been adapted to the new format of storing merged experts in a single 3d tensor - // ref: https://github.com/ggerganov/llama.cpp/pull/6387 + // ref: https://github.com/ggerganov/jarvis.cpp/pull/6387 if (t->op == GGML_OP_MUL_MAT_ID) { // ids -> [n_experts_used, n_tokens] // src1 -> [cols, n_expert_used, n_tokens] @@ -428,15 +428,15 @@ static void process_logits( } } -static bool compute_imatrix(llama_context * ctx, const common_params & params) { - const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); - GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); - const int n_ctx = llama_n_ctx(ctx); +static bool compute_imatrix(jarvis_context * ctx, const common_params & params) { + const bool add_bos = jarvis_add_bos_token(jarvis_get_model(ctx)); + GGML_ASSERT(!jarvis_add_eos_token(jarvis_get_model(ctx))); + const int n_ctx = jarvis_n_ctx(ctx); auto tim1 = std::chrono::high_resolution_clock::now(); LOG_INF("%s: tokenizing the input ..\n", __func__); - std::vector tokens = common_tokenize(ctx, params.prompt, true); + std::vector tokens = common_tokenize(ctx, params.prompt, true); auto tim2 = std::chrono::high_resolution_clock::now(); LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); @@ -467,7 +467,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const int n_chunk_max = tokens.size() / n_ctx; const int n_chunk = params.n_chunks < 0 ? 
n_chunk_max : std::min(params.n_chunks, n_chunk_max); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int n_vocab = jarvis_n_vocab(jarvis_get_model(ctx)); const int n_batch = params.n_batch; int count = 0; @@ -494,9 +494,9 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(ctx); + jarvis_kv_cache_clear(ctx); - llama_batch batch = llama_batch_init(n_batch, 0, 1); + jarvis_batch batch = jarvis_batch_init(n_batch, 0, 1); for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; @@ -507,7 +507,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { // add BOS token for the first batch of each chunk if (add_bos && j == 0) { - tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); + tokens[batch_start] = jarvis_token_bos(jarvis_get_model(ctx)); } common_batch_clear(batch); @@ -515,9 +515,9 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true); } - if (llama_decode(ctx, batch)) { + if (jarvis_decode(ctx, batch)) { LOG_ERR("%s : failed to eval\n", __func__); - llama_batch_free(batch); + jarvis_batch_free(batch); return false; } @@ -525,12 +525,12 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { tokens[batch_start] = token_org; if (params.compute_ppl && num_batches > 1) { - const auto * batch_logits = llama_get_logits(ctx); + const auto * batch_logits = jarvis_get_logits(ctx); logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); } } - llama_batch_free(batch); + jarvis_batch_free(batch); const auto t_end = std::chrono::high_resolution_clock::now(); @@ -547,7 +547,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { if (params.compute_ppl) { const int first = n_ctx/2; - const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); + const auto * all_logits = num_batches > 1 ? 
logits.data() : jarvis_get_logits(ctx); process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); count += n_ctx - first - 1; @@ -583,7 +583,7 @@ int main(int argc, char ** argv) { params.logits_all = true; params.escape = false; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_IMATRIX, print_usage)) { return 1; } @@ -606,8 +606,8 @@ int main(int argc, char ** argv) { g_collector.save_imatrix(); } - llama_backend_init(); - llama_numa_init(params.numa); + jarvis_backend_init(); + jarvis_numa_init(params.numa); // pass the callback to the backend scheduler // it will be executed for each node during the graph computation @@ -616,16 +616,16 @@ int main(int argc, char ** argv) { params.warmup = false; // init - common_init_result llama_init = common_init_from_params(params); + common_init_result jarvis_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + jarvis_model * model = jarvis_init.model; + jarvis_context * ctx = jarvis_init.context; if (model == nullptr || ctx == nullptr) { LOG_ERR("%s : failed to init\n", __func__); return 1; } - const int n_ctx_train = llama_n_ctx_train(model); + const int n_ctx_train = jarvis_n_ctx_train(model); if (params.n_ctx > n_ctx_train) { LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, params.n_ctx); @@ -644,12 +644,12 @@ int main(int argc, char ** argv) { g_collector.save_imatrix(); LOG("\n"); - llama_perf_context_print(ctx); + jarvis_perf_context_print(ctx); - llama_free(ctx); - llama_free_model(model); + jarvis_free(ctx); + jarvis_free_model(model); - llama_backend_free(); + jarvis_backend_free(); return 0; } diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt index 9b1aa3b63c920..f9ad699135e60 100644 --- a/examples/infill/CMakeLists.txt +++ b/examples/infill/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-infill) +set(TARGET jarvis-infill) add_executable(${TARGET} infill.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/infill/README.md b/examples/infill/README.md index 810a0c5e76697..ba2ab2bae41a9 100644 --- a/examples/infill/README.md +++ b/examples/infill/README.md @@ -1,25 +1,25 @@ -# llama.cpp/example/infill +# jarvis.cpp/example/infill -This example shows how to use the infill mode with Code Llama models supporting infill mode. +This example shows how to use the infill mode with Code Jarvis models supporting infill mode. Currently the 7B and 13B models support infill mode. Infill supports most of the options available in the main example. 
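The prompt that the infill example feeds to the model is assembled from special fill-in-the-middle markers plus the tokenized prefix and suffix, with the two blocks swapped when `--spm-infill` is given and a BOS token prepended when the model expects one. The sketch below mirrors that assembly with made-up token IDs; the real program obtains the FIM and BOS token IDs from the model vocabulary and tokenizes `--in-prefix` / `--in-suffix` with the model's tokenizer.

```cpp
// Rough sketch of how a fill-in-the-middle (FIM) prompt is assembled.
// Token IDs here are made-up placeholders; the real example queries the
// FIM prefix/suffix/middle and BOS tokens from the model vocabulary.
#include <cstdio>
#include <vector>

using token = int;

// hypothetical special-token IDs, for illustration only
const token TOK_BOS     = 1;
const token TOK_FIM_PRE = 32000;
const token TOK_FIM_SUF = 32001;
const token TOK_FIM_MID = 32002;

static std::vector<token> build_fim_prompt(const std::vector<token> & prefix,
                                           const std::vector<token> & suffix,
                                           bool spm_order,   // --spm-infill
                                           bool add_bos) {
    std::vector<token> pfx = {TOK_FIM_PRE};
    pfx.insert(pfx.end(), prefix.begin(), prefix.end());

    std::vector<token> sfx = {TOK_FIM_SUF};
    sfx.insert(sfx.end(), suffix.begin(), suffix.end());

    // PSM order: prefix block then suffix block; SPM order swaps the two
    std::vector<token> out   = spm_order ? sfx : pfx;
    std::vector<token> other = spm_order ? pfx : sfx;

    if (add_bos) {
        out.insert(out.begin(), TOK_BOS);
    }
    out.insert(out.end(), other.begin(), other.end());
    // generation continues from the middle marker
    // (in the real code it is appended only when the model defines it)
    out.push_back(TOK_FIM_MID);
    return out;
}

int main() {
    const std::vector<token> prefix = {100, 101, 102};  // tokens of --in-prefix
    const std::vector<token> suffix = {200, 201};       // tokens of --in-suffix

    for (token t : build_fim_prompt(prefix, suffix, /*spm_order=*/false, /*add_bos=*/true)) {
        printf("%d ", t);
    }
    printf("\n");
    return 0;
}
```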
-For further information have a look at the main README.md in llama.cpp/example/main/README.md +For further information have a look at the main README.md in jarvis.cpp/example/main/README.md ## Common Options -In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models: +In this section, we cover the most commonly used options for running the `infill` program with the JARVIS models: -- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). +- `-m FNAME, --model FNAME`: Specify the path to the JARVIS model file (e.g., `models/7B/ggml-model.bin`). - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. -- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. +- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but JARVIS models were built with a context of 2048, which will provide better results for longer input/inference. - `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. ## Input Prompts -The `infill` program provides several ways to interact with the LLaMA models using input prompts: +The `infill` program provides several ways to interact with the JARVIS models using input prompts: - `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option. - `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option. @@ -27,7 +27,7 @@ The `infill` program provides several ways to interact with the LLaMA models usi ## Interaction -The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. The interactive mode can be triggered using `--interactive`, and `--interactive-first` +The `infill` program offers a seamless way to interact with JARVIS models, allowing users to receive real-time infill suggestions. 
The interactive mode can be triggered using `--interactive`, and `--interactive-first` ### Interaction Options @@ -37,11 +37,11 @@ The `infill` program offers a seamless way to interact with LLaMA models, allowi ### Example -Download a model that supports infill, for example CodeLlama: +Download a model that supports infill, for example CodeJarvis: ```console -scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models +scripts/hf.sh --repo TheBloke/CodeJarvis-13B-GGUF --file codejarvis-13b.Q5_K_S.gguf --outdir models ``` ```bash -./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n " +./jarvis-infill -t 10 -ngl 0 -m models/codejarvis-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n " ``` diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index f18362c91c7bf..8c27eef10ef6a 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -3,7 +3,7 @@ #include "console.h" #include "sampling.h" #include "log.h" -#include "llama.h" +#include "jarvis.h" #include #include @@ -33,20 +33,20 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static llama_context ** g_ctx; -static llama_model ** g_model; +static jarvis_context ** g_ctx; +static jarvis_model ** g_model; static common_sampler ** g_smpl; static common_params * g_params; -static std::vector * g_input_tokens; +static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; -static std::vector * g_output_tokens; +static std::vector * g_output_tokens; static bool is_interacting = false; static void write_logfile( - const llama_context * ctx, const common_params & params, const llama_model * model, - const std::vector & input_tokens, const std::string & output, - const std::vector & output_tokens + const jarvis_context * ctx, const common_params & params, const jarvis_model * model, + const std::vector & input_tokens, const std::string & output, + const std::vector & output_tokens ) { if (params.logdir.empty()) { return; @@ -71,7 +71,7 @@ static void write_logfile( fprintf(logfile, "binary: infill\n"); char model_desc[128]; - llama_model_desc(model, model_desc, sizeof(model_desc)); + jarvis_model_desc(model, model_desc, sizeof(model_desc)); yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc); fprintf(logfile, "\n"); @@ -83,7 +83,7 @@ static void write_logfile( yaml_dump_string_multiline(logfile, "output", output.c_str()); yaml_dump_vector_int(logfile, "output_tokens", output_tokens); - llama_perf_dump_yaml(logfile, ctx); + jarvis_perf_dump_yaml(logfile, ctx); fclose(logfile); } @@ -112,7 +112,7 @@ int main(int argc, char ** argv) { common_params params; g_params = ¶ms; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_INFILL)) { return 1; } @@ -160,12 +160,12 @@ int main(int argc, char ** argv) { LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - LOG_INF("%s: llama backend init\n", __func__); - llama_backend_init(); - llama_numa_init(params.numa); + LOG_INF("%s: jarvis backend init\n", __func__); + jarvis_backend_init(); + jarvis_numa_init(params.numa); - llama_model * model = nullptr; - llama_context * ctx = nullptr; + jarvis_model * model = 
nullptr; + jarvis_context * ctx = nullptr; common_sampler * smpl = nullptr; g_model = &model; @@ -174,18 +174,18 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); - common_init_result llama_init = common_init_from_params(params); + common_init_result jarvis_init = common_init_from_params(params); - model = llama_init.model; - ctx = llama_init.context; + model = jarvis_init.model; + ctx = jarvis_init.context; if (model == NULL) { LOG_ERR("%s: unable to load model\n", __func__); return 1; } - const int n_ctx_train = llama_n_ctx_train(model); - const int n_ctx = llama_n_ctx(ctx); + const int n_ctx_train = jarvis_n_ctx_train(model); + const int n_ctx = jarvis_n_ctx(ctx); LOG_DBG("n_ctx: %d\n", n_ctx); if (n_ctx > n_ctx_train) { @@ -197,28 +197,28 @@ int main(int argc, char ** argv) { LOG_INF("\n"); LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } - const bool add_bos = llama_add_bos_token(model); - GGML_ASSERT(!llama_add_eos_token(model)); + const bool add_bos = jarvis_add_bos_token(model); + GGML_ASSERT(!jarvis_add_eos_token(model)); - std::vector embd_inp; - std::vector embd_end; - std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); - std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); + std::vector embd_inp; + std::vector embd_end; + std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); + std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); - GGML_ASSERT(llama_token_fim_pre(model) >= 0); - GGML_ASSERT(llama_token_fim_suf(model) >= 0); + GGML_ASSERT(jarvis_token_fim_pre(model) >= 0); + GGML_ASSERT(jarvis_token_fim_suf(model) >= 0); - inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model)); - inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model)); + inp_pfx.insert(inp_pfx.begin(), jarvis_token_fim_pre(model)); + inp_sfx.insert(inp_sfx.begin(), jarvis_token_fim_suf(model)); embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_end = params.spm_infill ? 
inp_pfx : inp_sfx; if (add_bos) { - embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + embd_inp.insert(embd_inp.begin(), jarvis_token_bos(model)); } embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - const llama_token middle_token = llama_token_fim_mid(model); + const jarvis_token middle_token = jarvis_token_fim_mid(model); if (middle_token >= 0) { embd_inp.push_back(middle_token); } @@ -230,7 +230,7 @@ int main(int argc, char ** argv) { // Should not run without any tokens if (embd_inp.empty()) { - embd_inp.push_back(llama_token_bos(model)); + embd_inp.push_back(jarvis_token_bos(model)); LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); } @@ -311,10 +311,10 @@ int main(int argc, char ** argv) { if (params.interactive) { const char *control_message; if (params.multiline_input) { - control_message = " - To return control to LLaMA, end your input with '\\'.\n" + control_message = " - To return control to JARVIS, end your input with '\\'.\n" " - To return control without starting a new line, end your input with '/'.\n"; } else { - control_message = " - Press Return to return control to LLaMA.\n" + control_message = " - Press Return to return control to JARVIS.\n" " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } @@ -340,7 +340,7 @@ int main(int argc, char ** argv) { // the first thing we will do is to output the prompt, so set color accordingly console::set_display(console::prompt); - std::vector embd; + std::vector embd; while (n_remain != 0 || params.interactive) { // predict @@ -375,8 +375,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + jarvis_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + jarvis_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); n_past -= n_discard; @@ -396,7 +396,7 @@ int main(int argc, char ** argv) { LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); - if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) { + if (jarvis_decode(ctx, jarvis_batch_get_one(&embd[i], n_eval))) { LOG_ERR("%s : failed to eval\n", __func__); return 1; } @@ -411,7 +411,7 @@ int main(int argc, char ** argv) { embd.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - const llama_token id = common_sampler_sample(smpl, ctx, -1); + const jarvis_token id = common_sampler_sample(smpl, ctx, -1); common_sampler_accept(smpl, id, true); @@ -465,10 +465,10 @@ int main(int argc, char ** argv) { // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { // deal with eot token in infill mode - if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){ + if ((common_sampler_last(smpl) == jarvis_token_eot(model) || is_interacting) && params.interactive){ if (is_interacting && !params.interactive_first) { // print an eot token - LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str()); + LOG("%s", common_token_to_piece(ctx, jarvis_token_eot(model)).c_str()); } LOG("\n"); console::set_display(console::user_input); @@ -505,16 +505,16 @@ int main(int argc, 
char ** argv) { } // tokenize new prefix and suffix - std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); - std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); + std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); + std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); - inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model)); - inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model)); + inp_pfx.insert(inp_pfx.begin(), jarvis_token_fim_pre(model)); + inp_sfx.insert(inp_sfx.begin(), jarvis_token_fim_suf(model)); embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx; if (add_bos) { - embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + embd_inp.insert(embd_inp.begin(), jarvis_token_bos(model)); } embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); @@ -529,7 +529,7 @@ int main(int argc, char ** argv) { is_interacting = false; } // deal with end of generation tokens in interactive mode - else if (llama_token_is_eog(model, common_sampler_last(smpl))) { + else if (jarvis_token_is_eog(model, common_sampler_last(smpl))) { LOG_DBG("found EOS token\n"); if (params.interactive) { @@ -545,7 +545,7 @@ int main(int argc, char ** argv) { if (params.input_prefix_bos) { LOG_DBG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_token_bos(model)); + embd_inp.push_back(jarvis_token_bos(model)); } std::string buffer; @@ -585,7 +585,7 @@ int main(int argc, char ** argv) { embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); for (size_t i = original_size; i < embd_inp.size(); ++i) { - const llama_token token = embd_inp[i]; + const jarvis_token token = embd_inp[i]; output_tokens.push_back(token); output_ss << common_token_to_piece(ctx, token); } @@ -608,7 +608,7 @@ int main(int argc, char ** argv) { } // end of generation - if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) { + if (!embd.empty() && jarvis_token_is_eog(model, embd.back()) && !params.interactive) { break; } @@ -620,18 +620,18 @@ int main(int argc, char ** argv) { } } if (!params.interactive && n_remain <= 0) { - LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str()); + LOG("%s", common_token_to_piece(ctx, jarvis_token_eot(model)).c_str()); } LOG("\n"); common_perf_print(ctx, smpl); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); - llama_free(ctx); - llama_free_model(model); + jarvis_free(ctx); + jarvis_free_model(model); common_sampler_free(smpl); - llama_backend_free(); + jarvis_backend_free(); return 0; } diff --git a/examples/jarvis-bench/CMakeLists.txt b/examples/jarvis-bench/CMakeLists.txt new file mode 100644 index 0000000000000..e081060a8a1be --- /dev/null +++ b/examples/jarvis-bench/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET jarvis-bench) +add_executable(${TARGET} jarvis-bench.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/llama-bench/README.md b/examples/jarvis-bench/README.md similarity index 56% rename from examples/llama-bench/README.md rename to examples/jarvis-bench/README.md index 6bbe4bb75fbf8..ca550fe7eff86 100644 --- a/examples/llama-bench/README.md +++ b/examples/jarvis-bench/README.md @@ -1,6 +1,6 @@ -# llama.cpp/examples/llama-bench +# jarvis.cpp/examples/jarvis-bench -Performance testing tool for llama.cpp. 
+Performance testing tool for jarvis.cpp. ## Table of contents @@ -20,7 +20,7 @@ Performance testing tool for llama.cpp. ## Syntax ``` -usage: ./llama-bench [options] +usage: ./jarvis-bench [options] options: -h, --help @@ -56,7 +56,7 @@ options: Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. ``` -llama-bench can perform three types of tests: +jarvis-bench can perform three types of tests: - Prompt processing (pp): processing a prompt in batches (`-p`) - Text generation (tg): generating a sequence of tokens (`-n`) @@ -77,108 +77,108 @@ Note: ### Text generation with different models ```sh -$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512 +$ ./jarvis-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512 ``` | model | size | params | backend | ngl | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 | +| jarvis 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 | +| jarvis 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 | +| jarvis 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 | ### Prompt processing with different batch sizes ```sh -$ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024 +$ ./jarvis-bench -n 0 -p 1024 -b 128,256,512,1024 ``` | model | size | params | backend | ngl | n_batch | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 | ### Different numbers of threads ```sh -$ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32 +$ ./jarvis-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32 ``` | model | size | params | backend | threads | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | ---------: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 
| -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | tg 16 | 12.22 ± 0.07 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 || +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | tg 16 | 12.22 ± 0.07 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 || ### Different numbers of layers offloaded to the GPU ```sh -$ ./llama-bench -ngl 10,20,30,31,32,33,34,35 +$ ./jarvis-bench -ngl 10,20,30,31,32,33,34,35 ``` | model | size | params | backend | ngl | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 373.36 ± 2.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | tg 128 | 54.78 ± 0.65 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 
373.36 ± 2.25 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | tg 128 | 54.78 ± 0.65 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 | ## Output formats -By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option. +By default, jarvis-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option. ### Markdown ```sh -$ ./llama-bench -o md +$ ./jarvis-bench -o md ``` | model | size | params | backend | ngl | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 | ### CSV ```sh -$ ./llama-bench -o csv +$ ./jarvis-bench -o csv ``` ```csv build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961" -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342" +"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","jarvis 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961" +"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","jarvis 7B mostly 
Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342" ``` ### JSON ```sh -$ ./llama-bench -o json +$ ./jarvis-bench -o json ``` ```json @@ -193,7 +193,7 @@ $ ./llama-bench -o json "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", "gpu_info": "NVIDIA GeForce RTX 3090 Ti", "model_filename": "models/7B/ggml-model-q4_0.gguf", - "model_type": "llama 7B mostly Q4_0", + "model_type": "jarvis 7B mostly Q4_0", "model_size": 3825065984, "model_n_params": 6738415616, "n_batch": 512, @@ -223,7 +223,7 @@ $ ./llama-bench -o json "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", "gpu_info": "NVIDIA GeForce RTX 3090 Ti", "model_filename": "models/7B/ggml-model-q4_0.gguf", - "model_type": "llama 7B mostly Q4_0", + "model_type": "jarvis 7B mostly Q4_0", "model_size": 3825065984, "model_n_params": 6738415616, "n_batch": 512, @@ -250,12 +250,12 @@ $ ./llama-bench -o json ### JSONL ```sh -$ ./llama-bench -o jsonl +$ ./jarvis-bench -o jsonl ``` ```json lines -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]} -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]} +{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"jarvis 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]} +{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"jarvis 7B mostly 
Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]} ``` @@ -264,7 +264,7 @@ $ ./llama-bench -o jsonl SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database. ```sh -$ ./llama-bench -o sql +$ ./jarvis-bench -o sql ``` ```sql @@ -297,6 +297,6 @@ CREATE TABLE IF NOT EXISTS test ( stddev_ts REAL ); -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634'); -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692'); +INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'jarvis 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634'); +INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'jarvis 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692'); ``` diff --git a/examples/llama-bench/llama-bench.cpp b/examples/jarvis-bench/jarvis-bench.cpp similarity index 93% rename from examples/llama-bench/llama-bench.cpp rename to examples/jarvis-bench/jarvis-bench.cpp index 4a8ea96764630..c1a3368a09f96 
100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/jarvis-bench/jarvis-bench.cpp @@ -19,7 +19,7 @@ #include #include "ggml.h" -#include "llama.h" +#include "jarvis.h" #include "common.h" #include "ggml-cuda.h" #include "ggml-sycl.h" @@ -207,11 +207,11 @@ static bool output_format_from_str(const std::string & s, output_formats & forma return true; } -static const char * split_mode_str(llama_split_mode mode) { +static const char * split_mode_str(jarvis_split_mode mode) { switch (mode) { - case LLAMA_SPLIT_MODE_NONE: return "none"; - case LLAMA_SPLIT_MODE_LAYER: return "layer"; - case LLAMA_SPLIT_MODE_ROW: return "row"; + case JARVIS_SPLIT_MODE_NONE: return "none"; + case JARVIS_SPLIT_MODE_LAYER: return "layer"; + case JARVIS_SPLIT_MODE_ROW: return "row"; default: GGML_ABORT("invalid split mode"); } } @@ -237,7 +237,7 @@ struct cmd_params { std::vector poll; std::vector n_gpu_layers; std::vector rpc_servers; - std::vector split_mode; + std::vector split_mode; std::vector main_gpu; std::vector no_kv_offload; std::vector flash_attn; @@ -269,11 +269,11 @@ static const cmd_params cmd_params_defaults = { /* poll */ {50}, /* n_gpu_layers */ {99}, /* rpc_servers */ {""}, - /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, + /* split_mode */ {JARVIS_SPLIT_MODE_LAYER}, /* main_gpu */ {0}, /* no_kv_offload */ {false}, /* flash_attn */ {false}, - /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, + /* tensor_split */ {std::vector(jarvis_max_devices(), 0.0f)}, /* use_mmap */ {true}, /* embeddings */ {false}, /* numa */ GGML_NUMA_STRATEGY_DISABLED, @@ -304,7 +304,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str()); printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); - if (llama_supports_rpc()) { + if (jarvis_supports_rpc()) { printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); } printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); @@ -497,7 +497,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); - } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) { + } else if (jarvis_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) { if (++i >= argc) { invalid_param = true; break; @@ -509,15 +509,15 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } auto p = string_split(argv[i], split_delim); - std::vector modes; + std::vector modes; for (const auto & m : p) { - llama_split_mode mode; + jarvis_split_mode mode; if (m == "none") { - mode = LLAMA_SPLIT_MODE_NONE; + mode = JARVIS_SPLIT_MODE_NONE; } else if (m == "layer") { - mode = LLAMA_SPLIT_MODE_LAYER; + mode = JARVIS_SPLIT_MODE_LAYER; } else if (m == "row") { - mode = LLAMA_SPLIT_MODE_ROW; + mode = JARVIS_SPLIT_MODE_ROW; } else { invalid_param = true; break; @@ -583,10 +583,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { const std::regex regex{R"([;/]+)"}; std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1}; std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= llama_max_devices()); + GGML_ASSERT(split_arg.size() <= jarvis_max_devices()); - std::vector 
tensor_split(llama_max_devices()); - for (size_t i = 0; i < llama_max_devices(); ++i) { + std::vector tensor_split(jarvis_max_devices()); + for (size_t i = 0; i < jarvis_max_devices(); ++i) { if (i < split_arg.size()) { tensor_split[i] = std::stof(split_arg[i]); } else { @@ -680,7 +680,7 @@ struct cmd_params_instance { int poll; int n_gpu_layers; std::string rpc_servers; - llama_split_mode split_mode; + jarvis_split_mode split_mode; int main_gpu; bool no_kv_offload; bool flash_attn; @@ -688,8 +688,8 @@ struct cmd_params_instance { bool use_mmap; bool embeddings; - llama_model_params to_llama_mparams() const { - llama_model_params mparams = llama_model_default_params(); + jarvis_model_params to_jarvis_mparams() const { + jarvis_model_params mparams = jarvis_model_default_params(); mparams.n_gpu_layers = n_gpu_layers; if (!rpc_servers.empty()) { @@ -713,8 +713,8 @@ struct cmd_params_instance { tensor_split == other.tensor_split; } - llama_context_params to_llama_cparams() const { - llama_context_params cparams = llama_context_default_params(); + jarvis_context_params to_jarvis_cparams() const { + jarvis_context_params cparams = jarvis_context_default_params(); cparams.n_ctx = n_prompt + n_gen; cparams.n_batch = n_batch; @@ -868,7 +868,7 @@ struct test { ggml_type type_k; ggml_type type_v; int n_gpu_layers; - llama_split_mode split_mode; + jarvis_split_mode split_mode; int main_gpu; bool no_kv_offload; bool flash_attn; @@ -880,13 +880,13 @@ struct test { std::string test_time; std::vector samples_ns; - test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) { + test(const cmd_params_instance & inst, const jarvis_model * lmodel, const jarvis_context * ctx) { model_filename = inst.model; char buf[128]; - llama_model_desc(lmodel, buf, sizeof(buf)); + jarvis_model_desc(lmodel, buf, sizeof(buf)); model_type = buf; - model_size = llama_model_size(lmodel); - model_n_params = llama_model_n_params(lmodel); + model_size = jarvis_model_size(lmodel); + model_n_params = jarvis_model_n_params(lmodel); n_batch = inst.n_batch; n_ubatch = inst.n_ubatch; n_threads = inst.n_threads; @@ -1008,7 +1008,7 @@ struct test { std::vector get_values() const { std::string tensor_split_str; int max_nonzero = 0; - for (size_t i = 0; i < llama_max_devices(); i++) { + for (size_t i = 0; i < jarvis_max_devices(); i++) { if (tensor_split[i] > 0) { max_nonzero = i; } @@ -1050,8 +1050,8 @@ struct test { } }; -const std::string test::build_commit = LLAMA_COMMIT; -const int test::build_number = LLAMA_BUILD_NUMBER; +const std::string test::build_commit = JARVIS_COMMIT; +const int test::build_number = JARVIS_BUILD_NUMBER; const bool test::cuda = !!ggml_cpu_has_cuda(); const bool test::vulkan = !!ggml_cpu_has_vulkan(); const bool test::kompute = !!ggml_cpu_has_kompute(); @@ -1428,45 +1428,45 @@ struct sql_printer : public printer { } }; -static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) { - llama_set_n_threads(ctx, n_threads, n_threads); +static void test_prompt(jarvis_context * ctx, int n_prompt, int n_batch, int n_threads) { + jarvis_set_n_threads(ctx, n_threads, n_threads); - const llama_model * model = llama_get_model(ctx); - const int32_t n_vocab = llama_n_vocab(model); + const jarvis_model * model = jarvis_get_model(ctx); + const int32_t n_vocab = jarvis_n_vocab(model); - std::vector tokens(n_batch); + std::vector tokens(n_batch); int n_processed = 0; while (n_processed < n_prompt) { int n_tokens = std::min(n_prompt - n_processed, n_batch); - tokens[0] = 
n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; + tokens[0] = n_processed == 0 && jarvis_add_bos_token(model) ? jarvis_token_bos(model) : std::rand() % n_vocab; for (int i = 1; i < n_tokens; i++) { tokens[i] = std::rand() % n_vocab; } - llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens)); + jarvis_decode(ctx, jarvis_batch_get_one(tokens.data(), n_tokens)); n_processed += n_tokens; } - llama_synchronize(ctx); + jarvis_synchronize(ctx); } -static void test_gen(llama_context * ctx, int n_gen, int n_threads) { - llama_set_n_threads(ctx, n_threads, n_threads); +static void test_gen(jarvis_context * ctx, int n_gen, int n_threads) { + jarvis_set_n_threads(ctx, n_threads, n_threads); - const llama_model * model = llama_get_model(ctx); - const int32_t n_vocab = llama_n_vocab(model); + const jarvis_model * model = jarvis_get_model(ctx); + const int32_t n_vocab = jarvis_n_vocab(model); - llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; + jarvis_token token = jarvis_add_bos_token(model) ? jarvis_token_bos(model) : std::rand() % n_vocab; for (int i = 0; i < n_gen; i++) { - llama_decode(ctx, llama_batch_get_one(&token, 1)); - llama_synchronize(ctx); + jarvis_decode(ctx, jarvis_batch_get_one(&token, 1)); + jarvis_synchronize(ctx); token = std::rand() % n_vocab; } } -static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) { +static void jarvis_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) { (void) level; (void) text; (void) user_data; @@ -1508,12 +1508,12 @@ int main(int argc, char ** argv) { cmd_params params = parse_cmd_params(argc, argv); - // initialize llama.cpp + // initialize jarvis.cpp if (!params.verbose) { - llama_log_set(llama_null_log_callback, NULL); + jarvis_log_set(jarvis_null_log_callback, NULL); } - llama_backend_init(); - llama_numa_init(params.numa); + jarvis_backend_init(); + jarvis_numa_init(params.numa); set_process_priority(params.prio); @@ -1533,7 +1533,7 @@ int main(int argc, char ** argv) { std::vector params_instances = get_cmd_params_instances(params); - llama_model * lmodel = nullptr; + jarvis_model * lmodel = nullptr; const cmd_params_instance * prev_inst = nullptr; int params_idx = 0; @@ -1541,15 +1541,15 @@ int main(int argc, char ** argv) { for (const auto & inst : params_instances) { params_idx ++; if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count); + fprintf(stderr, "jarvis-bench: benchmark %d/%ld: starting\n", params_idx, params_count); } // keep the same model between tests when possible if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { if (lmodel) { - llama_free_model(lmodel); + jarvis_free_model(lmodel); } - lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams()); + lmodel = jarvis_load_model_from_file(inst.model.c_str(), inst.to_jarvis_mparams()); if (lmodel == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str()); return 1; @@ -1557,16 +1557,16 @@ int main(int argc, char ** argv) { prev_inst = &inst; } - llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams()); + jarvis_context * ctx = jarvis_new_context_with_model(lmodel, inst.to_jarvis_cparams()); if (ctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str()); - llama_free_model(lmodel); + 
jarvis_free_model(lmodel); return 1; } test t(inst, lmodel, ctx); - llama_kv_cache_clear(ctx); + jarvis_kv_cache_clear(ctx); // cool off before the test if (params.delay) { @@ -1588,37 +1588,37 @@ int main(int argc, char ** argv) { exit(1); } - llama_attach_threadpool(ctx, threadpool, NULL); + jarvis_attach_threadpool(ctx, threadpool, NULL); // warmup run if (t.n_prompt > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count); + fprintf(stderr, "jarvis-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count); } //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } if (t.n_gen > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count); + fprintf(stderr, "jarvis-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count); } test_gen(ctx, 1, t.n_threads); } for (int i = 0; i < params.reps; i++) { - llama_kv_cache_clear(ctx); + jarvis_kv_cache_clear(ctx); uint64_t t_start = get_time_ns(); if (t.n_prompt > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps); + fprintf(stderr, "jarvis-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps); } test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } if (t.n_gen > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps); + fprintf(stderr, "jarvis-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps); } test_gen(ctx, t.n_gen, t.n_threads); } @@ -1637,14 +1637,14 @@ int main(int argc, char ** argv) { fflush(p_err->fout); } - llama_perf_context_print(ctx); + jarvis_perf_context_print(ctx); - llama_free(ctx); + jarvis_free(ctx); ggml_threadpool_free(threadpool); } - llama_free_model(lmodel); + jarvis_free_model(lmodel); if (p) { p->print_footer(); @@ -1654,7 +1654,7 @@ int main(int argc, char ** argv) { p_err->print_footer(); } - llama_backend_free(); + jarvis_backend_free(); return 0; } diff --git a/examples/llama.android/.gitignore b/examples/jarvis.android/.gitignore similarity index 100% rename from examples/llama.android/.gitignore rename to examples/jarvis.android/.gitignore diff --git a/examples/llama.android/README.md b/examples/jarvis.android/README.md similarity index 100% rename from examples/llama.android/README.md rename to examples/jarvis.android/README.md diff --git a/examples/llama.android/app/.gitignore b/examples/jarvis.android/app/.gitignore similarity index 100% rename from examples/llama.android/app/.gitignore rename to examples/jarvis.android/app/.gitignore diff --git a/examples/llama.android/app/build.gradle.kts b/examples/jarvis.android/app/build.gradle.kts similarity index 93% rename from examples/llama.android/app/build.gradle.kts rename to examples/jarvis.android/app/build.gradle.kts index 8d1b37195efd4..faf26959b44a1 100644 --- a/examples/llama.android/app/build.gradle.kts +++ b/examples/jarvis.android/app/build.gradle.kts @@ -4,11 +4,11 @@ plugins { } android { - namespace = "com.example.llama" + namespace = "com.example.jarvis" compileSdk = 34 defaultConfig { - applicationId = "com.example.llama" + applicationId = "com.example.jarvis" minSdk = 33 targetSdk = 34 versionCode = 1 @@ -54,7 +54,7 @@ 
dependencies { implementation("androidx.compose.ui:ui-graphics") implementation("androidx.compose.ui:ui-tooling-preview") implementation("androidx.compose.material3:material3") - implementation(project(":llama")) + implementation(project(":jarvis")) testImplementation("junit:junit:4.13.2") androidTestImplementation("androidx.test.ext:junit:1.1.5") androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") diff --git a/examples/llama.android/app/proguard-rules.pro b/examples/jarvis.android/app/proguard-rules.pro similarity index 100% rename from examples/llama.android/app/proguard-rules.pro rename to examples/jarvis.android/app/proguard-rules.pro diff --git a/examples/llama.android/app/src/main/AndroidManifest.xml b/examples/jarvis.android/app/src/main/AndroidManifest.xml similarity index 89% rename from examples/llama.android/app/src/main/AndroidManifest.xml rename to examples/jarvis.android/app/src/main/AndroidManifest.xml index 41a358a299154..fcd605d2484b5 100644 --- a/examples/llama.android/app/src/main/AndroidManifest.xml +++ b/examples/jarvis.android/app/src/main/AndroidManifest.xml @@ -12,13 +12,13 @@ android:label="@string/app_name" android:roundIcon="@mipmap/ic_launcher_round" android:supportsRtl="true" - android:theme="@style/Theme.LlamaAndroid" + android:theme="@style/Theme.JarvisAndroid" > + android:theme="@style/Theme.JarvisAndroid"> diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt b/examples/jarvis.android/app/src/main/java/com/example/jarvis/Downloadable.kt similarity index 99% rename from examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt rename to examples/jarvis.android/app/src/main/java/com/example/jarvis/Downloadable.kt index 78c231ae55d8c..1c8320e7a4f15 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt +++ b/examples/jarvis.android/app/src/main/java/com/example/jarvis/Downloadable.kt @@ -1,4 +1,4 @@ -package com.example.llama +package com.example.jarvis import android.app.DownloadManager import android.net.Uri diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt b/examples/jarvis.android/app/src/main/java/com/example/jarvis/MainActivity.kt similarity index 94% rename from examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt rename to examples/jarvis.android/app/src/main/java/com/example/jarvis/MainActivity.kt index 9da04f7d3c32e..00789cb3bad3e 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt +++ b/examples/jarvis.android/app/src/main/java/com/example/jarvis/MainActivity.kt @@ -1,4 +1,4 @@ -package com.example.llama +package com.example.jarvis import android.app.ActivityManager import android.app.DownloadManager @@ -30,7 +30,7 @@ import androidx.compose.runtime.Composable import androidx.compose.ui.Modifier import androidx.compose.ui.unit.dp import androidx.core.content.getSystemService -import com.example.llama.ui.theme.LlamaAndroidTheme +import com.example.jarvis.ui.theme.JarvisAndroidTheme import java.io.File class MainActivity( @@ -77,9 +77,9 @@ class MainActivity( File(extFilesDir, "phi-2-q4_0.gguf"), ), Downloadable( - "TinyLlama 1.1B (f16, 2.2 GiB)", - Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"), - File(extFilesDir, "tinyllama-1.1-f16.gguf"), + "TinyJarvis 1.1B (f16, 2.2 GiB)", + 
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyjarvis-1.1b/ggml-model-f16.gguf?download=true"), + File(extFilesDir, "tinyjarvis-1.1-f16.gguf"), ), Downloadable( "Phi 2 DPO (Q3_K_M, 1.48 GiB)", @@ -89,7 +89,7 @@ class MainActivity( ) setContent { - LlamaAndroidTheme { + JarvisAndroidTheme { // A surface container using the 'background' color from the theme Surface( modifier = Modifier.fillMaxSize(), diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt b/examples/jarvis.android/app/src/main/java/com/example/jarvis/MainViewModel.kt similarity index 85% rename from examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt rename to examples/jarvis.android/app/src/main/java/com/example/jarvis/MainViewModel.kt index 45ac29938f441..74dba04fa668a 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt +++ b/examples/jarvis.android/app/src/main/java/com/example/jarvis/MainViewModel.kt @@ -1,6 +1,6 @@ -package com.example.llama +package com.example.jarvis -import android.llama.cpp.LLamaAndroid +import android.jarvis.cpp.JarvisAndroid import android.util.Log import androidx.compose.runtime.getValue import androidx.compose.runtime.mutableStateOf @@ -10,7 +10,7 @@ import androidx.lifecycle.viewModelScope import kotlinx.coroutines.flow.catch import kotlinx.coroutines.launch -class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() { +class MainViewModel(private val jarvisAndroid: JarvisAndroid = JarvisAndroid.instance()): ViewModel() { companion object { @JvmStatic private val NanosPerSecond = 1_000_000_000.0 @@ -29,7 +29,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan viewModelScope.launch { try { - llamaAndroid.unload() + jarvisAndroid.unload() } catch (exc: IllegalStateException) { messages += exc.message!! } @@ -45,7 +45,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan messages += "" viewModelScope.launch { - llamaAndroid.send(text) + jarvisAndroid.send(text) .catch { Log.e(tag, "send() failed", it) messages += it.message!! @@ -58,7 +58,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan viewModelScope.launch { try { val start = System.nanoTime() - val warmupResult = llamaAndroid.bench(pp, tg, pl, nr) + val warmupResult = jarvisAndroid.bench(pp, tg, pl, nr) val end = System.nanoTime() messages += warmupResult @@ -71,7 +71,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan return@launch } - messages += llamaAndroid.bench(512, 128, 1, 3) + messages += jarvisAndroid.bench(512, 128, 1, 3) } catch (exc: IllegalStateException) { Log.e(tag, "bench() failed", exc) messages += exc.message!! 
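Note on the `MainViewModel.kt` hunks above: only the package and class names change (`LLamaAndroid` → `JarvisAndroid`); the `load()`, `send()`, `bench()` and `unload()` call sites keep their arguments. The sketch below is not part of the patch — it is an illustrative example, assuming the renamed `android.jarvis.cpp.JarvisAndroid` binding exposes the same suspend/Flow API as before; the function name and model path are placeholders.

```kotlin
// Illustrative sketch only (not part of the diff). Class/package names and call
// signatures mirror the renamed example code above; everything else is a placeholder.
import android.jarvis.cpp.JarvisAndroid
import kotlinx.coroutines.flow.catch
import kotlinx.coroutines.flow.collect

suspend fun smokeTest(modelPath: String) {
    val jarvis = JarvisAndroid.instance()

    jarvis.load(modelPath)                    // same signature as the old LLamaAndroid.load()

    // send() returns a Flow of generated text pieces; collect them as they arrive
    jarvis.send("Hello")
        .catch { e -> println("send() failed: ${e.message}") }
        .collect { piece -> print(piece) }

    // pp, tg, pl, nr — the same arguments the ViewModel passes above
    println(jarvis.bench(512, 128, 1, 3))

    jarvis.unload()
}
```

The only functional difference the rename introduces in these hunks is the identifier change itself; the coroutine and Flow usage is untouched.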
@@ -82,7 +82,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan fun load(pathToModel: String) { viewModelScope.launch { try { - llamaAndroid.load(pathToModel) + jarvisAndroid.load(pathToModel) messages += "Loaded $pathToModel" } catch (exc: IllegalStateException) { Log.e(tag, "load() failed", exc) diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt b/examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Color.kt similarity index 87% rename from examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt rename to examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Color.kt index 40c30e8d97077..84e34456c5b8b 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt +++ b/examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Color.kt @@ -1,4 +1,4 @@ -package com.example.llama.ui.theme +package com.example.jarvis.ui.theme import androidx.compose.ui.graphics.Color diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt b/examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Theme.kt similarity index 97% rename from examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt rename to examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Theme.kt index e742220a8d719..3298e08c63b08 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt +++ b/examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Theme.kt @@ -1,4 +1,4 @@ -package com.example.llama.ui.theme +package com.example.jarvis.ui.theme import android.app.Activity import android.os.Build @@ -38,7 +38,7 @@ private val LightColorScheme = lightColorScheme( ) @Composable -fun LlamaAndroidTheme( +fun JarvisAndroidTheme( darkTheme: Boolean = isSystemInDarkTheme(), // Dynamic color is available on Android 12+ dynamicColor: Boolean = true, diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt b/examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Type.kt similarity index 96% rename from examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt rename to examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Type.kt index 0b87946ca3ab1..bde5dfbb78802 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt +++ b/examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Type.kt @@ -1,4 +1,4 @@ -package com.example.llama.ui.theme +package com.example.jarvis.ui.theme import androidx.compose.material3.Typography import androidx.compose.ui.text.TextStyle diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml b/examples/jarvis.android/app/src/main/res/drawable/ic_launcher_background.xml similarity index 100% rename from examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml rename to examples/jarvis.android/app/src/main/res/drawable/ic_launcher_background.xml diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml b/examples/jarvis.android/app/src/main/res/drawable/ic_launcher_foreground.xml similarity index 100% rename from examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml rename to examples/jarvis.android/app/src/main/res/drawable/ic_launcher_foreground.xml diff --git 
a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml b/examples/jarvis.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml rename to examples/jarvis.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml diff --git a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml b/examples/jarvis.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml rename to examples/jarvis.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp b/examples/jarvis.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp rename to examples/jarvis.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp b/examples/jarvis.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp rename to examples/jarvis.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp diff --git a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp b/examples/jarvis.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp rename to examples/jarvis.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp diff --git a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp b/examples/jarvis.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp rename to examples/jarvis.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp b/examples/jarvis.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp rename to examples/jarvis.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp b/examples/jarvis.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp rename to examples/jarvis.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp diff --git a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp b/examples/jarvis.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp rename to examples/jarvis.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp diff --git a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp b/examples/jarvis.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp rename to examples/jarvis.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp diff --git 
a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp b/examples/jarvis.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp rename to examples/jarvis.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp diff --git a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp b/examples/jarvis.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp rename to examples/jarvis.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp diff --git a/examples/llama.android/app/src/main/res/values/colors.xml b/examples/jarvis.android/app/src/main/res/values/colors.xml similarity index 100% rename from examples/llama.android/app/src/main/res/values/colors.xml rename to examples/jarvis.android/app/src/main/res/values/colors.xml diff --git a/examples/jarvis.android/app/src/main/res/values/strings.xml b/examples/jarvis.android/app/src/main/res/values/strings.xml new file mode 100644 index 0000000000000..be0735465a5da --- /dev/null +++ b/examples/jarvis.android/app/src/main/res/values/strings.xml @@ -0,0 +1,3 @@ + + JarvisAndroid + diff --git a/examples/jarvis.android/app/src/main/res/values/themes.xml b/examples/jarvis.android/app/src/main/res/values/themes.xml new file mode 100644 index 0000000000000..6c7456dea61b0 --- /dev/null +++ b/examples/jarvis.android/app/src/main/res/values/themes.xml @@ -0,0 +1,5 @@ + + + +