diff --git a/.devops/cloud-v-pipeline b/.devops/cloud-v-pipeline index af8c0cea6155c..8ba5f117e5f08 100644 --- a/.devops/cloud-v-pipeline +++ b/.devops/cloud-v-pipeline @@ -7,16 +7,16 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto checkout scm // Clone the repo on Runner } } - stage('Compiling llama.cpp'){ + stage('Compiling jarvis.cpp'){ sh'''#!/bin/bash - make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V + make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling jarvis for RISC-V ''' } - stage('Running llama.cpp'){ + stage('Running jarvis.cpp'){ sh'''#!/bin/bash module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc - qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64 - cat llama_log.txt # Printing results + qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./jarvis-cli -m /home/alitariq/codejarvis-7b.Q4_K_M.gguf -p "Anything" -n 9 > jarvis_log.txt # Running jarvis.cpp on vector qemu-riscv64 + cat jarvis_log.txt # Printing results ''' } } diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile index d5acd35e204d3..16d3d6b947eeb 100644 --- a/.devops/full-cuda.Dockerfile +++ b/.devops/full-cuda.Dockerfile @@ -26,7 +26,7 @@ COPY . . RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ fi && \ - cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake -B build -DGGML_CUDA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release -j$(nproc) && \ cp build/bin/* . diff --git a/.devops/full-musa.Dockerfile b/.devops/full-musa.Dockerfile index 34ba856d3d1ca..51b6061020b5e 100644 --- a/.devops/full-musa.Dockerfile +++ b/.devops/full-musa.Dockerfile @@ -19,7 +19,7 @@ WORKDIR /app COPY . . -RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ +RUN cmake -B build -DGGML_MUSA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release -j$(nproc) && \ cp build/bin/* . diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile index df496bcd2b7ee..620d7d89cf40d 100644 --- a/.devops/full-rocm.Dockerfile +++ b/.devops/full-rocm.Dockerfile @@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. -# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878 # This is mostly tied to rocBLAS supported archs. ARG ROCM_DOCKER_ARCH="\ gfx803 \ @@ -41,7 +41,7 @@ ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ # Enable cURL -ENV LLAMA_CURL=1 +ENV JARVIS_CURL=1 RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile index 2a06f82b738ae..62ee6f5069f00 100644 --- a/.devops/full.Dockerfile +++ b/.devops/full.Dockerfile @@ -15,7 +15,7 @@ WORKDIR /app COPY . . 
-ENV LLAMA_CURL=1 +ENV JARVIS_CURL=1 RUN make -j$(nproc) diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/jarvis-cli-cann.Dockerfile similarity index 93% rename from .devops/llama-cli-cann.Dockerfile rename to .devops/jarvis-cli-cann.Dockerfile index db5ba2f25ea67..99c83c0b15dfd 100644 --- a/.devops/llama-cli-cann.Dockerfile +++ b/.devops/jarvis-cli-cann.Dockerfile @@ -23,11 +23,11 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH RUN echo "Building with static libs" && \ source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \ cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \ - cmake --build build --config Release --target llama-cli + cmake --build build --config Release --target jarvis-cli # TODO: use image with NNRT FROM cosdt/cann:$ASCEND_VERSION AS runtime -COPY --from=build /app/build/bin/llama-cli /llama-cli +COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli ENV LC_ALL=C.utf8 @@ -41,4 +41,4 @@ ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME} -ENTRYPOINT ["/llama-cli" ] +ENTRYPOINT ["/jarvis-cli" ] diff --git a/.devops/llama-cli-cuda.Dockerfile b/.devops/jarvis-cli-cuda.Dockerfile similarity index 82% rename from .devops/llama-cli-cuda.Dockerfile rename to .devops/jarvis-cli-cuda.Dockerfile index b75163b94435a..43f8b2cb9a471 100644 --- a/.devops/llama-cli-cuda.Dockerfile +++ b/.devops/jarvis-cli-cuda.Dockerfile @@ -23,7 +23,7 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ fi && \ cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-cli -j$(nproc) + cmake --build build --config Release --target jarvis-cli -j$(nproc) FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime @@ -31,7 +31,7 @@ RUN apt-get update && \ apt-get install -y libgomp1 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so -COPY --from=build /app/build/src/libllama.so /libllama.so -COPY --from=build /app/build/bin/llama-cli /llama-cli +COPY --from=build /app/build/src/libjarvis.so /libjarvis.so +COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli -ENTRYPOINT [ "/llama-cli" ] +ENTRYPOINT [ "/jarvis-cli" ] diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/jarvis-cli-intel.Dockerfile similarity index 80% rename from .devops/llama-cli-intel.Dockerfile rename to .devops/jarvis-cli-intel.Dockerfile index 79dba06a77d6e..cc3d64afef9df 100644 --- a/.devops/llama-cli-intel.Dockerfile +++ b/.devops/jarvis-cli-intel.Dockerfile @@ -17,12 +17,12 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ echo "Building with static libs" && \ cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \ ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \ - cmake --build build --config Release --target llama-cli + cmake --build build --config Release --target jarvis-cli FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime -COPY --from=build /app/build/bin/llama-cli /llama-cli +COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli ENV LC_ALL=C.utf8 -ENTRYPOINT [ "/llama-cli" ] +ENTRYPOINT [ "/jarvis-cli" ] diff --git a/.devops/llama-cli-musa.Dockerfile b/.devops/jarvis-cli-musa.Dockerfile similarity index 77% rename from .devops/llama-cli-musa.Dockerfile rename to .devops/jarvis-cli-musa.Dockerfile index b5696794f1a56..69d13cc79cada 100644 --- a/.devops/llama-cli-musa.Dockerfile +++ 
b/.devops/jarvis-cli-musa.Dockerfile @@ -16,7 +16,7 @@ WORKDIR /app COPY . . RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-cli -j$(nproc) + cmake --build build --config Release --target jarvis-cli -j$(nproc) FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime @@ -24,7 +24,7 @@ RUN apt-get update && \ apt-get install -y libgomp1 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so -COPY --from=build /app/build/src/libllama.so /libllama.so -COPY --from=build /app/build/bin/llama-cli /llama-cli +COPY --from=build /app/build/src/libjarvis.so /libjarvis.so +COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli -ENTRYPOINT [ "/llama-cli" ] +ENTRYPOINT [ "/jarvis-cli" ] diff --git a/.devops/llama-cli-rocm.Dockerfile b/.devops/jarvis-cli-rocm.Dockerfile similarity index 85% rename from .devops/llama-cli-rocm.Dockerfile rename to .devops/jarvis-cli-rocm.Dockerfile index e60c747bdbf11..2eeb794358221 100644 --- a/.devops/llama-cli-rocm.Dockerfile +++ b/.devops/jarvis-cli-rocm.Dockerfile @@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. -# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878 # This is mostly tied to rocBLAS supported archs. ARG ROCM_DOCKER_ARCH="\ gfx803 \ @@ -40,6 +40,6 @@ ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ -RUN make -j$(nproc) llama-cli +RUN make -j$(nproc) jarvis-cli -ENTRYPOINT [ "/app/llama-cli" ] +ENTRYPOINT [ "/app/jarvis-cli" ] diff --git a/.devops/llama-cli-vulkan.Dockerfile b/.devops/jarvis-cli-vulkan.Dockerfile similarity index 80% rename from .devops/llama-cli-vulkan.Dockerfile rename to .devops/jarvis-cli-vulkan.Dockerfile index 9b0dad8bf7a13..57ebafa9bed2f 100644 --- a/.devops/llama-cli-vulkan.Dockerfile +++ b/.devops/jarvis-cli-vulkan.Dockerfile @@ -15,13 +15,13 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key WORKDIR /app COPY . . RUN cmake -B build -DGGML_VULKAN=1 && \ - cmake --build build --config Release --target llama-cli + cmake --build build --config Release --target jarvis-cli # Clean up WORKDIR / -RUN cp /app/build/bin/llama-cli /llama-cli && \ +RUN cp /app/build/bin/jarvis-cli /jarvis-cli && \ rm -rf /app ENV LC_ALL=C.utf8 -ENTRYPOINT [ "/llama-cli" ] +ENTRYPOINT [ "/jarvis-cli" ] diff --git a/.devops/llama-cli.Dockerfile b/.devops/jarvis-cli.Dockerfile similarity index 72% rename from .devops/llama-cli.Dockerfile rename to .devops/jarvis-cli.Dockerfile index 7f741aa46ecf0..6a3137f281679 100644 --- a/.devops/llama-cli.Dockerfile +++ b/.devops/jarvis-cli.Dockerfile @@ -9,15 +9,15 @@ WORKDIR /app COPY . . 
-RUN make -j$(nproc) llama-cli +RUN make -j$(nproc) jarvis-cli FROM ubuntu:$UBUNTU_VERSION AS runtime RUN apt-get update && \ apt-get install -y libgomp1 -COPY --from=build /app/llama-cli /llama-cli +COPY --from=build /app/jarvis-cli /jarvis-cli ENV LC_ALL=C.utf8 -ENTRYPOINT [ "/llama-cli" ] +ENTRYPOINT [ "/jarvis-cli" ] diff --git a/.devops/llama-cpp-cuda.srpm.spec b/.devops/jarvis-cpp-cuda.srpm.spec similarity index 60% rename from .devops/llama-cpp-cuda.srpm.spec rename to .devops/jarvis-cpp-cuda.srpm.spec index 7425d3a9d7a40..c806963f9eb7a 100644 --- a/.devops/llama-cpp-cuda.srpm.spec +++ b/.devops/jarvis-cpp-cuda.srpm.spec @@ -3,7 +3,7 @@ # Built and maintained by John Boero - boeroboy@gmail.com # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal -# Notes for llama.cpp: +# Notes for jarvis.cpp: # 1. Tags are currently based on hash - which will not sort asciibetically. # We need to declare standard versioning if people want to sort latest releases. # 2. Builds for CUDA/OpenCL support are separate, with different depenedencies. @@ -12,44 +12,44 @@ # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries. # It is up to the user to install the correct vendor-specific support. -Name: llama.cpp-cuda +Name: jarvis.cpp-cuda Version: %( date "+%%Y%%m%%d" ) Release: 1%{?dist} -Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL) +Summary: CPU Inference of JARVIS model in pure C/C++ (no CUDA/OpenCL) License: MIT -Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz +Source0: https://github.com/ggerganov/jarvis.cpp/archive/refs/heads/master.tar.gz BuildRequires: coreutils make gcc-c++ git cuda-toolkit Requires: cuda-toolkit -URL: https://github.com/ggerganov/llama.cpp +URL: https://github.com/ggerganov/jarvis.cpp %define debug_package %{nil} %define source_date_epoch_from_changelog 0 %description -CPU inference for Meta's Lllama2 models using default options. +CPU inference for Meta's Ljarvis2 models using default options. %prep -%setup -n llama.cpp-master +%setup -n jarvis.cpp-master %build make -j GGML_CUDA=1 %install mkdir -p %{buildroot}%{_bindir}/ -cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli -cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server -cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple +cp -p jarvis-cli %{buildroot}%{_bindir}/jarvis-cuda-cli +cp -p jarvis-server %{buildroot}%{_bindir}/jarvis-cuda-server +cp -p jarvis-simple %{buildroot}%{_bindir}/jarvis-cuda-simple mkdir -p %{buildroot}/usr/lib/systemd/system -%{__cat} < %{buildroot}/usr/lib/systemd/system/llamacuda.service +%{__cat} < %{buildroot}/usr/lib/systemd/system/jarviscuda.service [Unit] -Description=Llama.cpp server, CPU only (no GPU support in this build). +Description=Jarvis.cpp server, CPU only (no GPU support in this build). 
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target [Service] Type=simple -EnvironmentFile=/etc/sysconfig/llama -ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS +EnvironmentFile=/etc/sysconfig/jarvis +ExecStart=/usr/bin/jarvis-cuda-server $JARVIS_ARGS ExecReload=/bin/kill -s HUP $MAINPID Restart=never @@ -58,8 +58,8 @@ WantedBy=default.target EOF mkdir -p %{buildroot}/etc/sysconfig -%{__cat} < %{buildroot}/etc/sysconfig/llama -LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin" +%{__cat} < %{buildroot}/etc/sysconfig/jarvis +JARVIS_ARGS="-m /opt/jarvis2/ggml-model-f32.bin" EOF %clean @@ -67,11 +67,11 @@ rm -rf %{buildroot} rm -rf %{_builddir}/* %files -%{_bindir}/llama-cuda-cli -%{_bindir}/llama-cuda-server -%{_bindir}/llama-cuda-simple -/usr/lib/systemd/system/llamacuda.service -%config /etc/sysconfig/llama +%{_bindir}/jarvis-cuda-cli +%{_bindir}/jarvis-cuda-server +%{_bindir}/jarvis-cuda-simple +/usr/lib/systemd/system/jarviscuda.service +%config /etc/sysconfig/jarvis %pre diff --git a/.devops/llama-cpp.srpm.spec b/.devops/jarvis-cpp.srpm.spec similarity index 63% rename from .devops/llama-cpp.srpm.spec rename to .devops/jarvis-cpp.srpm.spec index 4d5560089816c..243e932556298 100644 --- a/.devops/llama-cpp.srpm.spec +++ b/.devops/jarvis-cpp.srpm.spec @@ -3,7 +3,7 @@ # Built and maintained by John Boero - boeroboy@gmail.com # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal -# Notes for llama.cpp: +# Notes for jarvis.cpp: # 1. Tags are currently based on hash - which will not sort asciibetically. # We need to declare standard versioning if people want to sort latest releases. # In the meantime, YYYYMMDD format will be used. @@ -13,45 +13,45 @@ # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries. # It is up to the user to install the correct vendor-specific support. -Name: llama.cpp +Name: jarvis.cpp Version: %( date "+%%Y%%m%%d" ) Release: 1%{?dist} -Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL) +Summary: CPU Inference of JARVIS model in pure C/C++ (no CUDA/OpenCL) License: MIT -Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz +Source0: https://github.com/ggerganov/jarvis.cpp/archive/refs/heads/master.tar.gz BuildRequires: coreutils make gcc-c++ git libstdc++-devel Requires: libstdc++ -URL: https://github.com/ggerganov/llama.cpp +URL: https://github.com/ggerganov/jarvis.cpp %define debug_package %{nil} %define source_date_epoch_from_changelog 0 %description -CPU inference for Meta's Lllama2 models using default options. +CPU inference for Meta's Ljarvis2 models using default options. Models are not included in this package and must be downloaded separately. %prep -%setup -n llama.cpp-master +%setup -n jarvis.cpp-master %build make -j %install mkdir -p %{buildroot}%{_bindir}/ -cp -p llama-cli %{buildroot}%{_bindir}/llama-cli -cp -p llama-server %{buildroot}%{_bindir}/llama-server -cp -p llama-simple %{buildroot}%{_bindir}/llama-simple +cp -p jarvis-cli %{buildroot}%{_bindir}/jarvis-cli +cp -p jarvis-server %{buildroot}%{_bindir}/jarvis-server +cp -p jarvis-simple %{buildroot}%{_bindir}/jarvis-simple mkdir -p %{buildroot}/usr/lib/systemd/system -%{__cat} < %{buildroot}/usr/lib/systemd/system/llama.service +%{__cat} < %{buildroot}/usr/lib/systemd/system/jarvis.service [Unit] -Description=Llama.cpp server, CPU only (no GPU support in this build). +Description=Jarvis.cpp server, CPU only (no GPU support in this build). 
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target [Service] Type=simple -EnvironmentFile=/etc/sysconfig/llama -ExecStart=/usr/bin/llama-server $LLAMA_ARGS +EnvironmentFile=/etc/sysconfig/jarvis +ExecStart=/usr/bin/jarvis-server $JARVIS_ARGS ExecReload=/bin/kill -s HUP $MAINPID Restart=never @@ -60,8 +60,8 @@ WantedBy=default.target EOF mkdir -p %{buildroot}/etc/sysconfig -%{__cat} < %{buildroot}/etc/sysconfig/llama -LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin" +%{__cat} < %{buildroot}/etc/sysconfig/jarvis +JARVIS_ARGS="-m /opt/jarvis2/ggml-model-f32.bin" EOF %clean @@ -69,11 +69,11 @@ rm -rf %{buildroot} rm -rf %{_builddir}/* %files -%{_bindir}/llama-cli -%{_bindir}/llama-server -%{_bindir}/llama-simple -/usr/lib/systemd/system/llama.service -%config /etc/sysconfig/llama +%{_bindir}/jarvis-cli +%{_bindir}/jarvis-server +%{_bindir}/jarvis-simple +/usr/lib/systemd/system/jarvis.service +%config /etc/sysconfig/jarvis %pre diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/jarvis-server-cuda.Dockerfile similarity index 74% rename from .devops/llama-server-cuda.Dockerfile rename to .devops/jarvis-server-cuda.Dockerfile index a40e24205707f..435fe9e8d9bf9 100644 --- a/.devops/llama-server-cuda.Dockerfile +++ b/.devops/jarvis-server-cuda.Dockerfile @@ -22,8 +22,8 @@ COPY . . RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ fi && \ - cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-server -j$(nproc) + cmake -B build -DGGML_CUDA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . 
&& \ + cmake --build build --config Release --target jarvis-server -j$(nproc) FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime @@ -31,12 +31,12 @@ RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 curl COPY --from=build /app/build/ggml/src/libggml.so /libggml.so -COPY --from=build /app/build/src/libllama.so /libllama.so -COPY --from=build /app/build/bin/llama-server /llama-server +COPY --from=build /app/build/src/libjarvis.so /libjarvis.so +COPY --from=build /app/build/bin/jarvis-server /jarvis-server # Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 +ENV JARVIS_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] -ENTRYPOINT [ "/llama-server" ] +ENTRYPOINT [ "/jarvis-server" ] diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/jarvis-server-intel.Dockerfile similarity index 75% rename from .devops/llama-server-intel.Dockerfile rename to .devops/jarvis-server-intel.Dockerfile index 9c355b664f15e..1d3cc936fe00f 100644 --- a/.devops/llama-server-intel.Dockerfile +++ b/.devops/jarvis-server-intel.Dockerfile @@ -15,20 +15,20 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ fi && \ echo "Building with dynamic libs" && \ - cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ - cmake --build build --config Release --target llama-server + cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DJARVIS_CURL=ON ${OPT_SYCL_F16} && \ + cmake --build build --config Release --target jarvis-server FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev curl -COPY --from=build /app/build/bin/llama-server /llama-server +COPY --from=build /app/build/bin/jarvis-server /jarvis-server ENV LC_ALL=C.utf8 # Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 +ENV JARVIS_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] -ENTRYPOINT [ "/llama-server" ] +ENTRYPOINT [ "/jarvis-server" ] diff --git a/.devops/llama-server-musa.Dockerfile b/.devops/jarvis-server-musa.Dockerfile similarity index 68% rename from .devops/llama-server-musa.Dockerfile rename to .devops/jarvis-server-musa.Dockerfile index 193a6d77cb9ed..1c8e8938bde96 100644 --- a/.devops/llama-server-musa.Dockerfile +++ b/.devops/jarvis-server-musa.Dockerfile @@ -15,8 +15,8 @@ WORKDIR /app COPY . . -RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-server -j$(nproc) +RUN cmake -B build -DGGML_MUSA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . 
&& \ + cmake --build build --config Release --target jarvis-server -j$(nproc) FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime @@ -24,12 +24,12 @@ RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 curl COPY --from=build /app/build/ggml/src/libggml.so /libggml.so -COPY --from=build /app/build/src/libllama.so /libllama.so -COPY --from=build /app/build/bin/llama-server /llama-server +COPY --from=build /app/build/src/libjarvis.so /libjarvis.so +COPY --from=build /app/build/bin/jarvis-server /jarvis-server # Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 +ENV JARVIS_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] -ENTRYPOINT [ "/llama-server" ] +ENTRYPOINT [ "/jarvis-server" ] diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/jarvis-server-rocm.Dockerfile similarity index 84% rename from .devops/llama-server-rocm.Dockerfile rename to .devops/jarvis-server-rocm.Dockerfile index 8553af75b61fc..a9192b3dbbc91 100644 --- a/.devops/llama-server-rocm.Dockerfile +++ b/.devops/jarvis-server-rocm.Dockerfile @@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. -# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878 # This is mostly tied to rocBLAS supported archs. ARG ROCM_DOCKER_ARCH="\ gfx803 \ @@ -40,15 +40,15 @@ ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ # Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 +ENV JARVIS_ARG_HOST=0.0.0.0 # Enable cURL -ENV LLAMA_CURL=1 +ENV JARVIS_CURL=1 RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev curl -RUN make -j$(nproc) llama-server +RUN make -j$(nproc) jarvis-server HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] -ENTRYPOINT [ "/app/llama-server" ] +ENTRYPOINT [ "/app/jarvis-server" ] diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/jarvis-server-vulkan.Dockerfile similarity index 75% rename from .devops/llama-server-vulkan.Dockerfile rename to .devops/jarvis-server-vulkan.Dockerfile index 93c5e0c26e691..89811bed3e6ad 100644 --- a/.devops/llama-server-vulkan.Dockerfile +++ b/.devops/jarvis-server-vulkan.Dockerfile @@ -14,18 +14,18 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key # Build it WORKDIR /app COPY . . 
-RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \ - cmake --build build --config Release --target llama-server +RUN cmake -B build -DGGML_VULKAN=1 -DJARVIS_CURL=1 && \ + cmake --build build --config Release --target jarvis-server # Clean up WORKDIR / -RUN cp /app/build/bin/llama-server /llama-server && \ +RUN cp /app/build/bin/jarvis-server /jarvis-server && \ rm -rf /app ENV LC_ALL=C.utf8 # Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 +ENV JARVIS_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] -ENTRYPOINT [ "/llama-server" ] +ENTRYPOINT [ "/jarvis-server" ] diff --git a/.devops/llama-server.Dockerfile b/.devops/jarvis-server.Dockerfile similarity index 73% rename from .devops/llama-server.Dockerfile rename to .devops/jarvis-server.Dockerfile index 02accc85e1368..cc39a213c173e 100644 --- a/.devops/llama-server.Dockerfile +++ b/.devops/jarvis-server.Dockerfile @@ -9,21 +9,21 @@ WORKDIR /app COPY . . -ENV LLAMA_CURL=1 +ENV JARVIS_CURL=1 -RUN make -j$(nproc) llama-server +RUN make -j$(nproc) jarvis-server FROM ubuntu:$UBUNTU_VERSION AS runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 curl -COPY --from=build /app/llama-server /llama-server +COPY --from=build /app/jarvis-server /jarvis-server ENV LC_ALL=C.utf8 # Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 +ENV JARVIS_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] -ENTRYPOINT [ "/llama-server" ] +ENTRYPOINT [ "/jarvis-server" ] diff --git a/.devops/nix/apps.nix b/.devops/nix/apps.nix index 0ecf19fc56d55..af01140753974 100644 --- a/.devops/nix/apps.nix +++ b/.devops/nix/apps.nix @@ -6,10 +6,10 @@ let inherit (config.packages) default; binaries = [ - "llama-cli" - "llama-embedding" - "llama-server" - "llama-quantize" + "jarvis-cli" + "jarvis-embedding" + "jarvis-server" + "jarvis-quantize" ]; mkApp = name: { type = "app"; diff --git a/.devops/nix/docker.nix b/.devops/nix/docker.nix index d607b4575772c..502070aa8a5f2 100644 --- a/.devops/nix/docker.nix +++ b/.devops/nix/docker.nix @@ -2,14 +2,14 @@ lib, dockerTools, buildEnv, - llama-cpp, + jarvis-cpp, interactive ? true, coreutils, }: # A tar that can be fed into `docker load`: # -# $ nix build .#llamaPackages.docker +# $ nix build .#jarvisPackages.docker # $ docker load < result # For details and variations cf. @@ -19,16 +19,16 @@ # Approximate (compressed) sizes, at the time of writing, are: # -# .#llamaPackages.docker: 125M; -# .#llamaPackagesCuda.docker: 537M; -# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M. +# .#jarvisPackages.docker: 125M; +# .#jarvisPackagesCuda.docker: 537M; +# .#legacyPackages.aarch64-linux.jarvisPackagesXavier.docker: 415M. 
dockerTools.buildLayeredImage { - name = llama-cpp.pname; + name = jarvis-cpp.pname; tag = "latest"; contents = - [ llama-cpp ] + [ jarvis-cpp ] ++ lib.optionals interactive [ coreutils dockerTools.binSh diff --git a/.devops/nix/jetson-support.nix b/.devops/nix/jetson-support.nix index 78e2e40e03864..56f4c5b7805a5 100644 --- a/.devops/nix/jetson-support.nix +++ b/.devops/nix/jetson-support.nix @@ -11,10 +11,10 @@ { legacyPackages = let - caps.llamaPackagesXavier = "7.2"; - caps.llamaPackagesOrin = "8.7"; - caps.llamaPackagesTX2 = "6.2"; - caps.llamaPackagesNano = "5.3"; + caps.jarvisPackagesXavier = "7.2"; + caps.jarvisPackagesOrin = "8.7"; + caps.jarvisPackagesTX2 = "6.2"; + caps.jarvisPackagesNano = "5.3"; pkgsFor = cap: @@ -31,9 +31,9 @@ builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps; packages = lib.optionalAttrs (system == "aarch64-linux") { - jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp; - jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp; - jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp; + jetson-xavier = config.legacyPackages.jarvisPackagesXavier.jarvis-cpp; + jetson-orin = config.legacyPackages.jarvisPackagesOrin.jarvis-cpp; + jetson-nano = config.legacyPackages.jarvisPackagesNano.jarvis-cpp; }; }; } diff --git a/.devops/nix/package-gguf-py.nix b/.devops/nix/package-gguf-py.nix index cca2f36a5bd4d..62b622332bf65 100644 --- a/.devops/nix/package-gguf-py.nix +++ b/.devops/nix/package-gguf-py.nix @@ -1,6 +1,6 @@ { lib, - llamaVersion, + jarvisVersion, numpy, tqdm, sentencepiece, @@ -12,7 +12,7 @@ buildPythonPackage { pname = "gguf"; - version = llamaVersion; + version = jarvisVersion; pyproject = true; nativeBuildInputs = [ poetry-core ]; propagatedBuildInputs = [ diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 5d7d7ea5ae2d0..95d44360ceee4 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -33,7 +33,7 @@ useRocm ? config.rocmSupport, enableCurl ? true, useVulkan ? false, - llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake + jarvisVersion ? "0.0.0", # Arbitrary version, substituted by the flake # It's necessary to consistently use backendStdenv when building with CUDA support, # otherwise we get libstdc++ errors downstream. @@ -103,8 +103,8 @@ let in effectiveStdenv.mkDerivation (finalAttrs: { - pname = "llama-cpp${pnameSuffix}"; - version = llamaVersion; + pname = "jarvis-cpp${pnameSuffix}"; + version = jarvisVersion; # Note: none of the files discarded here are visible in the sandbox or # affect the output hash. This also means they can be modified without @@ -132,12 +132,12 @@ effectiveStdenv.mkDerivation (finalAttrs: { --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";" ''; - # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015, + # With PR#6015 https://github.com/ggerganov/jarvis.cpp/pull/6015, # `default.metallib` may be compiled with Metal compiler from XCode # and we need to escape sandbox on MacOS to access Metal compiler. 
# `xcrun` is used find the path of the Metal compiler, which is varible # and not on $PATH - # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion + # see https://github.com/ggerganov/jarvis.cpp/pull/6118 for discussion __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders; nativeBuildInputs = @@ -166,10 +166,10 @@ effectiveStdenv.mkDerivation (finalAttrs: { cmakeFlags = [ - (cmakeBool "LLAMA_BUILD_SERVER" true) + (cmakeBool "JARVIS_BUILD_SERVER" true) (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic)) (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) - (cmakeBool "LLAMA_CURL" enableCurl) + (cmakeBool "JARVIS_CURL" enableCurl) (cmakeBool "GGML_NATIVE" false) (cmakeBool "GGML_BLAS" useBlas) (cmakeBool "GGML_CUDA" useCuda) @@ -205,7 +205,7 @@ effectiveStdenv.mkDerivation (finalAttrs: { # if they haven't been added yet. postInstall = '' mkdir -p $out/include - cp $src/include/llama.h $out/include/ + cp $src/include/jarvis.h $out/include/ ''; meta = { @@ -218,12 +218,12 @@ effectiveStdenv.mkDerivation (finalAttrs: { # overridden by importing Nixpkgs with `allowBroken = true`. broken = (useMetalKit && !effectiveStdenv.isDarwin); - description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}"; - homepage = "https://github.com/ggerganov/llama.cpp/"; + description = "Inference of JARVIS model in pure C/C++${descriptionSuffix}"; + homepage = "https://github.com/ggerganov/jarvis.cpp/"; license = lib.licenses.mit; # Accommodates `nix run` and `lib.getExe` - mainProgram = "llama-cli"; + mainProgram = "jarvis-cli"; # These people might respond, on the best effort basis, if you ping them # in case of Nix-specific regressions or for reviewing Nix-specific PRs. diff --git a/.devops/nix/python-scripts.nix b/.devops/nix/python-scripts.nix index 392e9ffe41bf5..7c91fd9022e48 100644 --- a/.devops/nix/python-scripts.nix +++ b/.devops/nix/python-scripts.nix @@ -9,7 +9,7 @@ }@inputs: let - llama-python-deps = with python3Packages; [ + jarvis-python-deps = with python3Packages; [ numpy sentencepiece transformers @@ -18,7 +18,7 @@ let gguf-py tqdm - # for scripts/compare-llama-bench.py + # for scripts/compare-jarvis-bench.py gitpython tabulate @@ -28,7 +28,7 @@ let ]; - llama-python-test-deps = with python3Packages; [ + jarvis-python-test-deps = with python3Packages; [ # Server bench matplotlib @@ -40,7 +40,7 @@ let in buildPythonPackage ({ - pname = "llama-scripts"; + pname = "jarvis-scripts"; version = "0.0.0"; pyproject = true; @@ -61,6 +61,6 @@ buildPythonPackage ({ src = lib.cleanSource ../../.; }; nativeBuildInputs = [ poetry-core ]; - nativeCheckInputs = llama-python-test-deps; - dependencies = llama-python-deps; + nativeCheckInputs = jarvis-python-test-deps; + dependencies = jarvis-python-deps; }) diff --git a/.devops/nix/scope.nix b/.devops/nix/scope.nix index 478e8c4228afa..4b1b4ff090bd5 100644 --- a/.devops/nix/scope.nix +++ b/.devops/nix/scope.nix @@ -2,7 +2,7 @@ lib, newScope, python3, - llamaVersion ? "0.0.0", + jarvisVersion ? "0.0.0", }: let @@ -21,7 +21,7 @@ in # Cf. 
https://noogle.dev/f/lib/makeScope lib.makeScope newScope (self: { - inherit llamaVersion; + inherit jarvisVersion; gguf-py = self.callPackage ./package-gguf-py.nix { inherit buildPythonPackage @@ -34,7 +34,7 @@ lib.makeScope newScope (self: { ; }; python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; }; - llama-cpp = self.callPackage ./package.nix { }; + jarvis-cpp = self.callPackage ./package.nix { }; docker = self.callPackage ./docker.nix { }; docker-min = self.callPackage ./docker.nix { interactive = false; }; sif = self.callPackage ./sif.nix { }; diff --git a/.devops/nix/sif.nix b/.devops/nix/sif.nix index 7a5e1dd0ffc4c..cc43dd75680e9 100644 --- a/.devops/nix/sif.nix +++ b/.devops/nix/sif.nix @@ -1,7 +1,7 @@ { lib, singularity-tools, - llama-cpp, + jarvis-cpp, bashInteractive, interactive ? false, }: @@ -10,8 +10,8 @@ let optionalInt = cond: x: if cond then x else 0; in singularity-tools.buildImage rec { - inherit (llama-cpp) name; - contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ]; + inherit (jarvis-cpp) name; + contents = [ jarvis-cpp ] ++ lib.optionals interactive [ bashInteractive ]; # These are excessive (but safe) for most variants. Building singularity # images requires superuser privileges, so we build them inside a VM in a @@ -22,6 +22,6 @@ singularity-tools.buildImage rec { # Expected image sizes: # - cpu/blas: 150M, # - cuda, all gencodes: 560M, - diskSize = 4096 + optionalInt llama-cpp.useRocm 16384; + diskSize = 4096 + optionalInt jarvis-cpp.useRocm 16384; memSize = diskSize; } diff --git a/.devops/tools.sh b/.devops/tools.sh index 24dcfd35079cb..a5a56c8231fab 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -10,9 +10,9 @@ shift if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then python3 ./convert_hf_to_gguf.py "$@" elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then - ./llama-quantize "$@" + ./jarvis-quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then - ./llama-cli "$@" + ./jarvis-cli "$@" elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Converting PTH to GGML..." for i in `ls $1/$2/ggml-model-f16.bin*`; do @@ -20,17 +20,17 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Skip model quantization, it already exists: ${i/f16/q4_0}" else echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..." 
- ./llama-quantize "$i" "${i/f16/q4_0}" q4_0 + ./jarvis-quantize "$i" "${i/f16/q4_0}" q4_0 fi done elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then - ./llama-server "$@" + ./jarvis-server "$@" else echo "Unknown command: $arg1" echo "Available commands: " echo " --run (-r): Run a model previously converted into ggml" echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512" - echo " --convert (-c): Convert a llama model into ggml" + echo " --convert (-c): Convert a jarvis model into ggml" echo " ex: --outtype f16 \"/models/7B/\" " echo " --quantize (-q): Optimize with quantization process ggml" echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" diff --git a/.dockerignore b/.dockerignore index 064b7c7be86d0..a07624cfd185e 100644 --- a/.dockerignore +++ b/.dockerignore @@ -12,8 +12,8 @@ build*/ models/* -/llama-cli -/llama-quantize +/jarvis-cli +/jarvis-quantize arm_neon.h compile_commands.json diff --git a/.editorconfig b/.editorconfig index f88f8da67cd78..ec03eee394d99 100644 --- a/.editorconfig +++ b/.editorconfig @@ -24,7 +24,7 @@ insert_final_newline = unset [examples/server/public/*] indent_size = 2 -[examples/llama.swiftui/llama.swiftui.xcodeproj/*] +[examples/jarvis.swiftui/jarvis.swiftui.xcodeproj/*] indent_style = tab [examples/cvector-generator/*.txt] diff --git a/.github/ISSUE_TEMPLATE/01-bug-low.yml b/.github/ISSUE_TEMPLATE/01-bug-low.yml index 54785854f776e..281fdb74ff70f 100644 --- a/.github/ISSUE_TEMPLATE/01-bug-low.yml +++ b/.github/ISSUE_TEMPLATE/01-bug-low.yml @@ -1,5 +1,5 @@ name: Low Severity Bugs -description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches) +description: Used to report low severity bugs in jarvis.cpp (e.g. cosmetic issues, non critical UI glitches) title: "Bug: " labels: ["bug-unconfirmed", "low severity"] body: @@ -8,7 +8,7 @@ body: value: | Thanks for taking the time to fill out this bug report! Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. + and the version of jarvis.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug. - type: textarea id: what-happened @@ -24,7 +24,7 @@ body: label: Name and Version description: Which executable and which version of our software are you running? (use `--version` to get a version string) placeholder: | - $./llama-cli --version + $./jarvis-cli --version version: 2999 (42b4109e) built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu validations: diff --git a/.github/ISSUE_TEMPLATE/02-bug-medium.yml b/.github/ISSUE_TEMPLATE/02-bug-medium.yml index a6285c6f05bac..9a4f564e37aae 100644 --- a/.github/ISSUE_TEMPLATE/02-bug-medium.yml +++ b/.github/ISSUE_TEMPLATE/02-bug-medium.yml @@ -1,5 +1,5 @@ name: Medium Severity Bug -description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable) +description: Used to report medium severity bugs in jarvis.cpp (e.g. Malfunctioning Features but generally still useable) title: "Bug: " labels: ["bug-unconfirmed", "medium severity"] body: @@ -8,7 +8,7 @@ body: value: | Thanks for taking the time to fill out this bug report! Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. + and the version of jarvis.cpp that you are using. 
If possible, please provide a minimal code example that reproduces the bug. - type: textarea id: what-happened @@ -24,7 +24,7 @@ body: label: Name and Version description: Which executable and which version of our software are you running? (use `--version` to get a version string) placeholder: | - $./llama-cli --version + $./jarvis-cli --version version: 2999 (42b4109e) built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu validations: diff --git a/.github/ISSUE_TEMPLATE/03-bug-high.yml b/.github/ISSUE_TEMPLATE/03-bug-high.yml index ff816b93769c3..cfa23d4afbdfb 100644 --- a/.github/ISSUE_TEMPLATE/03-bug-high.yml +++ b/.github/ISSUE_TEMPLATE/03-bug-high.yml @@ -1,5 +1,5 @@ name: High Severity Bug -description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow) +description: Used to report high severity bugs in jarvis.cpp (e.g. Malfunctioning features hindering important common workflow) title: "Bug: " labels: ["bug-unconfirmed", "high severity"] body: @@ -8,7 +8,7 @@ body: value: | Thanks for taking the time to fill out this bug report! Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. + and the version of jarvis.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug. - type: textarea id: what-happened @@ -24,7 +24,7 @@ body: label: Name and Version description: Which executable and which version of our software are you running? (use `--version` to get a version string) placeholder: | - $./llama-cli --version + $./jarvis-cli --version version: 2999 (42b4109e) built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu validations: diff --git a/.github/ISSUE_TEMPLATE/04-bug-critical.yml b/.github/ISSUE_TEMPLATE/04-bug-critical.yml index 7af42a80b3b93..e88543452a79c 100644 --- a/.github/ISSUE_TEMPLATE/04-bug-critical.yml +++ b/.github/ISSUE_TEMPLATE/04-bug-critical.yml @@ -1,5 +1,5 @@ name: Critical Severity Bug -description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss) +description: Used to report critical severity bugs in jarvis.cpp (e.g. Crashing, Corrupted, Dataloss) title: "Bug: " labels: ["bug-unconfirmed", "critical severity"] body: @@ -8,7 +8,7 @@ body: value: | Thanks for taking the time to fill out this bug report! Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. + and the version of jarvis.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug. - type: textarea id: what-happened @@ -24,7 +24,7 @@ body: label: Name and Version description: Which executable and which version of our software are you running? 
(use `--version` to get a version string) placeholder: | - $./llama-cli --version + $./jarvis-cli --version version: 2999 (42b4109e) built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu validations: diff --git a/.github/ISSUE_TEMPLATE/05-enhancement.yml b/.github/ISSUE_TEMPLATE/05-enhancement.yml index 58fca73183d41..b33f44a627b41 100644 --- a/.github/ISSUE_TEMPLATE/05-enhancement.yml +++ b/.github/ISSUE_TEMPLATE/05-enhancement.yml @@ -1,12 +1,12 @@ name: Enhancement -description: Used to request enhancements for llama.cpp +description: Used to request enhancements for jarvis.cpp title: "Feature Request: " labels: ["enhancement"] body: - type: markdown attributes: value: | - [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas) + [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/jarvis.cpp/discussions/categories/ideas) - type: checkboxes id: prerequisites @@ -16,18 +16,18 @@ body: options: - label: I am running the latest code. Mention the version if possible as well. required: true - - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md). + - label: I carefully followed the [README.md](https://github.com/ggerganov/jarvis.cpp/blob/master/README.md). required: true - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed). required: true - - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share. + - label: I reviewed the [Discussions](https://github.com/ggerganov/jarvis.cpp/discussions), and have a new and useful enhancement to share. required: true - type: textarea id: feature-description attributes: label: Feature Description - description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement. + description: Please provide a detailed written description of what you were trying to do, and what you expected `jarvis.cpp` to do as an enhancement. placeholder: Detailed description of the enhancement validations: required: true @@ -36,7 +36,7 @@ body: id: motivation attributes: label: Motivation - description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users. + description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `jarvis.cpp` users. 
placeholder: Explanation of why this feature is needed and its benefits validations: required: true diff --git a/.github/ISSUE_TEMPLATE/06-research.yml b/.github/ISSUE_TEMPLATE/06-research.yml index 3ae4e9f8caaa4..51e4baf6fffa7 100644 --- a/.github/ISSUE_TEMPLATE/06-research.yml +++ b/.github/ISSUE_TEMPLATE/06-research.yml @@ -6,7 +6,7 @@ body: - type: markdown attributes: value: | - Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22) + Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22) - type: checkboxes id: research-stage diff --git a/.github/ISSUE_TEMPLATE/07-refactor.yml b/.github/ISSUE_TEMPLATE/07-refactor.yml index 3a68d3d5355d6..0a8a58fccd0ba 100644 --- a/.github/ISSUE_TEMPLATE/07-refactor.yml +++ b/.github/ISSUE_TEMPLATE/07-refactor.yml @@ -6,8 +6,8 @@ body: - type: markdown attributes: value: | - Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered. - Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too. + Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered. + Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/jarvis.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too. - type: textarea id: background-description diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index eb8c4b472df4c..fa85823fcdae0 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,11 +1,11 @@ blank_issues_enabled: true contact_links: - name: Got an idea? - url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas + url: https://github.com/ggerganov/jarvis.cpp/discussions/categories/ideas about: Pop it there. It may then become an enhancement ticket. - name: Got a question? - url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a + url: https://github.com/ggerganov/jarvis.cpp/discussions/categories/q-a about: Ask a question there! - name: Want to contribute? 
- url: https://github.com/ggerganov/llama.cpp/wiki/contribute + url: https://github.com/ggerganov/jarvis.cpp/wiki/contribute about: Head to the contribution guide page of the wiki for areas you can help with diff --git a/.github/labeler.yml b/.github/labeler.yml index 89436740d1ffb..7e5e48b35ac22 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -67,7 +67,7 @@ script: android: - changed-files: - any-glob-to-any-file: - - examples/llama.android/** + - examples/jarvis.android/** server: - changed-files: - any-glob-to-any-file: diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 997c6d9d05397..c1c783730f652 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,6 +1,6 @@ -- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) +- [x] I have read the [contributing guidelines](https://github.com/ggerganov/jarvis.cpp/blob/master/CONTRIBUTING.md) - Self-reported review complexity: - [ ] Low - [ ] Medium diff --git a/.github/workflows/bench.yml.disabled b/.github/workflows/bench.yml.disabled index 1c8787ef78f7e..12f092afcee5f 100644 --- a/.github/workflows/bench.yml.disabled +++ b/.github/workflows/bench.yml.disabled @@ -1,5 +1,5 @@ # TODO: there have been some issues with the workflow, so disabling for now -# https://github.com/ggerganov/llama.cpp/issues/7893 +# https://github.com/ggerganov/jarvis.cpp/issues/7893 # # Benchmark name: Benchmark @@ -27,10 +27,10 @@ on: push: branches: - master - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] + paths: ['jarvis.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] pull_request_target: types: [opened, synchronize, reopened] - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] + paths: ['jarvis.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] schedule: - cron: '04 2 * * *' @@ -113,16 +113,16 @@ jobs: set -eux cmake -B build \ -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CURL=ON \ - -DLLAMA_CUBLAS=ON \ + -DJARVIS_BUILD_SERVER=ON \ + -DJARVIS_CURL=ON \ + -DJARVIS_CUBLAS=ON \ -DCUDAToolkit_ROOT=/usr/local/cuda \ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=75 \ - -DLLAMA_FATAL_WARNINGS=OFF \ - -DLLAMA_ALL_WARNINGS=OFF \ + -DJARVIS_FATAL_WARNINGS=OFF \ + -DJARVIS_ALL_WARNINGS=OFF \ -DCMAKE_BUILD_TYPE=Release; - cmake --build build --config Release -j $(nproc) --target llama-server + cmake --build build --config Release -j $(nproc) --target jarvis-server - name: Download the dataset id: download_dataset @@ -240,7 +240,7 @@ jobs: message: |

- 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+ 📈 **jarvis.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀

@@ -249,9 +249,9 @@ jobs: Expand details for performance related PR only - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }} - - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }} - - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s - - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s + - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.JARVISCPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.JARVISCPP_COMPLETIONS_TRUNCATED_RATE_PASSES }} + - Prompt processing (pp): avg=${{ env.JARVISCPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.JARVISCPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s + - Token generation (tg): avg=${{ env.JARVISCPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.JARVISCPP_TOKENS_SECOND_P_95_ }}tk/s - ${{ env.BENCH_GRAPH_XLABEL }} diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 423173b975897..d73089ed81b2e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -28,9 +28,9 @@ env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GGML_NLOOP: 3 GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 + JARVIS_LOG_COLORS: 1 + JARVIS_LOG_PREFIX: 1 + JARVIS_LOG_TIMESTAMPS: 1 jobs: macOS-latest-cmake-arm64: @@ -55,7 +55,7 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF .. + cmake -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF .. cmake --build . 
--config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -82,14 +82,14 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* + zip -r jarvis-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip - name: llama-bin-macos-arm64.zip + path: jarvis-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip + name: jarvis-bin-macos-arm64.zip macOS-latest-cmake-x64: runs-on: macos-12 @@ -112,8 +112,8 @@ jobs: run: | sysctl -a # Metal is disabled due to intermittent failures with Github runners not having a GPU: - # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF + # https://github.com/ggerganov/jarvis.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 + cmake -B build -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -140,20 +140,20 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* + zip -r jarvis-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip - name: llama-bin-macos-x64.zip + path: jarvis-${{ steps.tag.outputs.name }}-bin-macos-x64.zip + name: jarvis-bin-macos-x64.zip ubuntu-focal-make: runs-on: ubuntu-20.04 env: - LLAMA_NODE_AVAILABLE: true - LLAMA_PYTHON_AVAILABLE: true + JARVIS_NODE_AVAILABLE: true + JARVIS_PYTHON_AVAILABLE: true steps: - name: Clone @@ -177,7 +177,7 @@ jobs: - name: Build id: make_build env: - LLAMA_FATAL_WARNINGS: 1 + JARVIS_FATAL_WARNINGS: 1 run: | CC=gcc-8 make -j $(nproc) @@ -204,8 +204,8 @@ jobs: - name: Build id: make_build env: - LLAMA_FATAL_WARNINGS: 1 - LLAMA_CURL: 1 + JARVIS_FATAL_WARNINGS: 1 + JARVIS_CURL: 1 run: | CC=gcc-8 make -j $(nproc) @@ -230,7 +230,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF + cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF cmake --build . 
--config Release -j $(nproc) - name: Test @@ -239,16 +239,16 @@ jobs: cd build ctest -L 'main|curl' --verbose --timeout 900 - - name: Test llama2c conversion - id: llama2c_test + - name: Test jarvis2c conversion + id: jarvis2c_test run: | cd build echo "Fetch tokenizer" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin - echo "Fetch llama2c model" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin - ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf - ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 + wget https://huggingface.co/karpathy/tinyjarviss/resolve/main/stories260K/tok512.bin + echo "Fetch jarvis2c model" + wget https://huggingface.co/karpathy/tinyjarviss/resolve/main/stories260K/stories260K.bin + ./bin/jarvis-convert-jarvis2c-to-ggml --copy-vocab-from-model ./tok512.bin --jarvis2c-model stories260K.bin --jarvis2c-output-model stories260K.gguf + ./bin/jarvis-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 - name: Determine tag name id: tag @@ -268,14 +268,14 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/* + zip -r jarvis-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/* - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip - name: llama-bin-ubuntu-x64.zip + path: jarvis-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip + name: jarvis-bin-ubuntu-x64.zip ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest @@ -304,7 +304,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} cmake --build . --config ${{ matrix.build_type }} -j $(nproc) - name: Build (no OpenMP) @@ -313,7 +313,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF + cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF cmake --build . --config ${{ matrix.build_type }} -j $(nproc) - name: Test @@ -487,7 +487,7 @@ jobs: # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know # how to debug it. - # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124 + # ref: https://github.com/ggerganov/jarvis.cpp/actions/runs/7131777249/job/19420981052#step:5:1124 macOS-latest-make: runs-on: macos-latest @@ -505,7 +505,7 @@ jobs: - name: Build id: make_build env: - LLAMA_FATAL_WARNINGS: 1 + JARVIS_FATAL_WARNINGS: 1 run: | GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu) @@ -517,7 +517,7 @@ jobs: # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know # how to debug it. 
- # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584 + # ref: https://github.com/ggerganov/jarvis.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584 # would be great if we fix these macOS-latest-cmake: runs-on: macos-latest @@ -539,7 +539,7 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF .. + cmake -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL=OFF .. cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -570,9 +570,9 @@ jobs: cd build cmake -G Xcode .. \ -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ + -DJARVIS_BUILD_EXAMPLES=OFF \ + -DJARVIS_BUILD_TESTS=OFF \ + -DJARVIS_BUILD_SERVER=OFF \ -DCMAKE_SYSTEM_NAME=iOS \ -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml @@ -600,9 +600,9 @@ jobs: cd build cmake -G Xcode .. \ -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ + -DJARVIS_BUILD_EXAMPLES=OFF \ + -DJARVIS_BUILD_TESTS=OFF \ + -DJARVIS_BUILD_SERVER=OFF \ -DCMAKE_SYSTEM_NAME=tvOS \ -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml @@ -629,7 +629,7 @@ jobs: - name: xcodebuild for swift package id: xcodebuild run: | - xcodebuild -scheme llama -destination "${{ matrix.destination }}" + xcodebuild -scheme jarvis -destination "${{ matrix.destination }}" - name: Build Swift Example id: make_build_swift_example @@ -705,23 +705,23 @@ jobs: matrix: include: - build: 'noavx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx2-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON' - build: 'avx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx512-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON' - build: 'openblas-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' + defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - build: 'kompute-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' - build: 'vulkan-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON 
-DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON' - build: 'llvm-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' - build: 'msvc-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' steps: - name: Clone @@ -807,7 +807,7 @@ jobs: 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) cd build - $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1 + $env:JARVIS_SKIP_TESTS_SLOW_ON_EMULATOR = 1 & $sde -future -- ctest -L main -C Release --verbose --timeout 900 - name: Determine tag name @@ -827,15 +827,15 @@ jobs: id: pack_artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | - Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\* + Copy-Item LICENSE .\build\bin\Release\jarvis.cpp.txt + 7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\* - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip - name: llama-bin-win-${{ matrix.build }}.zip + path: jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip + name: jarvis-bin-win-${{ matrix.build }}.zip windows-latest-cmake-cuda: runs-on: windows-2019 @@ -865,7 +865,7 @@ jobs: run: | mkdir build cd build - cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON + cmake .. -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml cmake --build . 
--config Release -j ${env:NUMBER_OF_PROCESSORS} @@ -886,28 +886,28 @@ jobs: id: pack_artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\* + 7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\* - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip - name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip + path: jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip + name: jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip - name: Copy and pack Cuda runtime run: | echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}" $dst='.\build\bin\cudart\' robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll - 7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\* + 7z a cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\* - name: Upload Cuda runtime if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip - name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip + path: cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip + name: cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip windows-latest-cmake-sycl: runs-on: windows-latest @@ -963,14 +963,14 @@ jobs: cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin echo "cp oneAPI running time dll files to ./build/bin done" - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* + 7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip - name: llama-bin-win-sycl-x64.zip + path: jarvis-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip + name: jarvis-bin-win-sycl-x64.zip windows-latest-cmake-hip: if: ${{ github.event.inputs.create_release != 'true' }} @@ -1060,13 +1060,13 @@ jobs: - name: Pack artifacts id: pack_artifacts run: | - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\* + 7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\* - name: Upload artifacts uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip - name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip + path: jarvis-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip + name: jarvis-bin-win-hip-x64-${{ matrix.gpu_target }}.zip ios-xcode-build: runs-on: macos-latest @@ -1076,7 +1076,7 @@ jobs: uses: actions/checkout@v4 - name: Build Xcode project - run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme 
llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build + run: xcodebuild -project examples/jarvis.swiftui/jarvis.swiftui.xcodeproj -scheme jarvis.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build android-build: runs-on: ubuntu-latest @@ -1098,7 +1098,7 @@ jobs: - name: Build run: | - cd examples/llama.android + cd examples/jarvis.android ./gradlew build --no-daemon @@ -1261,7 +1261,7 @@ jobs: # sudo apt-get install cmake # # - name: Configure -# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON +# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON # # - name: Build # run: | @@ -1300,7 +1300,7 @@ jobs: # - name: Upload binaries # uses: actions/upload-artifact@v4 # with: -# name: llama-bin-${{ matrix.arch }} +# name: jarvis-bin-${{ matrix.arch }} # path: build/bin/${{ matrix.build }} # # windows-blas: @@ -1339,7 +1339,7 @@ jobs: # run: > # cmake -S . -B ./build -A ${{ matrix.arch }} # -DCMAKE_BUILD_TYPE=${{ matrix.build }} -# -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }} +# -DJARVIS_SUPPORT_OPENBLAS=${{ matrix.blas }} # -DCMAKE_LIBRARY_PATH="$env:blasdir/lib" # # - name: Build @@ -1355,7 +1355,7 @@ jobs: # if: matrix.blas == 'ON' # uses: actions/upload-artifact@v4 # with: -# name: llama-blas-bin-${{ matrix.arch }} +# name: jarvis-blas-bin-${{ matrix.arch }} # path: build/bin/${{ matrix.build }} # # emscripten: diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index a953cdac907ae..fee3e9145be21 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -37,21 +37,21 @@ jobs: strategy: matrix: config: - - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "light", dockerfile: ".devops/jarvis-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "server", dockerfile: ".devops/jarvis-server.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" } + - { tag: "light-cuda", dockerfile: ".devops/jarvis-cli-cuda.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-cuda", dockerfile: ".devops/jarvis-server-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" } + - { tag: "light-musa", dockerfile: ".devops/jarvis-cli-musa.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-musa", dockerfile: ".devops/jarvis-server-musa.Dockerfile", platforms: "linux/amd64" } - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" } # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete - #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - #- { tag: "server-rocm", 
dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } + #- { tag: "light-rocm", dockerfile: ".devops/jarvis-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } + #- { tag: "server-rocm", dockerfile: ".devops/jarvis-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" } + - { tag: "light-intel", dockerfile: ".devops/jarvis-cli-intel.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-intel", dockerfile: ".devops/jarvis-server-intel.Dockerfile", platforms: "linux/amd64" } steps: - name: Check out the repo uses: actions/checkout@v4 diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 368dbdbe5dccc..e3344be63ad39 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -11,7 +11,7 @@ jobs: steps: - uses: actions/checkout@v4 with: - repository: "ggerganov/llama.cpp" + repository: "ggerganov/jarvis.cpp" - uses: actions/labeler@v5 with: configuration-path: '.github/labeler.yml' diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml index 0da6acdf1c81e..7473135ef5c79 100644 --- a/.github/workflows/nix-ci-aarch64.yml +++ b/.github/workflows/nix-ci-aarch64.yml @@ -47,8 +47,8 @@ jobs: extra-conf: | extra-platforms = aarch64-linux extra-system-features = nixos-test kvm - extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - uses: DeterminateSystems/magic-nix-cache-action@v2 with: upstream-cache: https://${{ matrix.cachixName }}.cachix.org @@ -56,7 +56,7 @@ jobs: uses: cachix/cachix-action@v13 with: authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' - name: llama-cpp + name: jarvis-cpp - name: Show all output paths run: > nix run github:nix-community/nix-eval-jobs diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml index 8ecbbe53b4ed1..3a748d9acf4d3 100644 --- a/.github/workflows/nix-ci.yml +++ b/.github/workflows/nix-ci.yml @@ -34,8 +34,8 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} extra-conf: | - extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - uses: DeterminateSystems/magic-nix-cache-action@v2 with: upstream-cache: https://${{ matrix.cachixName }}.cachix.org @@ -61,8 +61,8 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} extra-conf: | - extra-substituters = 
https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - uses: DeterminateSystems/magic-nix-cache-action@v2 with: upstream-cache: https://${{ matrix.cachixName }}.cachix.org @@ -70,7 +70,7 @@ jobs: uses: cachix/cachix-action@v13 with: authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' - name: llama-cpp + name: jarvis-cpp - name: Build run: > nix run github:Mic92/nix-fast-build diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 699ac095d6c83..29943d52e2dc3 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -21,10 +21,10 @@ on: paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] env: - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 - LLAMA_LOG_VERBOSITY: 10 + JARVIS_LOG_COLORS: 1 + JARVIS_LOG_PREFIX: 1 + JARVIS_LOG_TIMESTAMPS: 1 + JARVIS_LOG_VERBOSITY: 10 concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} @@ -41,7 +41,7 @@ jobs: include: - build_type: Release sanitizer: "" - fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken + fail-fast: false # While -DJARVIS_SANITIZE_THREAD=ON is broken steps: - name: Dependencies @@ -99,12 +99,12 @@ jobs: run: | cmake -B build \ -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CURL=ON \ + -DJARVIS_BUILD_SERVER=ON \ + -DJARVIS_CURL=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ + -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON \ -DGGML_OPENMP=OFF ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target jarvis-server - name: Build id: cmake_build @@ -112,11 +112,11 @@ jobs: run: | cmake -B build \ -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CURL=ON \ + -DJARVIS_BUILD_SERVER=ON \ + -DJARVIS_CURL=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server + -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON ; + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target jarvis-server - name: Tests id: server_integration_tests @@ -155,8 +155,8 @@ jobs: - name: Build id: cmake_build run: | - cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server + cmake -B build -DJARVIS_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" + cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target jarvis-server - name: Python setup id: setup_python @@ -180,7 +180,7 @@ jobs: run: | cd examples/server/tests $env:PYTHONIOENCODING = ":replace" - behave.exe --summary --stop --no-capture --exclude 
'issues|wrong_usages|passkey' --tags llama.cpp + behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags jarvis.cpp - name: Slow tests id: server_integration_tests_slow diff --git a/.gitignore b/.gitignore index 1092d097a7542..cf5abf6ff55de 100644 --- a/.gitignore +++ b/.gitignore @@ -48,8 +48,8 @@ build* !build-info.sh !build.zig !docs/build.md -/libllama.so -/llama-* +/libjarvis.so +/jarvis-* /vulkan-shaders-gen android-ndk-* arm_neon.h @@ -57,7 +57,7 @@ cmake-build-* CMakeSettings.json compile_commands.json ggml-metal-embed.metal -llama-batched-swift +jarvis-batched-swift /rpc-server out/ tmp/ @@ -118,7 +118,7 @@ poetry.toml /tests/test-double-float /tests/test-grad0 /tests/test-grammar-parser -/tests/test-llama-grammar +/tests/test-jarvis-grammar /tests/test-opt /tests/test-quantize-fns /tests/test-quantize-perf diff --git a/CMakeLists.txt b/CMakeLists.txt index ef0932a7b9277..db4944fcb677c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. -project("llama.cpp" C CXX) +project("jarvis.cpp" C CXX) include(CheckIncludeFileCXX) #set(CMAKE_WARN_DEPRECATED YES) @@ -18,20 +18,20 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) - set(LLAMA_STANDALONE ON) + set(JARVIS_STANDALONE ON) include(git-vars) # configure project version # TODO else() - set(LLAMA_STANDALONE OFF) + set(JARVIS_STANDALONE OFF) endif() if (EMSCRIPTEN) set(BUILD_SHARED_LIBS_DEFAULT OFF) - option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON) + option(JARVIS_WASM_SINGLE_FILE "jarvis: embed WASM inside the generated jarvis.js" ON) else() if (MINGW) set(BUILD_SHARED_LIBS_DEFAULT OFF) @@ -51,41 +51,41 @@ endif() # # debug -option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) -option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF) +option(JARVIS_ALL_WARNINGS "jarvis: enable all compiler warnings" ON) +option(JARVIS_ALL_WARNINGS_3RD_PARTY "jarvis: enable all compiler warnings in 3rd party libs" OFF) # build -option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF) +option(JARVIS_FATAL_WARNINGS "jarvis: enable -Werror flag" OFF) # sanitizers -option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) -option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) -option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) +option(JARVIS_SANITIZE_THREAD "jarvis: enable thread sanitizer" OFF) +option(JARVIS_SANITIZE_ADDRESS "jarvis: enable address sanitizer" OFF) +option(JARVIS_SANITIZE_UNDEFINED "jarvis: enable undefined sanitizer" OFF) # utils -option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE}) +option(JARVIS_BUILD_COMMON "jarvis: build common utils library" ${JARVIS_STANDALONE}) # extra artifacts -option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) +option(JARVIS_BUILD_TESTS "jarvis: build tests" ${JARVIS_STANDALONE}) +option(JARVIS_BUILD_EXAMPLES "jarvis: build examples" ${JARVIS_STANDALONE}) +option(JARVIS_BUILD_SERVER "jarvis: build server example" ${JARVIS_STANDALONE}) # 3rd party libs -option(LLAMA_CURL "llama: use libcurl to download model 
from an URL" OFF) +option(JARVIS_CURL "jarvis: use libcurl to download model from an URL" OFF) # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) # override ggml options -set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD}) -set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS}) -set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED}) -set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) -set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) +set(GGML_SANITIZE_THREAD ${JARVIS_SANITIZE_THREAD}) +set(GGML_SANITIZE_ADDRESS ${JARVIS_SANITIZE_ADDRESS}) +set(GGML_SANITIZE_UNDEFINED ${JARVIS_SANITIZE_UNDEFINED}) +set(GGML_ALL_WARNINGS ${JARVIS_ALL_WARNINGS}) +set(GGML_FATAL_WARNINGS ${JARVIS_FATAL_WARNINGS}) # change the default for these ggml options -if (NOT DEFINED GGML_LLAMAFILE) - set(GGML_LLAMAFILE_DEFAULT ON) +if (NOT DEFINED GGML_JARVISFILE) + set(GGML_JARVISFILE_DEFAULT ON) endif() if (NOT DEFINED GGML_AMX) @@ -97,23 +97,23 @@ if (NOT DEFINED GGML_CUDA_GRAPHS) endif() # transition helpers -function (llama_option_depr TYPE OLD NEW) +function (jarvis_option_depr TYPE OLD NEW) if (${OLD}) message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n") set(${NEW} ON PARENT_SCOPE) endif() endfunction() -llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA) -llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA) -llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE) -llama_option_depr(WARNING LLAMA_METAL GGML_METAL) -llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY) -llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE) -llama_option_depr(WARNING LLAMA_RPC GGML_RPC) -llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) -llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) -llama_option_depr(WARNING LLAMA_CANN GGML_CANN) +jarvis_option_depr(FATAL_ERROR JARVIS_CUBLAS GGML_CUDA) +jarvis_option_depr(WARNING JARVIS_CUDA GGML_CUDA) +jarvis_option_depr(WARNING JARVIS_KOMPUTE GGML_KOMPUTE) +jarvis_option_depr(WARNING JARVIS_METAL GGML_METAL) +jarvis_option_depr(WARNING JARVIS_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY) +jarvis_option_depr(WARNING JARVIS_NATIVE GGML_NATIVE) +jarvis_option_depr(WARNING JARVIS_RPC GGML_RPC) +jarvis_option_depr(WARNING JARVIS_SYCL GGML_SYCL) +jarvis_option_depr(WARNING JARVIS_SYCL_F16 GGML_SYCL_F16) +jarvis_option_depr(WARNING JARVIS_CANN GGML_CANN) # # build the library @@ -132,18 +132,18 @@ add_subdirectory(src) include(GNUInstallDirs) include(CMakePackageConfigHelpers) -set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER}) -set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT}) -set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER}) +set(JARVIS_BUILD_NUMBER ${BUILD_NUMBER}) +set(JARVIS_BUILD_COMMIT ${BUILD_COMMIT}) +set(JARVIS_INSTALL_VERSION 0.0.${BUILD_NUMBER}) -set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files") -set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") -set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") +set(JARVIS_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files") +set(JARVIS_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") +set(JARVIS_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") # At the moment some compile definitions are placed within the ggml/src # directory but not exported on the `ggml` target. 
This could be improved by -# determining _precisely_ which defines are necessary for the llama-config +# determining _precisely_ which defines are necessary for the jarvis-config # package. # set(GGML_TRANSIENT_DEFINES) @@ -158,25 +158,25 @@ if (GGML_TARGET_DEFINES) endif() get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES) -set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h) -install(TARGETS llama LIBRARY PUBLIC_HEADER) +set_target_properties(jarvis PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/jarvis.h) +install(TARGETS jarvis LIBRARY PUBLIC_HEADER) configure_package_config_file( - ${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake - INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama - PATH_VARS LLAMA_INCLUDE_INSTALL_DIR - LLAMA_LIB_INSTALL_DIR - LLAMA_BIN_INSTALL_DIR ) + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/jarvis-config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/jarvis-config.cmake + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/jarvis + PATH_VARS JARVIS_INCLUDE_INSTALL_DIR + JARVIS_LIB_INSTALL_DIR + JARVIS_BIN_INSTALL_DIR ) write_basic_package_version_file( - ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake - VERSION ${LLAMA_INSTALL_VERSION} + ${CMAKE_CURRENT_BINARY_DIR}/jarvis-version.cmake + VERSION ${JARVIS_INSTALL_VERSION} COMPATIBILITY SameMajorVersion) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake - ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/jarvis-config.cmake + ${CMAKE_CURRENT_BINARY_DIR}/jarvis-version.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/jarvis) install( FILES convert_hf_to_gguf.py @@ -190,27 +190,27 @@ install( WORLD_EXECUTE DESTINATION ${CMAKE_INSTALL_BINDIR}) -configure_file(cmake/llama.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/llama.pc" +configure_file(cmake/jarvis.pc.in + "${CMAKE_CURRENT_BINARY_DIR}/jarvis.pc" @ONLY) -install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc" +install(FILES "${CMAKE_CURRENT_BINARY_DIR}/jarvis.pc" DESTINATION lib/pkgconfig) # # utils, programs, examples and tests # -if (LLAMA_BUILD_COMMON) +if (JARVIS_BUILD_COMMON) add_subdirectory(common) endif() -if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION) +if (JARVIS_BUILD_COMMON AND JARVIS_BUILD_TESTS AND NOT CMAKE_JS_VERSION) include(CTest) add_subdirectory(tests) endif() -if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES) +if (JARVIS_BUILD_COMMON AND JARVIS_BUILD_EXAMPLES) add_subdirectory(examples) add_subdirectory(pocs) endif() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4c882c254cac5..d24987c935c10 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,7 +11,7 @@ - Squash-merge PRs - Use the following format for the squashed commit title: ` : (#)`. For example: `utils : fix typo in utils.py (#1234)` -- Optionally pick a `` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules +- Optionally pick a `` from here: https://github.com/ggerganov/jarvis.cpp/wiki/Modules # Coding guidelines @@ -22,7 +22,7 @@ - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a` - Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963) - Tensors store data in row-major order. 
We refer to dimension 0 as columns, 1 as rows, 2 as matrices -- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$ +- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/jarvis.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$ ![matmul](media/matmul.png) @@ -30,4 +30,4 @@ The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects: -https://github.com/ggerganov/llama.cpp/projects +https://github.com/ggerganov/jarvis.cpp/projects diff --git a/Makefile b/Makefile index 719f45d167463..ad411dbdf8d18 100644 --- a/Makefile +++ b/Makefile @@ -1,44 +1,44 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ libllava.a \ - llama-baby-llama \ - llama-batched \ - llama-batched-bench \ - llama-bench \ - llama-cli \ - llama-convert-llama2c-to-ggml \ - llama-embedding \ - llama-eval-callback \ - llama-export-lora \ - llama-gbnf-validator \ - llama-gguf \ - llama-gguf-hash \ - llama-gguf-split \ - llama-gritlm \ - llama-imatrix \ - llama-infill \ - llama-llava-cli \ - llama-minicpmv-cli\ - llama-lookahead \ - llama-lookup \ - llama-lookup-create \ - llama-lookup-merge \ - llama-lookup-stats \ - llama-parallel \ - llama-passkey \ - llama-perplexity \ - llama-q8dot \ - llama-quantize \ - llama-quantize-stats \ - llama-retrieval \ - llama-save-load-state \ - llama-server \ - llama-simple \ - llama-speculative \ - llama-tokenize \ - llama-vdot \ - llama-cvector-generator \ - llama-gen-docs \ + jarvis-baby-jarvis \ + jarvis-batched \ + jarvis-batched-bench \ + jarvis-bench \ + jarvis-cli \ + jarvis-convert-jarvis2c-to-ggml \ + jarvis-embedding \ + jarvis-eval-callback \ + jarvis-export-lora \ + jarvis-gbnf-validator \ + jarvis-gguf \ + jarvis-gguf-hash \ + jarvis-gguf-split \ + jarvis-gritlm \ + jarvis-imatrix \ + jarvis-infill \ + jarvis-llava-cli \ + jarvis-minicpmv-cli\ + jarvis-lookahead \ + jarvis-lookup \ + jarvis-lookup-create \ + jarvis-lookup-merge \ + jarvis-lookup-stats \ + jarvis-parallel \ + jarvis-passkey \ + jarvis-perplexity \ + jarvis-q8dot \ + jarvis-quantize \ + jarvis-quantize-stats \ + jarvis-retrieval \ + jarvis-save-load-state \ + jarvis-server \ + jarvis-simple \ + jarvis-speculative \ + jarvis-tokenize \ + jarvis-vdot \ + jarvis-cvector-generator \ + jarvis-gen-docs \ tests/test-c.o # Binaries only useful for tests @@ -52,7 +52,7 @@ TEST_TARGETS = \ tests/test-grammar-integration \ tests/test-grammar-parser \ tests/test-json-schema-to-grammar \ - tests/test-llama-grammar \ + tests/test-jarvis-grammar \ tests/test-log \ tests/test-model-load-cancel \ tests/test-opt \ @@ -65,8 +65,8 @@ TEST_TARGETS = \ tests/test-tokenizer-1-spm # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned -LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ +LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-jarvis2c-to-ggml \ + simple batched 
batched-bench save-load-state server gguf gguf-split eval-callback jarvis-bench libllava.a llava-cli baby-jarvis \ retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them. @@ -74,80 +74,80 @@ LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding LEGACY_TARGETS_BUILD = main quantize perplexity embedding server # Deprecation aliases -ifdef LLAMA_CUBLAS -$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.) +ifdef JARVIS_CUBLAS +$(error JARVIS_CUBLAS is removed. Use GGML_CUDA instead.) endif -ifdef LLAMA_CUDA +ifdef JARVIS_CUDA GGML_CUDA := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_KOMPUTE +ifdef JARVIS_KOMPUTE GGML_KOMPUTE := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_METAL +ifdef JARVIS_METAL GGML_METAL := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_RPC +ifdef JARVIS_RPC GGML_RPC := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_SYCL +ifdef JARVIS_SYCL GGML_SYCL := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_SYCL_F16 +ifdef JARVIS_SYCL_F16 GGML_SYCL_F16 := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_OPENBLAS +ifdef JARVIS_OPENBLAS GGML_OPENBLAS := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_OPENBLAS64 +ifdef JARVIS_OPENBLAS64 GGML_OPENBLAS64 := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_BLIS +ifdef JARVIS_BLIS GGML_BLIS := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_NO_LLAMAFILE -GGML_NO_LLAMAFILE := 1 +ifdef JARVIS_NO_JARVISFILE +GGML_NO_JARVISFILE := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_NO_ACCELERATE +ifdef JARVIS_NO_ACCELERATE GGML_NO_ACCELERATE := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_NO_OPENMP +ifdef JARVIS_NO_OPENMP GGML_NO_OPENMP := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_NO_METAL +ifdef JARVIS_NO_METAL GGML_NO_METAL := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_DISABLE_LOGS +ifdef JARVIS_DISABLE_LOGS REMOVE_WARNING := 1 endif -ifdef LLAMA_SERVER_VERBOSE +ifdef JARVIS_SERVER_VERBOSE REMOVE_WARNING := 1 endif @@ -211,8 +211,8 @@ test: $(TEST_TARGETS) @failures=0; \ for test_target in $(TEST_TARGETS); do \ if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \ - ./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \ - ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \ + ./$$test_target $(CURDIR)/models/ggml-vocab-jarvis-spm.gguf; \ + ./$$test_target $(CURDIR)/models/ggml-vocab-jarvis-bpe.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \ @@ -257,7 +257,7 @@ MK_CFLAGS = -std=c11 -fPIC MK_CXXFLAGS = -std=c++11 -fPIC MK_NVCCFLAGS = -std=c++11 -ifdef LLAMA_NO_CCACHE +ifdef JARVIS_NO_CCACHE GGML_NO_CCACHE := 1 DEPRECATE_WARNING := 1 endif @@ -320,7 +320,7 @@ ifdef GGML_SCHED_MAX_COPIES MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(GGML_SCHED_MAX_COPIES) endif -ifdef LLAMA_DEBUG +ifdef JARVIS_DEBUG MK_CFLAGS += -O0 -g MK_CXXFLAGS += -O0 -g MK_LDFLAGS += -g @@ -336,25 +336,25 @@ else MK_NVCCFLAGS += -O3 -g endif -ifdef LLAMA_SANITIZE_THREAD +ifdef JARVIS_SANITIZE_THREAD MK_CFLAGS += -fsanitize=thread -g MK_CXXFLAGS += -fsanitize=thread -g MK_LDFLAGS += -fsanitize=thread -g endif -ifdef LLAMA_SANITIZE_ADDRESS +ifdef JARVIS_SANITIZE_ADDRESS MK_CFLAGS += -fsanitize=address -fno-omit-frame-pointer -g MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g MK_LDFLAGS += -fsanitize=address 
-fno-omit-frame-pointer -g endif -ifdef LLAMA_SANITIZE_UNDEFINED +ifdef JARVIS_SANITIZE_UNDEFINED MK_CFLAGS += -fsanitize=undefined -g MK_CXXFLAGS += -fsanitize=undefined -g MK_LDFLAGS += -fsanitize=undefined -g endif -ifdef LLAMA_SERVER_SSL +ifdef JARVIS_SERVER_SSL MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT MK_LDFLAGS += -lssl -lcrypto endif @@ -381,7 +381,7 @@ MK_CXXFLAGS += \ -Wmissing-declarations \ -Wmissing-noreturn -ifeq ($(LLAMA_FATAL_WARNINGS),1) +ifeq ($(JARVIS_FATAL_WARNINGS),1) MK_CFLAGS += -Werror MK_CXXFLAGS += -Werror endif @@ -420,7 +420,7 @@ ifeq ($(_WIN32),1) LWINSOCK2 := -lws2_32 endif -ifdef LLAMA_GPROF +ifdef JARVIS_GPROF MK_CFLAGS += -pg MK_CXXFLAGS += -pg endif @@ -448,7 +448,7 @@ endif ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))' # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves. # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 - # https://github.com/ggerganov/llama.cpp/issues/2922 + # https://github.com/ggerganov/jarvis.cpp/issues/2922 MK_CFLAGS += -Xassembler -muse-unaligned-vector-move MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move @@ -574,9 +574,9 @@ ifdef GGML_NVPL OBJ_GGML += ggml/src/ggml-blas.o endif # GGML_NVPL -ifndef GGML_NO_LLAMAFILE - MK_CPPFLAGS += -DGGML_USE_LLAMAFILE - OBJ_GGML += ggml/src/llamafile/sgemm.o +ifndef GGML_NO_JARVISFILE + MK_CPPFLAGS += -DGGML_USE_JARVISFILE + OBJ_GGML += ggml/src/jarvisfile/sgemm.o endif ifndef GGML_NO_AMX @@ -627,9 +627,9 @@ ifdef GGML_CUDA OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) OBJ_GGML += $(OBJ_CUDA_TMPL) -ifdef LLAMA_FATAL_WARNINGS +ifdef JARVIS_FATAL_WARNINGS MK_NVCCFLAGS += -Werror all-warnings -endif # LLAMA_FATAL_WARNINGS +endif # JARVIS_FATAL_WARNINGS ifndef GGML_MUSA ifndef JETSON_EOL_MODULE_DETECT @@ -637,9 +637,9 @@ ifndef JETSON_EOL_MODULE_DETECT endif # JETSON_EOL_MODULE_DETECT endif # GGML_MUSA -ifdef LLAMA_DEBUG +ifdef JARVIS_DEBUG MK_NVCCFLAGS += -lineinfo -endif # LLAMA_DEBUG +endif # JARVIS_DEBUG ifdef GGML_CUDA_DEBUG MK_NVCCFLAGS += --device-debug @@ -920,11 +920,11 @@ OBJ_GGML += \ ggml/src/ggml-quants.o \ ggml/src/ggml-aarch64.o -OBJ_LLAMA = \ - src/llama.o \ - src/llama-vocab.o \ - src/llama-grammar.o \ - src/llama-sampling.o \ +OBJ_JARVIS = \ + src/jarvis.o \ + src/jarvis-vocab.o \ + src/jarvis-grammar.o \ + src/jarvis-sampling.o \ src/unicode.o \ src/unicode-data.o @@ -939,19 +939,19 @@ OBJ_COMMON = \ common/build-info.o \ common/json-schema-to-grammar.o -OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON) +OBJ_ALL = $(OBJ_GGML) $(OBJ_JARVIS) $(OBJ_COMMON) LIB_GGML = $(LIB_PRE)ggml$(DSO_EXT) LIB_GGML_S = $(LIB_PRE)ggml.a -LIB_LLAMA = $(LIB_PRE)llama$(DSO_EXT) -LIB_LLAMA_S = $(LIB_PRE)llama.a +LIB_JARVIS = $(LIB_PRE)jarvis$(DSO_EXT) +LIB_JARVIS_S = $(LIB_PRE)jarvis.a LIB_COMMON = $(LIB_PRE)common$(DSO_EXT) LIB_COMMON_S = $(LIB_PRE)common.a -LIB_ALL = $(LIB_GGML) $(LIB_LLAMA) $(LIB_COMMON) -LIB_ALL_S = $(LIB_GGML_S) $(LIB_LLAMA_S) $(LIB_COMMON_S) +LIB_ALL = $(LIB_GGML) $(LIB_JARVIS) $(LIB_COMMON) +LIB_ALL_S = $(LIB_GGML_S) $(LIB_JARVIS_S) $(LIB_COMMON_S) GF_CC := $(CC) include scripts/get-flags.mk @@ -971,8 +971,8 @@ include scripts/get-flags.mk CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic endif -ifdef LLAMA_CURL -override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL +ifdef JARVIS_CURL +override CXXFLAGS := $(CXXFLAGS) -DJARVIS_USE_CURL override LDFLAGS := $(LDFLAGS) -lcurl endif @@ -980,7 +980,7 @@ endif # Print build information # -$(info I llama.cpp build info: ) +$(info I jarvis.cpp build 
info: ) $(info I UNAME_S: $(UNAME_S)) $(info I UNAME_P: $(UNAME_P)) $(info I UNAME_M: $(UNAME_M)) @@ -1009,30 +1009,30 @@ $(info ) ifdef DEPRECATE_WARNING $(info !!! DEPRECATION WARNING !!!) -$(info The following LLAMA_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead) -$(info - LLAMA_CUDA) -$(info - LLAMA_METAL) -$(info - LLAMA_METAL_EMBED_LIBRARY) -$(info - LLAMA_OPENMP) -$(info - LLAMA_RPC) -$(info - LLAMA_SYCL) -$(info - LLAMA_SYCL_F16) -$(info - LLAMA_OPENBLAS) -$(info - LLAMA_OPENBLAS64) -$(info - LLAMA_BLIS) -$(info - LLAMA_NO_LLAMAFILE) -$(info - LLAMA_NO_ACCELERATE) -$(info - LLAMA_NO_OPENMP) -$(info - LLAMA_NO_METAL) -$(info - LLAMA_NO_CCACHE) +$(info The following JARVIS_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead) +$(info - JARVIS_CUDA) +$(info - JARVIS_METAL) +$(info - JARVIS_METAL_EMBED_LIBRARY) +$(info - JARVIS_OPENMP) +$(info - JARVIS_RPC) +$(info - JARVIS_SYCL) +$(info - JARVIS_SYCL_F16) +$(info - JARVIS_OPENBLAS) +$(info - JARVIS_OPENBLAS64) +$(info - JARVIS_BLIS) +$(info - JARVIS_NO_JARVISFILE) +$(info - JARVIS_NO_ACCELERATE) +$(info - JARVIS_NO_OPENMP) +$(info - JARVIS_NO_METAL) +$(info - JARVIS_NO_CCACHE) $(info ) endif ifdef REMOVE_WARNING $(info !!! REMOVAL WARNING !!!) -$(info The following LLAMA_ options have been removed and are no longer supported) -$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418)) -$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418)) +$(info The following JARVIS_ options have been removed and are no longer supported) +$(info - JARVIS_DISABLE_LOGS (https://github.com/ggerganov/jarvis.cpp/pull/9418)) +$(info - JARVIS_SERVER_VERBOSE (https://github.com/ggerganov/jarvis.cpp/pull/9418)) $(info ) endif @@ -1079,13 +1079,13 @@ ggml/src/ggml-blas.o: \ ggml/include/ggml-blas.h $(CXX) $(CXXFLAGS) -c $< -o $@ -ifndef GGML_NO_LLAMAFILE -ggml/src/llamafile/sgemm.o: \ - ggml/src/llamafile/sgemm.cpp \ - ggml/src/llamafile/sgemm.h \ +ifndef GGML_NO_JARVISFILE +ggml/src/jarvisfile/sgemm.o: \ + ggml/src/jarvisfile/sgemm.cpp \ + ggml/src/jarvisfile/sgemm.h \ ggml/include/ggml.h $(CXX) $(CXXFLAGS) -c $< -o $@ -endif # GGML_NO_LLAMAFILE +endif # GGML_NO_JARVISFILE ifndef GGML_NO_AMX ggml/src/ggml-amx.o: \ @@ -1115,7 +1115,7 @@ $(LIB_GGML_S): \ $(OBJ_GGML) ar rcs $(LIB_GGML_S) $^ -# llama +# jarvis src/unicode.o: \ src/unicode.cpp \ @@ -1127,14 +1127,14 @@ src/unicode-data.o: \ src/unicode-data.h $(CXX) $(CXXFLAGS) -c $< -o $@ -src/llama.o: \ - src/llama.cpp \ - src/llama-impl.h \ - src/llama-vocab.h \ - src/llama-grammar.h \ - src/llama-sampling.h \ +src/jarvis.o: \ + src/jarvis.cpp \ + src/jarvis-impl.h \ + src/jarvis-vocab.h \ + src/jarvis-grammar.h \ + src/jarvis-sampling.h \ src/unicode.h \ - include/llama.h \ + include/jarvis.h \ ggml/include/ggml-cuda.h \ ggml/include/ggml-metal.h \ ggml/include/ggml.h \ @@ -1142,37 +1142,37 @@ src/llama.o: \ ggml/include/ggml-backend.h $(CXX) $(CXXFLAGS) -c $< -o $@ -src/llama-vocab.o: \ - src/llama-vocab.cpp \ - src/llama-vocab.h \ - src/llama-impl.h \ - include/llama.h +src/jarvis-vocab.o: \ + src/jarvis-vocab.cpp \ + src/jarvis-vocab.h \ + src/jarvis-impl.h \ + include/jarvis.h $(CXX) $(CXXFLAGS) -c $< -o $@ -src/llama-grammar.o: \ - src/llama-grammar.cpp \ - src/llama-grammar.h \ - src/llama-impl.h \ - src/llama-vocab.h \ - src/llama-sampling.h \ - include/llama.h +src/jarvis-grammar.o: \ + src/jarvis-grammar.cpp \ + src/jarvis-grammar.h \ + src/jarvis-impl.h \ + src/jarvis-vocab.h \ 
+ src/jarvis-sampling.h \ + include/jarvis.h $(CXX) $(CXXFLAGS) -c $< -o $@ -src/llama-sampling.o: \ - src/llama-sampling.cpp \ - src/llama-sampling.h \ - src/llama-impl.h \ - include/llama.h +src/jarvis-sampling.o: \ + src/jarvis-sampling.cpp \ + src/jarvis-sampling.h \ + src/jarvis-impl.h \ + include/jarvis.h $(CXX) $(CXXFLAGS) -c $< -o $@ -$(LIB_LLAMA): \ - $(OBJ_LLAMA) \ +$(LIB_JARVIS): \ + $(OBJ_JARVIS) \ $(LIB_GGML) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) -$(LIB_LLAMA_S): \ - $(OBJ_LLAMA) - ar rcs $(LIB_LLAMA_S) $^ +$(LIB_JARVIS_S): \ + $(OBJ_JARVIS) + ar rcs $(LIB_JARVIS_S) $^ # common @@ -1183,7 +1183,7 @@ common/common.o: \ common/sampling.h \ common/json.hpp \ common/json-schema-to-grammar.h \ - include/llama.h + include/jarvis.h $(CXX) $(CXXFLAGS) -c $< -o $@ common/arg.o: \ @@ -1199,7 +1199,7 @@ common/log.o: \ common/sampling.o: \ common/sampling.cpp \ common/sampling.h \ - include/llama.h + include/jarvis.h $(CXX) $(CXXFLAGS) -c $< -o $@ common/console.o: \ @@ -1224,7 +1224,7 @@ common/ngram-cache.o: \ $(LIB_COMMON): \ $(OBJ_COMMON) \ - $(LIB_LLAMA) \ + $(LIB_JARVIS) \ $(LIB_GGML) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) @@ -1246,7 +1246,7 @@ clean: rm -rvf ggml/*.dll rm -rvf ggml/*.so rm -vrf ggml/src/*.o - rm -rvf ggml/src/llamafile/*.o + rm -rvf ggml/src/jarvisfile/*.o rm -rvf common/build-info.cpp rm -vrf ggml/src/ggml-metal-embed.metal rm -vrf ggml/src/ggml-cuda/*.o @@ -1269,75 +1269,75 @@ clean: # Helper function that replaces .c, .cpp, and .cu file endings with .o: GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1)))) -llama-cli: examples/main/main.cpp \ +jarvis-cli: examples/main/main.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @echo - @echo '==== Run ./llama-cli -h for help. ====' + @echo '==== Run ./jarvis-cli -h for help. 
====' @echo -llama-infill: examples/infill/infill.cpp \ +jarvis-infill: examples/infill/infill.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-simple: examples/simple/simple.cpp \ +jarvis-simple: examples/simple/simple.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-tokenize: examples/tokenize/tokenize.cpp \ +jarvis-tokenize: examples/tokenize/tokenize.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-batched: examples/batched/batched.cpp \ +jarvis-batched: examples/batched/batched.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-batched-bench: examples/batched-bench/batched-bench.cpp \ +jarvis-batched-bench: examples/batched-bench/batched-bench.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-quantize: examples/quantize/quantize.cpp \ +jarvis-quantize: examples/quantize/quantize.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \ +jarvis-quantize-stats: examples/quantize-stats/quantize-stats.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-perplexity: examples/perplexity/perplexity.cpp \ +jarvis-perplexity: examples/perplexity/perplexity.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-imatrix: examples/imatrix/imatrix.cpp \ +jarvis-imatrix: examples/imatrix/imatrix.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-embedding: examples/embedding/embedding.cpp \ +jarvis-embedding: examples/embedding/embedding.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-gritlm: examples/gritlm/gritlm.cpp \ +jarvis-gritlm: examples/gritlm/gritlm.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-save-load-state: examples/save-load-state/save-load-state.cpp \ +jarvis-save-load-state: examples/save-load-state/save-load-state.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-gguf: examples/gguf/gguf.cpp \ +jarvis-gguf: examples/gguf/gguf.cpp \ $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1354,92 +1354,92 @@ examples/gguf-hash/deps/sha256/sha256.o: \ examples/gguf-hash/deps/sha256/sha256.c $(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@ -llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp 
examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\ +jarvis-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-gguf-split: examples/gguf-split/gguf-split.cpp \ +jarvis-gguf-split: examples/gguf-split/gguf-split.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-eval-callback: examples/eval-callback/eval-callback.cpp \ +jarvis-eval-callback: examples/eval-callback/eval-callback.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \ +jarvis-cvector-generator: examples/cvector-generator/cvector-generator.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \ +jarvis-convert-jarvis2c-to-ggml: examples/convert-jarvis2c-to-ggml/convert-jarvis2c-to-ggml.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-bench: examples/llama-bench/llama-bench.cpp \ +jarvis-bench: examples/jarvis-bench/jarvis-bench.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-baby-llama: examples/baby-llama/baby-llama.cpp \ +jarvis-baby-jarvis: examples/baby-jarvis/baby-jarvis.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-export-lora: examples/export-lora/export-lora.cpp \ +jarvis-export-lora: examples/export-lora/export-lora.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-retrieval: examples/retrieval/retrieval.cpp \ +jarvis-retrieval: examples/retrieval/retrieval.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-speculative: examples/speculative/speculative.cpp \ +jarvis-speculative: examples/speculative/speculative.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-parallel: examples/parallel/parallel.cpp \ +jarvis-parallel: examples/parallel/parallel.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-lookahead: examples/lookahead/lookahead.cpp \ +jarvis-lookahead: examples/lookahead/lookahead.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-lookup: examples/lookup/lookup.cpp \ +jarvis-lookup: 
examples/lookup/lookup.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-lookup-create: examples/lookup/lookup-create.cpp \ +jarvis-lookup-create: examples/lookup/lookup-create.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-lookup-merge: examples/lookup/lookup-merge.cpp \ +jarvis-lookup-merge: examples/lookup/lookup-merge.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-lookup-stats: examples/lookup/lookup-stats.cpp \ +jarvis-lookup-stats: examples/lookup/lookup-stats.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-passkey: examples/passkey/passkey.cpp \ +jarvis-passkey: examples/passkey/passkey.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \ +jarvis-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1450,7 +1450,7 @@ rpc-server: examples/rpc/rpc-server.cpp \ $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) endif # GGML_RPC -llama-server: \ +jarvis-server: \ examples/server/server.cpp \ examples/server/utils.hpp \ examples/server/httplib.h \ @@ -1485,7 +1485,7 @@ examples/server/%.hpp: examples/server/public/% Makefile echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ ) > $@ -llama-gen-docs: examples/gen-docs/gen-docs.cpp \ +jarvis-gen-docs: examples/gen-docs/gen-docs.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1499,7 +1499,7 @@ libllava.a: examples/llava/llava.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual -llama-llava-cli: examples/llava/llava-cli.cpp \ +jarvis-llava-cli: examples/llava/llava-cli.cpp \ examples/llava/llava.cpp \ examples/llava/llava.h \ examples/llava/clip.cpp \ @@ -1507,7 +1507,7 @@ llama-llava-cli: examples/llava/llava-cli.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual -llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \ +jarvis-minicpmv-cli: examples/llava/minicpmv-cli.cpp \ examples/llava/llava.cpp \ examples/llava/llava.h \ examples/llava/clip.cpp \ @@ -1542,7 +1542,7 @@ tests/test-arg-parser: tests/test-arg-parser.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-llama-grammar: tests/test-llama-grammar.cpp \ +tests/test-jarvis-grammar: tests/test-jarvis-grammar.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1616,7 +1616,7 @@ tests/test-rope: tests/test-rope.cpp ggml/src/ggml.o \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-c.o: tests/test-c.c include/llama.h +tests/test-c.o: 
tests/test-c.c include/jarvis.h $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ tests/test-backend-ops: tests/test-backend-ops.cpp \ @@ -1643,12 +1643,12 @@ tests/test-chat-template: tests/test-chat-template.cpp \ # PoCs # -llama-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \ +jarvis-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \ $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \ +jarvis-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \ $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1667,17 +1667,17 @@ examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning # Eventually we will want to remove these target from building all the time. main: examples/deprecation-warning/deprecation-warning.o $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) - @echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead." + @echo "NOTICE: The 'main' binary is deprecated. Please use 'jarvis-cli' instead." server: examples/deprecation-warning/deprecation-warning.o $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) - @echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead." + @echo "NOTICE: The 'server' binary is deprecated. Please use 'jarvis-server' instead." quantize: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard quantize)) $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" - @echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead." + @echo "WARNING: The 'quantize' binary is deprecated. Please use 'jarvis-quantize' instead." @echo " Remove the 'quantize' binary to remove this warning." @echo "#########" endif @@ -1686,7 +1686,7 @@ perplexity: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard perplexity)) $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" - @echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead." + @echo "WARNING: The 'perplexity' binary is deprecated. Please use 'jarvis-perplexity' instead." @echo " Remove the 'perplexity' binary to remove this warning." @echo "#########" endif @@ -1695,7 +1695,7 @@ embedding: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard embedding)) $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" - @echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead." + @echo "WARNING: The 'embedding' binary is deprecated. Please use 'jarvis-embedding' instead." @echo " Remove the 'embedding' binary to remove this warning." 
@echo "#########" endif diff --git a/Package.swift b/Package.swift index 3a17e6c349b01..2832bcf5c3caa 100644 --- a/Package.swift +++ b/Package.swift @@ -3,10 +3,10 @@ import PackageDescription var sources = [ - "src/llama.cpp", - "src/llama-vocab.cpp", - "src/llama-grammar.cpp", - "src/llama-sampling.cpp", + "src/jarvis.cpp", + "src/jarvis-vocab.cpp", + "src/jarvis-grammar.cpp", + "src/jarvis-sampling.cpp", "src/unicode.cpp", "src/unicode-data.cpp", "ggml/src/ggml.c", @@ -45,7 +45,7 @@ cSettings.append( #endif let package = Package( - name: "llama", + name: "jarvis", platforms: [ .macOS(.v12), .iOS(.v14), @@ -53,11 +53,11 @@ let package = Package( .tvOS(.v14) ], products: [ - .library(name: "llama", targets: ["llama"]), + .library(name: "jarvis", targets: ["jarvis"]), ], targets: [ .target( - name: "llama", + name: "jarvis", path: ".", exclude: [ "cmake", diff --git a/README.md b/README.md index 8fe1f4b4b6a7a..94bd09da9df8c 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,30 @@ -# llama.cpp +# jarvis.cpp -![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) +![jarvis](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) -[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) -[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp) +[![Server](https://github.com/ggerganov/jarvis.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/jarvis.cpp/actions/workflows/server.yml) +[![Conan Center](https://shields.io/conan/v/jarvis-cpp)](https://conan.io/center/jarvis-cpp) -[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) +[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/jarvis.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/jarvis.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) -Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ +Inference of Meta's [JARVIS](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ ## Recent API changes -- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289) -- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291) +- [Changelog for `libjarvis` API](https://github.com/ggerganov/jarvis.cpp/issues/9289) +- [Changelog for `jarvis-server` REST API](https://github.com/ggerganov/jarvis.cpp/issues/9291) ## Hot topics -- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669** -- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) +- **Hugging Face Inference Endpoints now support GGUF out of the box! 
https://github.com/ggerganov/jarvis.cpp/discussions/9669** +- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/jarvis.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) ---- ## Description -The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide +The main goal of `jarvis.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide variety of hardware - locally and in the cloud. - Plain C/C++ implementation without any dependencies @@ -35,7 +35,7 @@ variety of hardware - locally and in the cloud. - Vulkan and SYCL backend support - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity -Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has +Since its [inception](https://github.com/ggerganov/jarvis.cpp/issues/33#issuecomment-1465108022), the project has improved significantly thanks to many contributions. It is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library. @@ -43,31 +43,31 @@ improved significantly thanks to many contributions. It is the main playground f Typically finetunes of the base models below are supported as well. -- [X] LLaMA 🦙 -- [x] LLaMA 2 🦙🦙 -- [x] LLaMA 3 🦙🦙🦙 +- [X] JARVIS 🦙 +- [x] JARVIS 2 🦙🦙 +- [x] JARVIS 3 🦙🦙🦙 - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral) - [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct) - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon) -- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) +- [X] [Chinese JARVIS / Alpaca](https://github.com/ymcui/Chinese-JARVIS-Alpaca) and [Chinese JARVIS-2 / Alpaca-2](https://github.com/ymcui/Chinese-JARVIS-Alpaca-2) - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) -- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423) +- [X] [BERT](https://github.com/ggerganov/jarvis.cpp/pull/5423) - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft) - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila) -- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) +- [X] [Starcoder models](https://github.com/ggerganov/jarvis.cpp/pull/3187) - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim) -- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417) -- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553) +- [X] [MPT](https://github.com/ggerganov/jarvis.cpp/pull/3417) +- [X] [Bloom](https://github.com/ggerganov/jarvis.cpp/pull/3553) - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi) - [X] [StableLM models](https://huggingface.co/stabilityai) - [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek) - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen) -- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557) +- [x] [PLaMo-13B](https://github.com/ggerganov/jarvis.cpp/pull/3557) - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi) - [x] [GPT-2](https://huggingface.co/gpt2) -- [x] [Orion 
14B](https://github.com/ggerganov/llama.cpp/pull/5118) +- [x] [Orion 14B](https://github.com/ggerganov/jarvis.cpp/pull/5118) - [x] [InternLM2](https://huggingface.co/models?search=internlm2) - [x] [CodeShell](https://github.com/WisdomShell/codeshell) - [x] [Gemma](https://ai.google.dev/gemma) @@ -111,36 +111,36 @@ Typically finetunes of the base models below are supported as well. **Bindings:** -- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) -- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) -- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) -- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) +- Python: [abetlen/jarvis-cpp-python](https://github.com/abetlen/jarvis-cpp-python) +- Go: [go-skynet/go-jarvis.cpp](https://github.com/go-skynet/go-jarvis.cpp) +- Node.js: [withcatai/node-jarvis-cpp](https://github.com/withcatai/node-jarvis-cpp) +- JS/TS (jarvis.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/jarviscpp) - JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli) -- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm) -- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama) -- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) -- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs) -- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) -- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) -- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) +- JavaScript/Wasm (works in browser): [tangledgroup/jarvis-cpp-wasm](https://github.com/tangledgroup/jarvis-cpp-wasm) +- Typescript/Wasm (nicer API, available on npm): [ngxson/wjarvis](https://github.com/ngxson/wjarvis) +- Ruby: [yoshoku/jarvis_cpp.rb](https://github.com/yoshoku/jarvis_cpp.rb) +- Rust (more features): [edgenai/jarvis_cpp-rs](https://github.com/edgenai/jarvis_cpp-rs) +- Rust (nicer API): [mdrokz/rust-jarvis.cpp](https://github.com/mdrokz/rust-jarvis.cpp) +- Rust (more direct bindings): [utilityai/jarvis-cpp-rs](https://github.com/utilityai/jarvis-cpp-rs) +- C#/.NET: [SciSharp/JarvisSharp](https://github.com/SciSharp/JarvisSharp) - C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) -- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj) -- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn) -- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp) -- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig) -- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) -- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326) -- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp) -- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift) -- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama) +- Clojure: 
[phronmophobic/jarvis.clj](https://github.com/phronmophobic/jarvis.clj) +- React Native: [mybigday/jarvis.rn](https://github.com/mybigday/jarvis.rn) +- Java: [kherud/java-jarvis.cpp](https://github.com/kherud/java-jarvis.cpp) +- Zig: [deins/jarvis.cpp.zig](https://github.com/Deins/jarvis.cpp.zig) +- Flutter/Dart: [netdur/jarvis_cpp_dart](https://github.com/netdur/jarvis_cpp_dart) +- PHP (API bindings and features built on top of jarvis.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/jarvis.cpp/pull/6326) +- Guile Scheme: [guile_jarvis_cpp](https://savannah.nongnu.org/projects/guile-jarvis-cpp) +- Swift [srgtuszy/jarvis-cpp-swift](https://github.com/srgtuszy/jarvis-cpp-swift) +- Swift [ShenghaiWang/SwiftJarvis](https://github.com/ShenghaiWang/SwiftJarvis) **UI:** Unless otherwise noted these projects are open-source with permissive licensing: - [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT) -- [iohub/collama](https://github.com/iohub/coLLaMA) +- [iohub/cojarvis](https://github.com/iohub/coJARVIS) - [janhq/jan](https://github.com/janhq/jan) (AGPL) - [nat/openplayground](https://github.com/nat/openplayground) - [Faraday](https://faraday.dev/) (proprietary) @@ -149,9 +149,9 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [ramalama](https://github.com/containers/ramalama) (MIT) - [LocalAI](https://github.com/mudler/LocalAI) (MIT) - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) -- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) +- [Mozilla-Ocho/jarvisfile](https://github.com/Mozilla-Ocho/jarvisfile) - [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) -- [ollama/ollama](https://github.com/ollama/ollama) +- [ojarvis/ojarvis](https://github.com/ojarvis/ojarvis) - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL) - [psugihara/FreeChat](https://github.com/psugihara/FreeChat) - [cztomsik/ava](https://github.com/cztomsik/ava) (MIT) @@ -173,24 +173,24 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [AIKit](https://github.com/sozercan/aikit) (MIT) - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL) - [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT) -- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL) +- [Jarvis Assistant](https://github.com/vietanhdev/jarvis-assistant) (GPL) - [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT) -*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* +*(to have a project listed here, it should clearly state that it depends on `jarvis.cpp`)* **Tools:** - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML -- [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp -- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption +- [akx/ojarvis-dl](https://github.com/akx/ojarvis-dl) – download models from the Ojarvis library to be used directly with jarvis.cpp +- [crashr/gppm](https://github.com/crashr/gppm) – launch jarvis.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption - 
[gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage -- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example) +- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-jarvis-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example) **Infrastructure:** -- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp +- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for jarvis.cpp - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs -- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly +- [jarvis_cpp_canister](https://github.com/onicai/jarvis_cpp_canister) - jarvis.cpp as a smart contract on the Internet Computer, using WebAssembly **Games:** - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you. @@ -198,11 +198,11 @@ Unless otherwise noted these projects are open-source with permissive licensing: ## Demo
-Typical run using LLaMA v2 13B on M2 Ultra +Typical run using JARVIS v2 13B on M2 Ultra ``` -$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -I llama.cpp build info: +$ make -j && ./jarvis-cli -m models/jarvis-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e +I jarvis.cpp build info: I UNAME_S: Darwin I UNAME_P: arm I UNAME_M: arm64 @@ -215,12 +215,12 @@ I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1) make: Nothing to be done for `default'. main: build = 1041 (cf658ad) main: seed = 1692823051 -llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest)) -llama_model_loader: - type f32: 81 tensors -llama_model_loader: - type q4_0: 281 tensors -llama_model_loader: - type q6_K: 1 tensors +jarvis_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/jarvis-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest)) +jarvis_model_loader: - type f32: 81 tensors +jarvis_model_loader: - type q4_0: 281 tensors +jarvis_model_loader: - type q6_K: 1 tensors llm_load_print_meta: format = GGUF V1 (latest) -llm_load_print_meta: arch = llama +llm_load_print_meta: arch = jarvis llm_load_print_meta: vocab type = SPM llm_load_print_meta: n_vocab = 32000 llm_load_print_meta: n_merges = 0 @@ -240,7 +240,7 @@ llm_load_print_meta: freq_scale = 1 llm_load_print_meta: model type = 13B llm_load_print_meta: model ftype = mostly Q4_0 llm_load_print_meta: model size = 13.02 B -llm_load_print_meta: general.name = LLaMA v2 +llm_load_print_meta: general.name = JARVIS v2 llm_load_print_meta: BOS token = 1 '' llm_load_print_meta: EOS token = 2 '' llm_load_print_meta: UNK token = 0 '' @@ -248,8 +248,8 @@ llm_load_print_meta: LF token = 13 '<0x0A>' llm_load_tensors: ggml ctx size = 0.11 MB llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state) ................................................................................................... -llama_new_context_with_model: kv self size = 400.00 MB -llama_new_context_with_model: compute buffer total size = 75.41 MB +jarvis_new_context_with_model: kv self size = 400.00 MB +jarvis_new_context_with_model: compute buffer total size = 75.41 MB system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000 @@ -271,19 +271,19 @@ How does a Website Work? A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable! The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). 
They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking. How to -llama_print_timings: load time = 576.45 ms -llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second) -llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second) -llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second) -llama_print_timings: total time = 25431.49 ms +jarvis_print_timings: load time = 576.45 ms +jarvis_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second) +jarvis_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second) +jarvis_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second) +jarvis_print_timings: total time = 25431.49 ms ```
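For repeatable throughput numbers, rather than the one-off generation shown in the demo run above, the `jarvis-bench` target defined earlier in this Makefile can be used. This is a minimal sketch, assuming the flags of the former `llama-bench` tool (`-m`, `-p`, `-n`, `-ngl`) carry over unchanged after the rename:

```bash
# Hedged sketch: measure prompt processing (-p) and text generation (-n) speed
# for the same model file used in the demo run above.
./jarvis-bench -m models/jarvis-13b-v2/ggml-model-q4_0.gguf -p 512 -n 128

# With a GPU backend built in, offload all layers (assumes -ngl is unchanged).
./jarvis-bench -m models/jarvis-13b-v2/ggml-model-q4_0.gguf -p 512 -n 128 -ngl 99
```

The tokens-per-second figures it reports should be broadly comparable to the `jarvis_print_timings` eval rates printed in the demo output.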
-Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook +Demo of running both JARVIS-7B and whisper.cpp on a single M1 Pro MacBook -And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook: +And here is another demo of running both JARVIS-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook: https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4 @@ -297,14 +297,14 @@ Here are the end-to-end binary build and model conversion steps for most support Firstly, you need to get the binary. There are different methods that you can follow: - Method 1: Clone this repository and build locally, see [how to build](./docs/build.md) -- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md) +- Method 2: If you are using MacOS or Linux, you can install jarvis.cpp via [brew, flox or nix](./docs/install.md) - Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md) -- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases) +- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/jarvis.cpp/releases) You can run a basic completion using this command: ```bash -llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128 +jarvis-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128 # Output: # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey. @@ -317,7 +317,7 @@ See [this page](./examples/main/README.md) for a full list of parameters. If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter: ```bash -llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv +jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv # Output: # > hi, who are you? @@ -327,26 +327,26 @@ llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv # Easy peasy! The answer to 1+1 is... 2! ``` -By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) +By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. 
See the list of [supported templates](https://github.com/ggerganov/jarvis.cpp/wiki/Templates-supported-by-jarvis_chat_apply_template) ```bash -./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml +./jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml ``` You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters: ```bash -./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:' +./jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:' ``` ### Web server -[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients. +[jarvis.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients. Example usage: ```bash -./llama-server -m your_model.gguf --port 8080 +./jarvis-server -m your_model.gguf --port 8080 # Basic web UI can be accessed via browser: http://localhost:8080 # Chat completion endpoint: http://localhost:8080/v1/chat/completions @@ -357,7 +357,7 @@ Example usage: > [!NOTE] > If you prefer basic usage, please consider using conversation mode instead of interactive mode -In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`. +In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes JARVIS emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`. Here is an example of a few-shot interaction, invoked with the command @@ -369,16 +369,16 @@ Here is an example of a few-shot interaction, invoked with the command ./examples/chat-13B.sh # custom arguments using a 13B model -./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt +./jarvis-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt ``` -Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program. +Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `jarvis-cli` example program. 
![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png) ### Persistent Interaction -The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file. +The prompt, user inputs, and model generations can be saved and resumed across calls to `./jarvis-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file. ```bash # Start a new chat @@ -397,10 +397,10 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \ ### Constrained output with grammars -`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only: +`jarvis.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only: ```bash -./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' +./jarvis-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' ``` The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). @@ -409,7 +409,7 @@ For authoring more complex JSON grammars, you can also check out https://grammar ## Build -Please refer to [Build llama.cpp locally](./docs/build.md) +Please refer to [Build jarvis.cpp locally](./docs/build.md) ## Supported backends @@ -430,12 +430,12 @@ Please refer to [Build llama.cpp locally](./docs/build.md) ### Prepare and Quantize > [!NOTE] -> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours. +> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `jarvis.cpp` main every 6 hours. -To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. +To obtain the official JARVIS 2 weights please see the Obtaining and using the Facebook JARVIS 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. 
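For readers who want the conversion and quantization flow in one place, here is a minimal sketch assuming a locally downloaded Hugging Face model directory; the converter is the `convert_hf_to_gguf.py` script mentioned in the note that follows, the quantize invocation mirrors the `./bin/jarvis-quantize` calls in `ci/run.sh` later in this diff, and all paths are placeholders:

```bash
# Hedged sketch: HF checkpoint directory -> f16 GGUF -> 4-bit quantized GGUF.
# Paths and output names are placeholders; the binary location depends on how you
# built the project (repo root for `make`, build/bin/ for CMake).
python3 convert_hf_to_gguf.py ./models/my-model --outfile ./models/my-model/ggml-model-f16.gguf

# jarvis-quantize takes: <input gguf> <output gguf> <quantization type>
./jarvis-quantize ./models/my-model/ggml-model-f16.gguf ./models/my-model/ggml-model-q4_0.gguf q4_0
```

The resulting `.gguf` file can then be passed to `jarvis-cli` or `jarvis-server` with `-m`, as in the usage examples above.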
-Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives. -It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face. +Note: `convert.py` has been moved to `examples/convert_legacy_jarvis.py` and shouldn't be used for anything other than `Jarvis/Jarvis2/Mistral` models and their derivatives. +It does not support JARVIS 3, you can use `convert_hf_to_gguf.py` with JARVIS 3 downloaded from Hugging Face. To learn more about quantizing model, [read this documentation](./examples/quantize/README.md) @@ -444,17 +444,17 @@ To learn more about quantizing model, [read this documentation](./examples/quant You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better). For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity). -To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md) +To learn more how to measure perplexity using jarvis.cpp, [read this documentation](./examples/perplexity/README.md) ## Contributing - Contributors can open PRs -- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch +- Collaborators can push to branches in the `jarvis.cpp` repo and merge PRs into the `master` branch - Collaborators will be invited based on contributions - Any help with managing issues, PRs and projects is very appreciated! -- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions +- See [good first issues](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information -- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205) +- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/jarvis.cpp/discussions/205) - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532) ## Other documentations @@ -470,14 +470,14 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio - [Running on Docker](./docs/docker.md) - [Build on Android](./docs/android.md) - [Performance troubleshooting](./docs/development/token_generation_performance_tips.md) -- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks) +- [GGML tips & tricks](https://github.com/ggerganov/jarvis.cpp/wiki/GGML-Tips-&-Tricks) **Seminal papers and background on the models** -If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. 
This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: -- LLaMA: - - [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) - - [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) +If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of JARVIS models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between JARVIS models and ChatGPT: +- JARVIS: + - [Introducing JARVIS: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-jarvis-meta-ai/) + - [JARVIS: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) - GPT-3 - [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) - GPT-3.5 / InstructGPT / ChatGPT: diff --git a/SECURITY.md b/SECURITY.md index f4322c6ee4d18..da3cafecc23e6 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,6 +1,6 @@ # Security Policy - - [**Using llama.cpp securely**](#using-llamacpp-securely) + - [**Using jarvis.cpp securely**](#using-jarviscpp-securely) - [Untrusted models](#untrusted-models) - [Untrusted inputs](#untrusted-inputs) - [Data privacy](#data-privacy) @@ -8,7 +8,7 @@ - [Multi-Tenant environments](#multi-tenant-environments) - [**Reporting a vulnerability**](#reporting-a-vulnerability) -## Using llama.cpp securely +## Using jarvis.cpp securely ### Untrusted models Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources. @@ -26,7 +26,7 @@ For maximum security when handling untrusted inputs, you may need to employ the * Sandboxing: Isolate the environment where the inference happens. * Pre-analysis: Check how the model performs by default when exposed to prompt injection (e.g. using [fuzzing for prompt injection](https://github.com/FonduAI/awesome-prompt-injection?tab=readme-ov-file#tools)). This will give you leads on how hard you will have to work on the next topics. -* Updates: Keep both LLaMA C++ and your libraries updated with the latest security patches. +* Updates: Keep both JARVIS C++ and your libraries updated with the latest security patches. * Input Sanitation: Before feeding data to the model, sanitize inputs rigorously. This involves techniques such as: * Validation: Enforce strict rules on allowed characters and data types. * Filtering: Remove potentially malicious scripts or code fragments. @@ -57,11 +57,11 @@ If you intend to run multiple models in parallel with shared memory, it is your ## Reporting a vulnerability -Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++. +Beware that none of the topics under [Using jarvis.cpp securely](#using-jarviscpp-securely) are considered vulnerabilities of JARVIS C++. However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released. 
-Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new). +Please disclose it as a private [security advisory](https://github.com/ggerganov/jarvis.cpp/security/advisories/new). A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure. diff --git a/ci/README.md b/ci/README.md index 4064705190697..a6a39b7901f18 100644 --- a/ci/README.md +++ b/ci/README.md @@ -1,11 +1,11 @@ # CI -In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework: +In addition to [Github Actions](https://github.com/ggerganov/jarvis.cpp/actions) `jarvis.cpp` uses a custom CI framework: https://github.com/ggml-org/ci It monitors the `master` branch for new commits and runs the -[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us +[ci/run.sh](https://github.com/ggerganov/jarvis.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled to cover various hardware architectures, including GPU and Apple Silicon instances. diff --git a/ci/run.sh b/ci/run.sh index dc26d94eed1fd..d4d934e86ac69 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -36,7 +36,7 @@ sd=`dirname $0` cd $sd/../ SRC=`pwd` -CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON" +CMAKE_EXTRA="-DJARVIS_FATAL_WARNINGS=ON" if [ ! -z ${GG_BUILD_METAL} ]; then CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON" @@ -217,7 +217,7 @@ function gg_sum_test_scripts_release { function gg_get_model { local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf" local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf" - local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf" + local gguf_2="$MNT/models/open-jarvis/7B-v2/ggml-model-f16.gguf" if [[ -s $gguf_0 ]]; then echo -n "$gguf_0" elif [[ -s $gguf_1 ]]; then @@ -236,7 +236,7 @@ function gg_run_ctest_with_model_debug { local model; model=$(gg_get_model) cd build-ci-debug set -e - (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log + (JARVISCPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log set +e cd .. } @@ -247,7 +247,7 @@ function gg_run_ctest_with_model_release { local model; model=$(gg_get_model) cd build-ci-release set -e - (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log + (JARVISCPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log set +e cd .. 
} @@ -272,24 +272,24 @@ function gg_sum_ctest_with_model_release { gg_printf '```\n' } -# open_llama_7b_v2 +# open_jarvis_7b_v2 -function gg_run_open_llama_7b_v2 { +function gg_run_open_jarvis_7b_v2 { cd ${SRC} - gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json - gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model - gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json - gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json - gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json - gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin - gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin - gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json + gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/config.json + gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/tokenizer.model + gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/tokenizer_config.json + gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/special_tokens_map.json + gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/pytorch_model.bin.index.json + gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin + gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin + gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/generation_config.json gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ - path_models="../models-mnt/open-llama/7B-v2" + path_models="../models-mnt/open-jarvis/7B-v2" path_wiki="../models-mnt/wikitext/wikitext-2-raw" rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release @@ -299,7 +299,7 @@ function gg_run_open_llama_7b_v2 { (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. 
) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log - python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf + python3 ../examples/convert_legacy_jarvis.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf model_f16="${path_models}/ggml-model-f16.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf" @@ -315,47 +315,47 @@ function gg_run_open_llama_7b_v2 { wiki_test="${path_wiki}/wiki.test.raw" - ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 - ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0 - ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1 - ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0 - ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1 - ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k - ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k - ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k - ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k - ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k - - (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 
--chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - - (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + ./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k + ./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k + ./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k + ./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k + ./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k + + (time ./bin/jarvis-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/jarvis-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/jarvis-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/jarvis-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/jarvis-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/jarvis-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/jarvis-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/jarvis-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/jarvis-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of 
life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/jarvis-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/jarvis-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + + (time ./bin/jarvis-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/jarvis-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/jarvis-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/jarvis-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -387,10 +387,10 @@ function gg_run_open_llama_7b_v2 { set +e } -function gg_sum_open_llama_7b_v2 { +function gg_sum_open_jarvis_7b_v2 { gg_printf '### %s\n\n' "${ci}" - gg_printf 'OpenLLaMA 7B-v2:\n' + gg_printf 'OpenJARVIS 7B-v2:\n' gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" @@ -449,45 +449,45 @@ function gg_run_pythia_1_4b { wiki_test_60="${path_wiki}/wiki.test-60.raw" - ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 - ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0 - ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1 - ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0 - ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1 - ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k - ./bin/llama-quantize ${model_f16} 
${model_q3_k} q3_k - ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k - ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k - ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k - - (time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - - (time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a 
$OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + ./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k + ./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k + ./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k + ./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k + ./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k + + (time ./bin/jarvis-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/jarvis-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/jarvis-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/jarvis-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/jarvis-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/jarvis-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/jarvis-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/jarvis-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/jarvis-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/jarvis-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/jarvis-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 
) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + + (time ./bin/jarvis-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/jarvis-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -580,47 +580,47 @@ function gg_run_pythia_2_8b { wiki_test="${path_wiki}/wiki.test.raw" - ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 - ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0 - ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1 - ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0 - ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1 - ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k - ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k - ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k - ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k - ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k - - (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 
4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - - (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + ./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k + ./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k + ./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k + ./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k + ./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k + + (time ./bin/jarvis-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/jarvis-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/jarvis-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/jarvis-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/jarvis-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/jarvis-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/jarvis-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log 
+ (time ./bin/jarvis-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/jarvis-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/jarvis-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/jarvis-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + + (time ./bin/jarvis-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/jarvis-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/jarvis-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/jarvis-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -704,10 +704,10 @@ function gg_run_embd_bge_small { model_f16="${path_models}/ggml-model-f16.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf" - ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0 - (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/jarvis-embedding --model ${model_f16} -p "I believe the meaning of life is" 
) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/jarvis-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log set +e } @@ -752,7 +752,7 @@ function gg_run_rerank_tiny { model_f16="${path_models}/ggml-model-f16.gguf" # for this model, the SEP token is "" - (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?hi\nwhat is panda?it's a bear\nwhat is panda?The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log + (time ./bin/jarvis-embedding --model ${model_f16} -p "what is panda?hi\nwhat is panda?it's a bear\nwhat is panda?The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log # sample output # rerank score 0: 0.029 @@ -804,11 +804,11 @@ function gg_check_build_requirements { ## main -export LLAMA_LOG_PREFIX=1 -export LLAMA_LOG_TIMESTAMPS=1 +export JARVIS_LOG_PREFIX=1 +export JARVIS_LOG_TIMESTAMPS=1 if [ -z ${GG_BUILD_LOW_PERF} ]; then - # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt + # Create symlink: ./jarvis.cpp/models-mnt -> $MNT/models/models-mnt rm -rf ${SRC}/models-mnt mnt_models=${MNT}/models mkdir -p ${mnt_models} @@ -841,7 +841,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then test $ret -eq 0 && gg_run pythia_1_4b else test $ret -eq 0 && gg_run pythia_2_8b - #test $ret -eq 0 && gg_run open_llama_7b_v2 + #test $ret -eq 0 && gg_run open_jarvis_7b_v2 fi test $ret -eq 0 && gg_run ctest_with_model_debug test $ret -eq 0 && gg_run ctest_with_model_release diff --git a/cmake/llama-config.cmake.in b/cmake/jarvis-config.cmake.in similarity index 61% rename from cmake/llama-config.cmake.in rename to cmake/jarvis-config.cmake.in index f072b76a39d2e..a64ac57a49a54 100644 --- a/cmake/llama-config.cmake.in +++ b/cmake/jarvis-config.cmake.in @@ -1,7 +1,7 @@ -set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@) -set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@) -set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) -set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) +set(JARVIS_VERSION @JARVIS_INSTALL_VERSION@) +set(JARVIS_BUILD_COMMIT @JARVIS_BUILD_COMMIT@) +set(JARVIS_BUILD_NUMBER @JARVIS_BUILD_NUMBER@) +set(JARVIS_SHARED_LIB @BUILD_SHARED_LIBS@) set(GGML_BLAS @GGML_BLAS@) set(GGML_CUDA @GGML_CUDA@) @@ -18,9 +18,9 @@ set(GGML_OPENMP @GGML_OPENMP@) @PACKAGE_INIT@ -set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@") -set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@") -set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@") +set_and_check(JARVIS_INCLUDE_DIR "@PACKAGE_JARVIS_INCLUDE_INSTALL_DIR@") +set_and_check(JARVIS_LIB_DIR "@PACKAGE_JARVIS_LIB_INSTALL_DIR@") +set_and_check(JARVIS_BIN_DIR "@PACKAGE_JARVIS_BIN_INSTALL_DIR@") # Ensure transient dependencies satisfied @@ -66,25 +66,25 @@ endif() find_library(ggml_LIBRARY ggml REQUIRED - HINTS ${LLAMA_LIB_DIR}) + HINTS ${JARVIS_LIB_DIR}) -find_library(llama_LIBRARY llama +find_library(jarvis_LIBRARY jarvis REQUIRED - HINTS ${LLAMA_LIB_DIR}) + HINTS ${JARVIS_LIB_DIR}) -set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@") -set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@") +set(_jarvis_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@") +set(_jarvis_transient_defines "@GGML_TRANSIENT_DEFINES@") -add_library(llama UNKNOWN IMPORTED) 
+add_library(jarvis UNKNOWN IMPORTED) -set_target_properties(llama +set_target_properties(jarvis PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}" - INTERFACE_LINK_LIBRARIES "${_llama_link_deps}" - INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}" + INTERFACE_INCLUDE_DIRECTORIES "${JARVIS_INCLUDE_DIR}" + INTERFACE_LINK_LIBRARIES "${_jarvis_link_deps}" + INTERFACE_COMPILE_DEFINITIONS "${_jarvis_transient_defines}" IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" - IMPORTED_LOCATION "${llama_LIBRARY}" + IMPORTED_LOCATION "${jarvis_LIBRARY}" INTERFACE_COMPILE_FEATURES cxx_std_11 POSITION_INDEPENDENT_CODE ON ) -check_required_components(Llama) +check_required_components(Jarvis) diff --git a/cmake/llama.pc.in b/cmake/jarvis.pc.in similarity index 62% rename from cmake/llama.pc.in rename to cmake/jarvis.pc.in index 326acbb6108fd..f61f10f3ac073 100644 --- a/cmake/llama.pc.in +++ b/cmake/jarvis.pc.in @@ -3,8 +3,8 @@ exec_prefix=${prefix} libdir=${exec_prefix}/lib includedir=${prefix}/include -Name: llama -Description: Port of Facebook's LLaMA model in C/C++ +Name: jarvis +Description: Port of Facebook's JARVIS model in C/C++ Version: @PROJECT_VERSION@ -Libs: -L${libdir} -lllama +Libs: -L${libdir} -ljarvis Cflags: -I${includedir} diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 042e895add5e2..cfaa05b33ab72 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -74,17 +74,17 @@ if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() -set(LLAMA_COMMON_EXTRA_LIBS build_info) +set(JARVIS_COMMON_EXTRA_LIBS build_info) # Use curl to download model url -if (LLAMA_CURL) +if (JARVIS_CURL) find_package(CURL REQUIRED) - add_definitions(-DLLAMA_USE_CURL) + add_definitions(-DJARVIS_USE_CURL) include_directories(${CURL_INCLUDE_DIRS}) find_library(CURL_LIBRARY curl REQUIRED) - set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY}) + set(JARVIS_COMMON_EXTRA_LIBS ${JARVIS_COMMON_EXTRA_LIBS} ${CURL_LIBRARY}) endif () target_include_directories(${TARGET} PUBLIC .) 
target_compile_features (${TARGET} PUBLIC cxx_std_11) -target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) +target_link_libraries (${TARGET} PRIVATE ${JARVIS_COMMON_EXTRA_LIBS} PUBLIC jarvis Threads::Threads) diff --git a/common/arg.cpp b/common/arg.cpp index e1e933934f0ef..73a3542593ca2 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -17,7 +17,7 @@ using json = nlohmann::ordered_json; -common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) { +common_arg & common_arg::set_examples(std::initializer_list<enum jarvis_example> examples) { this->examples = std::move(examples); return *this; } @@ -33,7 +33,7 @@ common_arg & common_arg::set_sparam() { return *this; } -bool common_arg::in_example(enum llama_example ex) { +bool common_arg::in_example(enum jarvis_example ex) { return examples.find(ex) != examples.end(); } @@ -279,7 +279,7 @@ static void common_params_print_usage(common_params_context & ctx_arg) { std::vector<common_arg *> sparam_options; std::vector<common_arg *> specific_options; for (auto & opt : ctx_arg.options) { - // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example + // in case multiple JARVIS_EXAMPLE_* are set, we prioritize the JARVIS_EXAMPLE_* matching current example if (opt.is_sparam) { sparam_options.push_back(&opt); } else if (opt.in_example(ctx_arg.ex)) { @@ -292,12 +292,12 @@ static void common_params_print_usage(common_params_context & ctx_arg) { print_options(common_options); printf("\n\n----- sampling params -----\n\n"); print_options(sparam_options); - // TODO: maybe convert enum llama_example to string + // TODO: maybe convert enum jarvis_example to string printf("\n\n----- example-specific params -----\n\n"); print_options(specific_options); } -bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) { +bool common_params_parse(int argc, char ** argv, common_params & params, jarvis_example ex, void(*print_usage)(int, char **)) { auto ctx_arg = common_params_parser_init(params, ex, print_usage); const common_params params_org = ctx_arg.params; // the example can modify the default params @@ -322,7 +322,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e return true; } -common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) { +common_params_context common_params_parser_init(common_params & params, jarvis_example ex, void(*print_usage)(int, char **)) { common_params_context ctx_arg(params); ctx_arg.print_usage = print_usage; ctx_arg.ex = ex; @@ -339,12 +339,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex /** * filter options by example * rules: - * - all examples inherit options from LLAMA_EXAMPLE_COMMON - * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example - * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example + * - all examples inherit options from JARVIS_EXAMPLE_COMMON + * - if JARVIS_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example + * - if both {JARVIS_EXAMPLE_COMMON, JARVIS_EXAMPLE_*,} are set, we will prioritize the JARVIS_EXAMPLE_* matching current example */ auto add_opt = [&](common_arg arg) { - if (arg.in_example(ex) ||
arg.in_example(JARVIS_EXAMPLE_COMMON)) { ctx_arg.options.push_back(std::move(arg)); } }; @@ -361,8 +361,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--version"}, "show version and build info", [](common_params &) { - fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + fprintf(stderr, "version: %d (%s)\n", JARVIS_BUILD_NUMBER, JARVIS_COMMIT); + fprintf(stderr, "built with %s for %s\n", JARVIS_COMPILER, JARVIS_BUILD_TARGET); exit(0); } )); @@ -379,14 +379,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.display_prompt = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"-co", "--color"}, string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), [](common_params & params) { params.use_color = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); + ).set_examples({JARVIS_EXAMPLE_MAIN, JARVIS_EXAMPLE_INFILL, JARVIS_EXAMPLE_SPECULATIVE, JARVIS_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-t", "--threads"}, "N", string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), @@ -396,7 +396,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.cpuparams.n_threads = std::thread::hardware_concurrency(); } } - ).set_env("LLAMA_ARG_THREADS")); + ).set_env("JARVIS_ARG_THREADS")); add_opt(common_arg( {"-tb", "--threads-batch"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads)", @@ -416,7 +416,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-tbd", "--threads-batch-draft"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)", @@ -426,7 +426,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-C", "--cpu-mask"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", @@ -524,7 +524,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex throw std::invalid_argument("invalid cpumask"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-Crd", "--cpu-range-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", @@ -534,14 +534,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex throw std::invalid_argument("invalid range"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--cpu-strict-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: same as --cpu-strict)", [](common_params & params, int value) { params.draft_cpuparams.strict_cpu = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--prio-draft"}, "N", string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), @@ -551,14 +551,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } params.draft_cpuparams.priority = (enum ggml_sched_priority) prio; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--poll-draft"}, "<0|1>", "Use polling to wait for draft model work (default: same as --poll])", [](common_params & params, int value) { params.draft_cpuparams.poll = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-Cbd", "--cpu-mask-batch-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", @@ -568,7 +568,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex throw std::invalid_argument("invalid cpumask"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)", @@ -578,14 +578,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex throw std::invalid_argument("invalid cpumask"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--cpu-strict-batch-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: --cpu-strict-draft)", [](common_params & params, int value) { params.draft_cpuparams_batch.strict_cpu = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--prio-batch-draft"}, "N", string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), @@ -595,70 +595,70 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--poll-batch-draft"}, "<0|1>", "Use polling to wait for draft model work (default: --poll-draft)", [](common_params & params, int value) { params.draft_cpuparams_batch.poll = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--draft"}, "N", string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), [](common_params & params, int value) { params.n_draft = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE, JARVIS_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-ps", "--p-split"}, "N", string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split), [](common_params & params, const std::string & value) { params.p_split = std::stof(value); } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-lcs", "--lookup-cache-static"}, "FNAME", "path to static lookup cache to use for lookup decoding (not updated by generation)", [](common_params & params, const std::string & value) { params.lookup_cache_static = value; } - ).set_examples({LLAMA_EXAMPLE_LOOKUP})); + ).set_examples({JARVIS_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-lcd", "--lookup-cache-dynamic"}, "FNAME", "path to dynamic lookup cache to use for lookup decoding (updated by generation)", [](common_params & params, const std::string & value) { params.lookup_cache_dynamic = value; } - ).set_examples({LLAMA_EXAMPLE_LOOKUP})); + ).set_examples({JARVIS_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-c", "--ctx-size"}, "N", string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), [](common_params & params, int value) { params.n_ctx = value; } - ).set_env("LLAMA_ARG_CTX_SIZE")); + ).set_env("JARVIS_ARG_CTX_SIZE")); add_opt(common_arg( {"-n", "--predict", "--n-predict"}, "N", string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), [](common_params & params, int value) { params.n_predict = value; } - ).set_env("LLAMA_ARG_N_PREDICT")); + ).set_env("JARVIS_ARG_N_PREDICT")); add_opt(common_arg( {"-b", "--batch-size"}, "N", string_format("logical maximum batch size (default: %d)", params.n_batch), [](common_params & params, int value) { params.n_batch = value; } - ).set_env("LLAMA_ARG_BATCH")); 
+ ).set_env("JARVIS_ARG_BATCH")); add_opt(common_arg( {"-ub", "--ubatch-size"}, "N", string_format("physical maximum batch size (default: %d)", params.n_ubatch), [](common_params & params, int value) { params.n_ubatch = value; } - ).set_env("LLAMA_ARG_UBATCH")); + ).set_env("JARVIS_ARG_UBATCH")); add_opt(common_arg( {"--keep"}, "N", string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), @@ -672,24 +672,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.ctx_shift = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); + ).set_examples({JARVIS_EXAMPLE_MAIN, JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_NO_CONTEXT_SHIFT")); add_opt(common_arg( {"--chunks"}, "N", string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), [](common_params & params, int value) { params.n_chunks = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); + ).set_examples({JARVIS_EXAMPLE_IMATRIX, JARVIS_EXAMPLE_PERPLEXITY, JARVIS_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"-fa", "--flash-attn"}, string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), [](common_params & params) { params.flash_attn = true; } - ).set_env("LLAMA_ARG_FLASH_ATTN")); + ).set_env("JARVIS_ARG_FLASH_ATTN")); add_opt(common_arg( {"-p", "--prompt"}, "PROMPT", - ex == LLAMA_EXAMPLE_MAIN + ex == JARVIS_EXAMPLE_MAIN ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" : "prompt to start generation with", [](common_params & params, const std::string & value) { @@ -698,12 +698,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"--no-perf"}, - string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), + string_format("disable internal libjarvis performance timings (default: %s)", params.no_perf ? 
"true" : "false"), [](common_params & params) { params.no_perf = true; params.sparams.no_perf = true; } - ).set_env("LLAMA_ARG_NO_PERF")); + ).set_env("JARVIS_ARG_NO_PERF")); add_opt(common_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", @@ -730,7 +730,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } params.in_files.push_back(value); } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + ).set_examples({JARVIS_EXAMPLE_IMATRIX})); add_opt(common_arg( {"-bf", "--binary-file"}, "FNAME", "binary file containing the prompt (default: none)", @@ -767,42 +767,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.n_print = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache"}, "FNAME", "file to cache prompt state for faster startup (default: none)", [](common_params & params, const std::string & value) { params.path_prompt_cache = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache-all"}, "if specified, saves user input and generations to cache as well\n", [](common_params & params) { params.prompt_cache_all = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache-ro"}, "if specified, uses the prompt cache but does not update it", [](common_params & params) { params.prompt_cache_ro = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"-r", "--reverse-prompt"}, "PROMPT", "halt generation at PROMPT, return control in interactive mode\n", [](common_params & params, const std::string & value) { params.antiprompt.emplace_back(value); } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"-sp", "--special"}, string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), [](common_params & params) { params.special = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); + ).set_examples({JARVIS_EXAMPLE_MAIN, JARVIS_EXAMPLE_SERVER})); add_opt(common_arg( {"-cnv", "--conversation"}, string_format( @@ -815,28 +815,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.conversation = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"-i", "--interactive"}, string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), [](common_params & params) { params.interactive = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"-if", "--interactive-first"}, string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? 
"true" : "false"), [](common_params & params) { params.interactive_first = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"-mli", "--multiline-input"}, "allows you to write or paste multiple lines without ending each in '\\'", [](common_params & params) { params.multiline_input = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"--in-prefix-bos"}, "prefix BOS to user inputs, preceding the `--in-prefix` string", @@ -844,7 +844,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.input_prefix_bos = true; params.enable_chat_template = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"--in-prefix"}, "STRING", "string to prefix user inputs with (default: empty)", @@ -852,7 +852,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.input_prefix = value; params.enable_chat_template = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + ).set_examples({JARVIS_EXAMPLE_MAIN, JARVIS_EXAMPLE_INFILL})); add_opt(common_arg( {"--in-suffix"}, "STRING", "string to suffix after user inputs with (default: empty)", @@ -860,14 +860,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.input_suffix = value; params.enable_chat_template = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + ).set_examples({JARVIS_EXAMPLE_MAIN, JARVIS_EXAMPLE_INFILL})); add_opt(common_arg( {"--no-warmup"}, "skip warming up the model with an empty run", [](common_params & params) { params.warmup = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"--spm-infill"}, string_format( @@ -877,7 +877,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.spm_infill = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); + ).set_examples({JARVIS_EXAMPLE_SERVER, JARVIS_EXAMPLE_INFILL})); add_opt(common_arg( {"--samplers"}, "SAMPLERS", string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), @@ -888,7 +888,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"-s", "--seed"}, "SEED", - string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED), + string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, JARVIS_DEFAULT_SEED), [](common_params & params, const std::string & value) { params.sparams.seed = std::stoul(value); } @@ -1101,7 +1101,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", [](common_params & params, const std::string & value) { std::stringstream ss(value); - llama_token key; + jarvis_token key; char sign; std::string value_str; try { @@ -1149,103 +1149,103 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--pooling"}, "{none,mean,cls,last,rank}", "pooling type for embeddings, use model default if unspecified", [](common_params & params, const std::string & value) { - /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } - else if (value == "mean") { 
params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } - else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } - else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; } + /**/ if (value == "none") { params.pooling_type = JARVIS_POOLING_TYPE_NONE; } + else if (value == "mean") { params.pooling_type = JARVIS_POOLING_TYPE_MEAN; } + else if (value == "cls") { params.pooling_type = JARVIS_POOLING_TYPE_CLS; } + else if (value == "last") { params.pooling_type = JARVIS_POOLING_TYPE_LAST; } + else if (value == "rank") { params.pooling_type = JARVIS_POOLING_TYPE_RANK; } else { throw std::invalid_argument("invalid value"); } } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING")); + ).set_examples({JARVIS_EXAMPLE_EMBEDDING, JARVIS_EXAMPLE_RETRIEVAL, JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_POOLING")); add_opt(common_arg( {"--attention"}, "{causal,non-causal}", "attention type for embeddings, use model default if unspecified", [](common_params & params, const std::string & value) { - /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } - else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } + /**/ if (value == "causal") { params.attention_type = JARVIS_ATTENTION_TYPE_CAUSAL; } + else if (value == "non-causal") { params.attention_type = JARVIS_ATTENTION_TYPE_NON_CAUSAL; } else { throw std::invalid_argument("invalid value"); } } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + ).set_examples({JARVIS_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--rope-scaling"}, "{none,linear,yarn}", "RoPE frequency scaling method, defaults to linear unless specified by the model", [](common_params & params, const std::string & value) { - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = JARVIS_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = JARVIS_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = JARVIS_ROPE_SCALING_TYPE_YARN; } else { throw std::invalid_argument("invalid value"); } } - ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE")); + ).set_env("JARVIS_ARG_ROPE_SCALING_TYPE")); add_opt(common_arg( {"--rope-scale"}, "N", "RoPE context scaling factor, expands context by a factor of N", [](common_params & params, const std::string & value) { params.rope_freq_scale = 1.0f / std::stof(value); } - ).set_env("LLAMA_ARG_ROPE_SCALE")); + ).set_env("JARVIS_ARG_ROPE_SCALE")); add_opt(common_arg( {"--rope-freq-base"}, "N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", [](common_params & params, const std::string & value) { params.rope_freq_base = std::stof(value); } - ).set_env("LLAMA_ARG_ROPE_FREQ_BASE")); + ).set_env("JARVIS_ARG_ROPE_FREQ_BASE")); add_opt(common_arg( {"--rope-freq-scale"}, "N", "RoPE frequency scaling factor, expands context by a factor of 1/N", [](common_params & params, const std::string & value) { params.rope_freq_scale = std::stof(value); } - ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE")); + ).set_env("JARVIS_ARG_ROPE_FREQ_SCALE")); add_opt(common_arg( {"--yarn-orig-ctx"}, "N", 
string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), [](common_params & params, int value) { params.yarn_orig_ctx = value; } - ).set_env("LLAMA_ARG_YARN_ORIG_CTX")); + ).set_env("JARVIS_ARG_YARN_ORIG_CTX")); add_opt(common_arg( {"--yarn-ext-factor"}, "N", string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), [](common_params & params, const std::string & value) { params.yarn_ext_factor = std::stof(value); } - ).set_env("LLAMA_ARG_YARN_EXT_FACTOR")); + ).set_env("JARVIS_ARG_YARN_EXT_FACTOR")); add_opt(common_arg( {"--yarn-attn-factor"}, "N", string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), [](common_params & params, const std::string & value) { params.yarn_attn_factor = std::stof(value); } - ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR")); + ).set_env("JARVIS_ARG_YARN_ATTN_FACTOR")); add_opt(common_arg( {"--yarn-beta-slow"}, "N", string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), [](common_params & params, const std::string & value) { params.yarn_beta_slow = std::stof(value); } - ).set_env("LLAMA_ARG_YARN_BETA_SLOW")); + ).set_env("JARVIS_ARG_YARN_BETA_SLOW")); add_opt(common_arg( {"--yarn-beta-fast"}, "N", string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), [](common_params & params, const std::string & value) { params.yarn_beta_fast = std::stof(value); } - ).set_env("LLAMA_ARG_YARN_BETA_FAST")); + ).set_env("JARVIS_ARG_YARN_BETA_FAST")); add_opt(common_arg( {"-gan", "--grp-attn-n"}, "N", string_format("group-attention factor (default: %d)", params.grp_attn_n), [](common_params & params, int value) { params.grp_attn_n = value; } - ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY})); + ).set_env("JARVIS_ARG_GRP_ATTN_N").set_examples({JARVIS_EXAMPLE_MAIN, JARVIS_EXAMPLE_PASSKEY})); add_opt(common_arg( {"-gaw", "--grp-attn-w"}, "N", string_format("group-attention width (default: %d)", params.grp_attn_w), [](common_params & params, int value) { params.grp_attn_w = value; } - ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_env("JARVIS_ARG_GRP_ATTN_W").set_examples({JARVIS_EXAMPLE_MAIN})); add_opt(common_arg( {"-dkvc", "--dump-kv-cache"}, "verbose print of the KV cache", @@ -1259,7 +1259,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.no_kv_offload = true; } - ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); + ).set_env("JARVIS_ARG_NO_KV_OFFLOAD")); add_opt(common_arg( {"-ctk", "--cache-type-k"}, "TYPE", string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), @@ -1267,7 +1267,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex // TODO: get the type right here params.cache_type_k = value; } - ).set_env("LLAMA_ARG_CACHE_TYPE_K")); + ).set_env("JARVIS_ARG_CACHE_TYPE_K")); add_opt(common_arg( {"-ctv", "--cache-type-v"}, "TYPE", string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), @@ -1275,141 +1275,141 @@ common_params_context common_params_parser_init(common_params & params, llama_ex // TODO: get the type right here params.cache_type_v = value; } - ).set_env("LLAMA_ARG_CACHE_TYPE_V")); + ).set_env("JARVIS_ARG_CACHE_TYPE_V")); add_opt(common_arg( {"--perplexity", "--all-logits"}, 
string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), [](common_params & params) { params.logits_all = true; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--hellaswag"}, "compute HellaSwag score over random tasks from datafile supplied with -f", [](common_params & params) { params.hellaswag = true; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--hellaswag-tasks"}, "N", string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), [](common_params & params, int value) { params.hellaswag_tasks = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--winogrande"}, "compute Winogrande score over random tasks from datafile supplied with -f", [](common_params & params) { params.winogrande = true; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--winogrande-tasks"}, "N", string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), [](common_params & params, int value) { params.winogrande_tasks = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--multiple-choice"}, "compute multiple choice score over random tasks from datafile supplied with -f", [](common_params & params) { params.multiple_choice = true; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--multiple-choice-tasks"}, "N", string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), [](common_params & params, int value) { params.multiple_choice_tasks = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--kl-divergence"}, "computes KL-divergence to logits provided via --kl-divergence-base", [](common_params & params) { params.kl_divergence = true; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--save-all-logits", "--kl-divergence-base"}, "FNAME", "set logits file", [](common_params & params, const std::string & value) { params.logits_file = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--ppl-stride"}, "N", string_format("stride for perplexity calculation (default: %d)", params.ppl_stride), [](common_params & params, int value) { params.ppl_stride = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--ppl-output-type"}, "<0|1>", string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type), [](common_params & params, int value) { params.ppl_output_type = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({JARVIS_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"-dt", "--defrag-thold"}, "N", string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), [](common_params & params, const std::string & value) { params.defrag_thold = std::stof(value); } - ).set_env("LLAMA_ARG_DEFRAG_THOLD")); 
+ ).set_env("JARVIS_ARG_DEFRAG_THOLD")); add_opt(common_arg( {"-np", "--parallel"}, "N", string_format("number of parallel sequences to decode (default: %d)", params.n_parallel), [](common_params & params, int value) { params.n_parallel = value; } - ).set_env("LLAMA_ARG_N_PARALLEL")); + ).set_env("JARVIS_ARG_N_PARALLEL")); add_opt(common_arg( {"-ns", "--sequences"}, "N", string_format("number of sequences to decode (default: %d)", params.n_sequences), [](common_params & params, int value) { params.n_sequences = value; } - ).set_examples({LLAMA_EXAMPLE_PARALLEL})); + ).set_examples({JARVIS_EXAMPLE_PARALLEL})); add_opt(common_arg( {"-cb", "--cont-batching"}, string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), [](common_params & params) { params.cont_batching = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_CONT_BATCHING")); add_opt(common_arg( {"-nocb", "--no-cont-batching"}, "disable continuous batching", [](common_params & params) { params.cont_batching = false; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_NO_CONT_BATCHING")); add_opt(common_arg( {"--mmproj"}, "FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md", [](common_params & params, const std::string & value) { params.mmproj = value; } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); + ).set_examples({JARVIS_EXAMPLE_LLAVA})); add_opt(common_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching", [](common_params & params, const std::string & value) { params.image.emplace_back(value); } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); - if (llama_supports_rpc()) { + ).set_examples({JARVIS_EXAMPLE_LLAVA})); + if (jarvis_supports_rpc()) { add_opt(common_arg( {"--rpc"}, "SERVERS", "comma separated list of RPC servers", [](common_params & params, const std::string & value) { params.rpc_servers = value; } - ).set_env("LLAMA_ARG_RPC")); + ).set_env("JARVIS_ARG_RPC")); } add_opt(common_arg( {"--mlock"}, @@ -1417,14 +1417,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.use_mlock = true; } - ).set_env("LLAMA_ARG_MLOCK")); + ).set_env("JARVIS_ARG_MLOCK")); add_opt(common_arg( {"--no-mmap"}, "do not memory-map model (slower load but may reduce pageouts if not using mlock)", [](common_params & params) { params.use_mmap = false; } - ).set_env("LLAMA_ARG_NO_MMAP")); + ).set_env("JARVIS_ARG_NO_MMAP")); add_opt(common_arg( {"--numa"}, "TYPE", "attempt optimizations that help on some NUMA systems\n" @@ -1432,36 +1432,36 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "- isolate: only spawn threads on CPUs on the node that execution started on\n" "- numactl: use the CPU map provided by numactl\n" "if run without this previously, it is recommended to drop the system page cache before using this\n" - "see https://github.com/ggerganov/llama.cpp/issues/1437", + "see https://github.com/ggerganov/jarvis.cpp/issues/1437", [](common_params & params, const std::string & value) { /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } else if (value == "numactl") { params.numa = 
GGML_NUMA_STRATEGY_NUMACTL; } else { throw std::invalid_argument("invalid value"); } } - ).set_env("LLAMA_ARG_NUMA")); + ).set_env("JARVIS_ARG_NUMA")); add_opt(common_arg( {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", "number of layers to store in VRAM", [](common_params & params, int value) { params.n_gpu_layers = value; - if (!llama_supports_gpu_offload()) { + if (!jarvis_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } } - ).set_env("LLAMA_ARG_N_GPU_LAYERS")); + ).set_env("JARVIS_ARG_N_GPU_LAYERS")); add_opt(common_arg( {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", [](common_params & params, int value) { params.n_gpu_layers_draft = value; - if (!llama_supports_gpu_offload()) { + if (!jarvis_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-sm", "--split-mode"}, "{none,layer,row}", "how to split the model across multiple GPUs, one of:\n" @@ -1471,23 +1471,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { std::string arg_next = value; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; + params.split_mode = JARVIS_SPLIT_MODE_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; + params.split_mode = JARVIS_SPLIT_MODE_LAYER; } else if (arg_next == "row") { #ifdef GGML_USE_SYCL - fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); + fprintf(stderr, "warning: The split mode value:[row] is not supported by jarvis.cpp with SYCL. It's developing.\nExit!\n"); exit(1); #endif // GGML_USE_SYCL - params.split_mode = LLAMA_SPLIT_MODE_ROW; + params.split_mode = JARVIS_SPLIT_MODE_ROW; } else { throw std::invalid_argument("invalid value"); } - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n"); + if (!jarvis_supports_gpu_offload()) { + fprintf(stderr, "warning: jarvis.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n"); } } - ).set_env("LLAMA_ARG_SPLIT_MODE")); + ).set_env("JARVIS_ARG_SPLIT_MODE")); add_opt(common_arg( {"-ts", "--tensor-split"}, "N0,N1,N2,...", "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 
3,1", @@ -1498,33 +1498,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex const std::regex regex{ R"([,/]+)" }; std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; std::vector split_arg{ it, {} }; - if (split_arg.size() >= llama_max_devices()) { + if (split_arg.size() >= jarvis_max_devices()) { throw std::invalid_argument( - string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) + string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)jarvis_max_devices()) ); } - for (size_t i = 0; i < llama_max_devices(); ++i) { + for (size_t i = 0; i < jarvis_max_devices(); ++i) { if (i < split_arg.size()) { params.tensor_split[i] = std::stof(split_arg[i]); } else { params.tensor_split[i] = 0.0f; } } - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n"); + if (!jarvis_supports_gpu_offload()) { + fprintf(stderr, "warning: jarvis.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n"); } } - ).set_env("LLAMA_ARG_TENSOR_SPLIT")); + ).set_env("JARVIS_ARG_TENSOR_SPLIT")); add_opt(common_arg( {"-mg", "--main-gpu"}, "INDEX", string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), [](common_params & params, int value) { params.main_gpu = value; - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n"); + if (!jarvis_supports_gpu_offload()) { + fprintf(stderr, "warning: jarvis.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n"); } } - ).set_env("LLAMA_ARG_MAIN_GPU")); + ).set_env("JARVIS_ARG_MAIN_GPU")); add_opt(common_arg( {"--check-tensors"}, string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? 
"true" : "false"), @@ -1549,7 +1549,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.lora_adapters.push_back({ std::string(value), 1.0 }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + ).set_examples({JARVIS_EXAMPLE_COMMON, JARVIS_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"--lora-scaled"}, "FNAME", "SCALE", "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", @@ -1557,7 +1557,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.lora_adapters.push_back({ fname, std::stof(scale) }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + ).set_examples({JARVIS_EXAMPLE_COMMON, JARVIS_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"--control-vector"}, "FNAME", "add a control vector\nnote: this argument can be repeated to add multiple control vectors", @@ -1587,10 +1587,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.model_alias = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_ALIAS")); add_opt(common_arg( {"-m", "--model"}, "FNAME", - ex == LLAMA_EXAMPLE_EXPORT_LORA + ex == JARVIS_EXAMPLE_EXPORT_LORA ? std::string("model path from which to load base model") : string_format( "model path (default: `models/$filename` with filename from `--hf-file` " @@ -1599,35 +1599,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.model = value; } - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); + ).set_examples({JARVIS_EXAMPLE_COMMON, JARVIS_EXAMPLE_EXPORT_LORA}).set_env("JARVIS_ARG_MODEL")); add_opt(common_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", [](common_params & params, const std::string & value) { params.model_draft = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({JARVIS_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", [](common_params & params, const std::string & value) { params.model_url = value; } - ).set_env("LLAMA_ARG_MODEL_URL")); + ).set_env("JARVIS_ARG_MODEL_URL")); add_opt(common_arg( {"-hfr", "--hf-repo"}, "REPO", "Hugging Face model repository (default: unused)", [](common_params & params, const std::string & value) { params.hf_repo = value; } - ).set_env("LLAMA_ARG_HF_REPO")); + ).set_env("JARVIS_ARG_HF_REPO")); add_opt(common_arg( {"-hff", "--hf-file"}, "FILE", "Hugging Face model file (default: unused)", [](common_params & params, const std::string & value) { params.hf_file = value; } - ).set_env("LLAMA_ARG_HF_FILE")); + ).set_env("JARVIS_ARG_HF_FILE")); add_opt(common_arg( {"-hft", "--hf-token"}, "TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)", @@ -1645,41 +1645,41 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } params.context_files.push_back(value); } - 
).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + ).set_examples({JARVIS_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--chunk-size"}, "N", string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size), [](common_params & params, int value) { params.chunk_size = value; } - ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + ).set_examples({JARVIS_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--chunk-separator"}, "STRING", string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), [](common_params & params, const std::string & value) { params.chunk_separator = value; } - ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + ).set_examples({JARVIS_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--junk"}, "N", string_format("number of times to repeat the junk text (default: %d)", params.n_junk), [](common_params & params, int value) { params.n_junk = value; } - ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + ).set_examples({JARVIS_EXAMPLE_PASSKEY})); add_opt(common_arg( {"--pos"}, "N", string_format("position of the passkey in the junk text (default: %d)", params.i_pos), [](common_params & params, int value) { params.i_pos = value; } - ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + ).set_examples({JARVIS_EXAMPLE_PASSKEY})); add_opt(common_arg( {"-o", "--output", "--output-file"}, "FNAME", string_format("output file (default: '%s')", - ex == LLAMA_EXAMPLE_EXPORT_LORA + ex == JARVIS_EXAMPLE_EXPORT_LORA ? params.lora_outfile.c_str() - : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR + : ex == JARVIS_EXAMPLE_CVECTOR_GENERATOR ? params.cvector_outfile.c_str() : params.out_file.c_str()), [](common_params & params, const std::string & value) { @@ -1687,49 +1687,49 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.cvector_outfile = value; params.lora_outfile = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); + ).set_examples({JARVIS_EXAMPLE_IMATRIX, JARVIS_EXAMPLE_CVECTOR_GENERATOR, JARVIS_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), [](common_params & params, int value) { params.n_out_freq = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + ).set_examples({JARVIS_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--save-frequency"}, "N", string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), [](common_params & params, int value) { params.n_save_freq = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + ).set_examples({JARVIS_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--process-output"}, string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), [](common_params & params) { params.process_output = true; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + ).set_examples({JARVIS_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--no-ppl"}, string_format("do not compute perplexity (default: %s)", params.compute_ppl ? 
"true" : "false"), [](common_params & params) { params.compute_ppl = false; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + ).set_examples({JARVIS_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--chunk", "--from-chunk"}, "N", string_format("start processing the input from chunk N (default: %d)", params.i_chunk), [](common_params & params, int value) { params.i_chunk = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + ).set_examples({JARVIS_EXAMPLE_IMATRIX})); add_opt(common_arg( {"-pps"}, string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), [](common_params & params) { params.is_pp_shared = true; } - ).set_examples({LLAMA_EXAMPLE_BENCH})); + ).set_examples({JARVIS_EXAMPLE_BENCH})); add_opt(common_arg( {"-npp"}, "n0,n1,...", "number of prompt tokens", @@ -1737,7 +1737,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex auto p = string_split(value, ','); params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); } - ).set_examples({LLAMA_EXAMPLE_BENCH})); + ).set_examples({JARVIS_EXAMPLE_BENCH})); add_opt(common_arg( {"-ntg"}, "n0,n1,...", "number of text generation tokens", @@ -1745,7 +1745,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex auto p = string_split(value, ','); params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); } - ).set_examples({LLAMA_EXAMPLE_BENCH})); + ).set_examples({JARVIS_EXAMPLE_BENCH})); add_opt(common_arg( {"-npl"}, "n0,n1,...", "number of parallel prompts", @@ -1753,70 +1753,70 @@ common_params_context common_params_parser_init(common_params & params, llama_ex auto p = string_split(value, ','); params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); } - ).set_examples({LLAMA_EXAMPLE_BENCH})); + ).set_examples({JARVIS_EXAMPLE_BENCH})); add_opt(common_arg( {"--embd-normalize"}, "N", string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), [](common_params & params, int value) { params.embd_normalize = value; } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + ).set_examples({JARVIS_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--embd-output-format"}, "FORMAT", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", [](common_params & params, const std::string & value) { params.embd_out = value; } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + ).set_examples({JARVIS_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--embd-separator"}, "STRING", "separator of embeddings (default \\n) for example \"<#sep#>\"", [](common_params & params, const std::string & value) { params.embd_sep = value; } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + ).set_examples({JARVIS_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--host"}, "HOST", string_format("ip address to listen (default: %s)", params.hostname.c_str()), [](common_params & params, const std::string & value) { params.hostname = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_HOST")); add_opt(common_arg( {"--port"}, "PORT", string_format("port to listen (default: %d)", params.port), [](common_params & params, int value) { params.port = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_PORT")); add_opt(common_arg( {"--path"}, "PATH", string_format("path to 
serve static files from (default: %s)", params.public_path.c_str()), [](common_params & params, const std::string & value) { params.public_path = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_STATIC_PATH")); add_opt(common_arg( {"--embedding", "--embeddings"}, string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), [](common_params & params) { params.embedding = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_EMBEDDINGS")); add_opt(common_arg( {"--reranking", "--rerank"}, string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"), [](common_params & params) { params.reranking = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_RERANKING")); add_opt(common_arg( {"--api-key"}, "KEY", "API key to use for authentication (default: none)", [](common_params & params, const std::string & value) { params.api_keys.push_back(value); } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_API_KEY")); add_opt(common_arg( {"--api-key-file"}, "FNAME", "path to file containing API keys (default: none)", @@ -1833,21 +1833,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } key_file.close(); } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({JARVIS_EXAMPLE_SERVER})); add_opt(common_arg( {"--ssl-key-file"}, "FNAME", "path to file a PEM-encoded SSL private key", [](common_params & params, const std::string & value) { params.ssl_file_key = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_SSL_KEY_FILE")); add_opt(common_arg( {"--ssl-cert-file"}, "FNAME", "path to file a PEM-encoded SSL certificate", [](common_params & params, const std::string & value) { params.ssl_file_cert = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_SSL_CERT_FILE")); add_opt(common_arg( {"-to", "--timeout"}, "N", string_format("server read/write timeout in seconds (default: %d)", params.timeout_read), @@ -1855,49 +1855,49 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.timeout_read = value; params.timeout_write = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_TIMEOUT")); add_opt(common_arg( {"--threads-http"}, "N", string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), [](common_params & params, int value) { params.n_threads_http = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_THREADS_HTTP")); add_opt(common_arg( {"--cache-reuse"}, "N", string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse), [](common_params & params, int value) { params.n_cache_reuse = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE")); 
+ ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_CACHE_REUSE")); add_opt(common_arg( {"--metrics"}, string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), [](common_params & params) { params.endpoint_metrics = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_ENDPOINT_METRICS")); add_opt(common_arg( {"--slots"}, string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), [](common_params & params) { params.endpoint_slots = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_ENDPOINT_SLOTS")); add_opt(common_arg( {"--props"}, string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), [](common_params & params) { params.endpoint_props = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_ENDPOINT_PROPS")); add_opt(common_arg( {"--no-slots"}, "disables slots monitoring endpoint", [](common_params & params) { params.endpoint_slots = false; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); + ).set_examples({JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_NO_ENDPOINT_SLOTS")); add_opt(common_arg( {"--slot-save-path"}, "PATH", "path to save slot kv cache (default: disabled)", @@ -1908,44 +1908,44 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.slot_save_path += DIRECTORY_SEPARATOR; } } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({JARVIS_EXAMPLE_SERVER})); add_opt(common_arg( {"--chat-template"}, "JINJA_TEMPLATE", "set custom jinja chat template (default: template taken from model's metadata)\n" "if suffix/prefix are specified, template will be disabled\n" - "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", + "only commonly used templates are accepted:\nhttps://github.com/ggerganov/jarvis.cpp/wiki/Templates-supported-by-jarvis_chat_apply_template", [](common_params & params, const std::string & value) { if (!common_chat_verify_template(value)) { throw std::runtime_error(string_format( "error: the supplied chat template is not supported: %s\n" - "note: llama.cpp does not use jinja parser, we only support commonly used templates\n", + "note: jarvis.cpp does not use jinja parser, we only support commonly used templates\n", value.c_str() )); } params.chat_template = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); + ).set_examples({JARVIS_EXAMPLE_MAIN, JARVIS_EXAMPLE_SERVER}).set_env("JARVIS_ARG_CHAT_TEMPLATE")); add_opt(common_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), [](common_params & params, const std::string & value) { params.slot_prompt_similarity = std::stof(value); } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({JARVIS_EXAMPLE_SERVER})); add_opt(common_arg( {"--lora-init-without-apply"}, string_format("load LoRA adapters without applying them (apply later via POST 
/lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), [](common_params & params) { params.lora_init_without_apply = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({JARVIS_EXAMPLE_SERVER})); add_opt(common_arg( {"--simple-io"}, "use basic IO for better compatibility in subprocesses and limited consoles", [](common_params & params) { params.simple_io = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + ).set_examples({JARVIS_EXAMPLE_MAIN, JARVIS_EXAMPLE_INFILL})); add_opt(common_arg( {"-ld", "--logdir"}, "LOGDIR", "path under which to save YAML logs (no logging if unset)", @@ -1963,28 +1963,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.cvector_positive_file = value; } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + ).set_examples({JARVIS_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--negative-file"}, "FNAME", string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), [](common_params & params, const std::string & value) { params.cvector_negative_file = value; } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + ).set_examples({JARVIS_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--pca-batch"}, "N", string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), [](common_params & params, int value) { params.n_pca_batch = value; } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + ).set_examples({JARVIS_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--pca-iter"}, "N", string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), [](common_params & params, int value) { params.n_pca_iterations = value; } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + ).set_examples({JARVIS_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--method"}, "{pca, mean}", "dimensionality reduction method to be used (default: pca)", @@ -1993,7 +1993,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } else { throw std::invalid_argument("invalid value"); } } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + ).set_examples({JARVIS_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--output-format"}, "{md,jsonl}", "output format for batched-bench results (default: md)", @@ -2002,7 +2002,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex else if (value == "md") { params.batched_bench_output_jsonl = false; } else { std::invalid_argument("invalid value"); } } - ).set_examples({LLAMA_EXAMPLE_BENCH})); + ).set_examples({JARVIS_EXAMPLE_BENCH})); add_opt(common_arg( {"--log-disable"}, "Log disable", @@ -2023,7 +2023,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params &) { common_log_set_colors(common_log_main(), true); } - ).set_env("LLAMA_LOG_COLORS")); + ).set_env("JARVIS_LOG_COLORS")); add_opt(common_arg( {"-v", "--verbose", "--log-verbose"}, "Set verbosity level to infinity (i.e. 
log all messages, useful for debugging)",
@@ -2039,21 +2039,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.verbosity = value;
             common_log_set_verbosity_thold(value);
         }
-    ).set_env("LLAMA_LOG_VERBOSITY"));
+    ).set_env("JARVIS_LOG_VERBOSITY"));
     add_opt(common_arg(
         {"--log-prefix"},
         "Enable prefx in log messages",
         [](common_params &) {
             common_log_set_prefix(common_log_main(), true);
         }
-    ).set_env("LLAMA_LOG_PREFIX"));
+    ).set_env("JARVIS_LOG_PREFIX"));
     add_opt(common_arg(
         {"--log-timestamps"},
         "Enable timestamps in log messages",
         [](common_params &) {
             common_log_set_timestamps(common_log_main(), true);
         }
-    ).set_env("LLAMA_LOG_TIMESTAMPS"));
+    ).set_env("JARVIS_LOG_TIMESTAMPS"));
     return ctx_arg;
 }
diff --git a/common/arg.h b/common/arg.h
index a6700d323cc14..7c6f1eeea3308 100644
--- a/common/arg.h
+++ b/common/arg.h
@@ -11,7 +11,7 @@
 //
 struct common_arg {
-    std::set examples = {LLAMA_EXAMPLE_COMMON};
+    std::set examples = {JARVIS_EXAMPLE_COMMON};
     std::vector args;
     const char * value_hint = nullptr; // help text or example for arg value
     const char * value_hint_2 = nullptr; // for second arg value
@@ -52,17 +52,17 @@ struct common_arg {
         void (*handler)(common_params & params, const std::string &, const std::string &)
     ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
-    common_arg & set_examples(std::initializer_list examples);
+    common_arg & set_examples(std::initializer_list examples);
     common_arg & set_env(const char * env);
     common_arg & set_sparam();
-    bool in_example(enum llama_example ex);
+    bool in_example(enum jarvis_example ex);
     bool get_value_from_env(std::string & output);
     bool has_value_from_env();
     std::string to_string();
 };
 struct common_params_context {
-    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
+    enum jarvis_example ex = JARVIS_EXAMPLE_COMMON;
     common_params & params;
     std::vector options;
     void(*print_usage)(int, char **) = nullptr;
@@ -71,7 +71,7 @@ struct common_params_context {
 // parse input arguments from CLI
 // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
-bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+bool common_params_parse(int argc, char ** argv, common_params & params, jarvis_example ex, void(*print_usage)(int, char **) = nullptr);
 // function to be used by test-arg-parser
-common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+common_params_context common_params_parser_init(common_params & params, jarvis_example ex, void(*print_usage)(int, char **) = nullptr);
diff --git a/common/build-info.cpp.in b/common/build-info.cpp.in
index 0b945aa68fff3..aac4ba7e9e33a 100644
--- a/common/build-info.cpp.in
+++ b/common/build-info.cpp.in
@@ -1,4 +1,4 @@
-int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
-char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
-char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
-char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
+int JARVIS_BUILD_NUMBER = @BUILD_NUMBER@;
+char const *JARVIS_COMMIT = "@BUILD_COMMIT@";
+char const *JARVIS_COMPILER = "@BUILD_COMPILER@";
+char const *JARVIS_BUILD_TARGET = "@BUILD_TARGET@";
diff --git a/common/common.cpp b/common/common.cpp
index ff8cc4076e95d..fa32f671eb6f5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -8,7 +8,7 @@
 #define JSON_ASSERT GGML_ASSERT
 #include
"json.hpp" #include "json-schema-to-grammar.h" -#include "llama.h" +#include "jarvis.h" #include #include @@ -48,7 +48,7 @@ #include #include #endif -#if defined(LLAMA_USE_CURL) +#if defined(JARVIS_USE_CURL) #include #include #include @@ -58,7 +58,7 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -#if defined(LLAMA_USE_CURL) +#if defined(JARVIS_USE_CURL) #ifdef __linux__ #include #elif defined(_WIN32) @@ -66,8 +66,8 @@ #else #include #endif -#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 -#endif // LLAMA_USE_CURL +#define JARVIS_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 +#endif // JARVIS_USE_CURL using json = nlohmann::ordered_json; @@ -364,8 +364,8 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD } void common_init() { - llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) { - if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) { + jarvis_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) { + if (LOG_DEFAULT_JARVIS <= common_log_verbosity_thold) { common_log_add(common_log_main(), level, "%s", text); } }, NULL); @@ -376,7 +376,7 @@ void common_init() { const char * build_type = " (debug)"; #endif - LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type); + LOG_INF("build: %d (%s) with %s for %s%s\n", JARVIS_BUILD_NUMBER, JARVIS_COMMIT, JARVIS_COMPILER, JARVIS_BUILD_TARGET, build_type); } std::string common_params_get_system_info(const common_params & params) { @@ -389,9 +389,9 @@ std::string common_params_get_system_info(const common_params & params) { #if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later // TODO: windows + arm64 + mingw64 DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); - os << " / " << logicalProcessorCount << " | " << llama_print_system_info(); + os << " / " << logicalProcessorCount << " | " << jarvis_print_system_info(); #else - os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info(); + os << " / " << std::thread::hardware_concurrency() << " | " << jarvis_print_system_info(); #endif return os.str(); @@ -483,7 +483,7 @@ std::string string_from(const std::vector & values) { return buf.str(); } -std::string string_from(const struct llama_context * ctx, const std::vector & tokens) { +std::string string_from(const struct jarvis_context * ctx, const std::vector & tokens) { std::stringstream buf; buf << "[ "; @@ -514,7 +514,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector & overrides) { +bool string_parse_kv_override(const char * data, std::vector & overrides) { const char * sep = strchr(data, '='); if (sep == nullptr || sep - data >= 128) { LOG_ERR("%s: malformed KV override '%s'\n", __func__, data); return false; } - llama_model_kv_override kvo; + jarvis_model_kv_override kvo; std::strncpy(kvo.key, data, sep - data); kvo.key[sep - data] = 0; sep++; if (strncmp(sep, "int:", 4) == 0) { sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.tag = JARVIS_KV_OVERRIDE_TYPE_INT; kvo.val_i64 = std::atol(sep); } else if (strncmp(sep, "float:", 6) == 0) { sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; + kvo.tag = JARVIS_KV_OVERRIDE_TYPE_FLOAT; kvo.val_f64 = std::atof(sep); } else if (strncmp(sep, "bool:", 5) == 0) { sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; + kvo.tag = JARVIS_KV_OVERRIDE_TYPE_BOOL; if 
(std::strcmp(sep, "true") == 0) { kvo.val_bool = true; } else if (std::strcmp(sep, "false") == 0) { @@ -617,7 +617,7 @@ bool string_parse_kv_override(const char * data, std::vector 127) { LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); return false; @@ -788,8 +788,8 @@ std::string fs_get_cache_directory() { } return p; }; - if (getenv("LLAMA_CACHE")) { - cache_directory = std::getenv("LLAMA_CACHE"); + if (getenv("JARVIS_CACHE")) { + cache_directory = std::getenv("JARVIS_CACHE"); } else { #ifdef __linux__ if (std::getenv("XDG_CACHE_HOME")) { @@ -803,7 +803,7 @@ std::string fs_get_cache_directory() { cache_directory = std::getenv("LOCALAPPDATA"); #endif // __linux__ cache_directory = ensure_trailing_slash(cache_directory); - cache_directory += "llama.cpp"; + cache_directory += "jarvis.cpp"; } return ensure_trailing_slash(cache_directory); } @@ -824,16 +824,16 @@ std::string fs_get_cache_file(const std::string & filename) { // struct common_init_result common_init_from_params(common_params & params) { common_init_result iparams; - auto mparams = common_model_params_to_llama(params); + auto mparams = common_model_params_to_jarvis(params); - llama_model * model = nullptr; + jarvis_model * model = nullptr; if (!params.hf_repo.empty() && !params.hf_file.empty()) { model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams); } else if (!params.model_url.empty()) { model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams); } else { - model = llama_load_model_from_file(params.model.c_str(), mparams); + model = jarvis_load_model_from_file(params.model.c_str(), mparams); } if (model == NULL) { @@ -844,58 +844,58 @@ struct common_init_result common_init_from_params(common_params & params) { if (params.reranking) { bool ok = true; - if (llama_token_bos(model) == LLAMA_TOKEN_NULL) { + if (jarvis_token_bos(model) == JARVIS_TOKEN_NULL) { LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__); ok = false; } - if (llama_token_eos(model) == LLAMA_TOKEN_NULL) { + if (jarvis_token_eos(model) == JARVIS_TOKEN_NULL) { LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__); ok = false; } - if (llama_token_sep(model) == LLAMA_TOKEN_NULL) { + if (jarvis_token_sep(model) == JARVIS_TOKEN_NULL) { LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__); ok = false; } if (!ok) { - llama_free_model(model); + jarvis_free_model(model); return iparams; } } - auto cparams = common_context_params_to_llama(params); + auto cparams = common_context_params_to_jarvis(params); - llama_context * lctx = llama_new_context_with_model(model, cparams); + jarvis_context * lctx = jarvis_new_context_with_model(model, cparams); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str()); - llama_free_model(model); + jarvis_free_model(model); return iparams; } if (!params.control_vectors.empty()) { if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; - if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model); + if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = jarvis_n_layer(model); const auto cvec = common_control_vector_load(params.control_vectors); if (cvec.n_embd == -1) { - llama_free(lctx); - 
llama_free_model(model); + jarvis_free(lctx); + jarvis_free_model(model); return iparams; } - int err = llama_control_vector_apply(lctx, + int err = jarvis_control_vector_apply(lctx, cvec.data.data(), cvec.data.size(), cvec.n_embd, params.control_vector_layer_start, params.control_vector_layer_end); if (err) { - llama_free(lctx); - llama_free_model(model); + jarvis_free(lctx); + jarvis_free_model(model); return iparams; } @@ -906,11 +906,11 @@ struct common_init_result common_init_from_params(common_params & params) { common_lora_adapter_container loaded_la; loaded_la.path = la.path; loaded_la.scale = la.scale; - loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); + loaded_la.adapter = jarvis_lora_adapter_init(model, la.path.c_str()); if (loaded_la.adapter == nullptr) { LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); - llama_free(lctx); - llama_free_model(model); + jarvis_free(lctx); + jarvis_free_model(model); return iparams; } iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters @@ -919,7 +919,7 @@ struct common_init_result common_init_from_params(common_params & params) { common_lora_adapters_apply(lctx, iparams.lora_adapters); } - if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) { + if (params.sparams.ignore_eos && jarvis_token_eos(model) == JARVIS_TOKEN_NULL) { LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__); params.sparams.ignore_eos = false; } @@ -927,35 +927,35 @@ struct common_init_result common_init_from_params(common_params & params) { if (params.warmup) { LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); - std::vector tmp; - llama_token bos = llama_token_bos(model); - llama_token eos = llama_token_eos(model); + std::vector tmp; + jarvis_token bos = jarvis_token_bos(model); + jarvis_token eos = jarvis_token_eos(model); // some models (e.g. 
T5) don't have a BOS token - if (bos != LLAMA_TOKEN_NULL) { + if (bos != JARVIS_TOKEN_NULL) { tmp.push_back(bos); } - if (eos != LLAMA_TOKEN_NULL) { + if (eos != JARVIS_TOKEN_NULL) { tmp.push_back(eos); } if (tmp.empty()) { tmp.push_back(0); } - if (llama_model_has_encoder(model)) { - llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size())); - llama_token decoder_start_token_id = llama_model_decoder_start_token(model); + if (jarvis_model_has_encoder(model)) { + jarvis_encode(lctx, jarvis_batch_get_one(tmp.data(), tmp.size())); + jarvis_token decoder_start_token_id = jarvis_model_decoder_start_token(model); if (decoder_start_token_id == -1) { decoder_start_token_id = bos; } tmp.clear(); tmp.push_back(decoder_start_token_id); } - if (llama_model_has_decoder(model)) { - llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch))); + if (jarvis_model_has_decoder(model)) { + jarvis_decode(lctx, jarvis_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch))); } - llama_kv_cache_clear(lctx); - llama_synchronize(lctx); - llama_perf_context_reset(lctx); + jarvis_kv_cache_clear(lctx); + jarvis_synchronize(lctx); + jarvis_perf_context_reset(lctx); } iparams.model = model; @@ -964,17 +964,17 @@ struct common_init_result common_init_from_params(common_params & params) { return iparams; } -void common_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters) { - llama_lora_adapter_clear(ctx); +void common_lora_adapters_apply(struct jarvis_context * ctx, std::vector & lora_adapters) { + jarvis_lora_adapter_clear(ctx); for (auto & la : lora_adapters) { if (la.scale != 0.0f) { - llama_lora_adapter_set(ctx, la.adapter, la.scale); + jarvis_lora_adapter_set(ctx, la.adapter, la.scale); } } } -struct llama_model_params common_model_params_to_llama(const common_params & params) { - auto mparams = llama_model_default_params(); +struct jarvis_model_params common_model_params_to_jarvis(const common_params & params) { + auto mparams = jarvis_model_default_params(); if (params.n_gpu_layers != -1) { mparams.n_gpu_layers = params.n_gpu_layers; @@ -1025,8 +1025,8 @@ static ggml_type kv_cache_type_from_str(const std::string & s) { throw std::runtime_error("Unsupported cache type: " + s); } -struct llama_context_params common_context_params_to_llama(const common_params & params) { - auto cparams = llama_context_default_params(); +struct jarvis_context_params common_context_params_to_jarvis(const common_params & params) { + auto cparams = jarvis_context_default_params(); cparams.n_ctx = params.n_ctx; cparams.n_seq_max = params.n_parallel; @@ -1056,7 +1056,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & if (params.reranking) { cparams.embeddings = true; - cparams.pooling_type = LLAMA_POOLING_TYPE_RANK; + cparams.pooling_type = JARVIS_POOLING_TYPE_RANK; } cparams.type_k = kv_cache_type_from_str(params.cache_type_k); @@ -1081,7 +1081,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p return tpp; } -#ifdef LLAMA_USE_CURL +#ifdef JARVIS_USE_CURL #define CURL_MAX_RETRY 3 #define CURL_RETRY_DELAY_SECONDS 2 @@ -1279,7 +1279,7 @@ static bool common_download_file(const std::string & url, const std::string & pa curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L); // helper function to hide password in URL - auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string { + auto jarvis_download_hide_password_in_url = [](const std::string & url) -> std::string 
{ std::size_t protocol_pos = url.find("://"); if (protocol_pos == std::string::npos) { return url; // Malformed URL @@ -1295,7 +1295,7 @@ static bool common_download_file(const std::string & url, const std::string & pa // start the download LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, - llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); + jarvis_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); if (!was_perform_successful) { return false; @@ -1329,11 +1329,11 @@ static bool common_download_file(const std::string & url, const std::string & pa return true; } -struct llama_model * common_load_model_from_url( +struct jarvis_model * common_load_model_from_url( const char * model_url, const char * path_model, const char * hf_token, - const struct llama_model_params & params) { + const struct jarvis_model_params & params) { // Basic validation of the model_url if (!model_url || strlen(model_url) == 0) { LOG_ERR("%s: invalid model_url\n", __func__); @@ -1367,17 +1367,17 @@ struct llama_model * common_load_model_from_url( if (n_split > 1) { char split_prefix[PATH_MAX] = {0}; - char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0}; + char split_url_prefix[JARVIS_CURL_MAX_URL_LENGTH] = {0}; // Verify the first split file format // and extract split URL and PATH prefixes { - if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { + if (!jarvis_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split); return NULL; } - if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { + if (!jarvis_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split); return NULL; } @@ -1388,10 +1388,10 @@ struct llama_model * common_load_model_from_url( for (int idx = 1; idx < n_split; idx++) { futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool { char split_path[PATH_MAX] = {0}; - llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split); + jarvis_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split); - char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; - llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); + char split_url[JARVIS_CURL_MAX_URL_LENGTH] = {0}; + jarvis_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); return common_download_file(split_url, split_path, hf_token); }, idx)); @@ -1405,19 +1405,19 @@ struct llama_model * common_load_model_from_url( } } - return llama_load_model_from_file(path_model, params); + return jarvis_load_model_from_file(path_model, params); } -struct llama_model * common_load_model_from_hf( +struct jarvis_model * common_load_model_from_hf( const char * repo, const char * model, const char * path_model, const char * hf_token, - const struct llama_model_params & params) { + const struct jarvis_model_params & params) { // construct hugging face model url: // - // --repo ggml-org/models --file 
tinyllama-1.1b/ggml-model-f16.gguf - // https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf + // --repo ggml-org/models --file tinyjarvis-1.1b/ggml-model-f16.gguf + // https://huggingface.co/ggml-org/models/resolve/main/tinyjarvis-1.1b/ggml-model-f16.gguf // // --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf // https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf @@ -1433,42 +1433,42 @@ struct llama_model * common_load_model_from_hf( #else -struct llama_model * common_load_model_from_url( +struct jarvis_model * common_load_model_from_url( const char * /*model_url*/, const char * /*path_model*/, const char * /*hf_token*/, - const struct llama_model_params & /*params*/) { - LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); + const struct jarvis_model_params & /*params*/) { + LOG_WRN("%s: jarvis.cpp built without libcurl, downloading from an url not supported.\n", __func__); return nullptr; } -struct llama_model * common_load_model_from_hf( +struct jarvis_model * common_load_model_from_hf( const char * /*repo*/, const char * /*model*/, const char * /*path_model*/, const char * /*hf_token*/, - const struct llama_model_params & /*params*/) { - LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); + const struct jarvis_model_params & /*params*/) { + LOG_WRN("%s: jarvis.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); return nullptr; } -#endif // LLAMA_USE_CURL +#endif // JARVIS_USE_CURL // // Batch utils // -void common_batch_clear(struct llama_batch & batch) { +void common_batch_clear(struct jarvis_batch & batch) { batch.n_tokens = 0; } void common_batch_add( - struct llama_batch & batch, - llama_token id, - llama_pos pos, - const std::vector & seq_ids, + struct jarvis_batch & batch, + jarvis_token id, + jarvis_pos pos, + const std::vector & seq_ids, bool logits) { - GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded"); + GGML_ASSERT(batch.seq_id[batch.n_tokens] && "jarvis_batch size exceeded"); batch.token [batch.n_tokens] = id; batch.pos [batch.n_tokens] = pos; @@ -1485,26 +1485,26 @@ void common_batch_add( // Vocab utils // -std::vector common_tokenize( - const struct llama_context * ctx, +std::vector common_tokenize( + const struct jarvis_context * ctx, const std::string & text, bool add_special, bool parse_special) { - return common_tokenize(llama_get_model(ctx), text, add_special, parse_special); + return common_tokenize(jarvis_get_model(ctx), text, add_special, parse_special); } -std::vector common_tokenize( - const struct llama_model * model, +std::vector common_tokenize( + const struct jarvis_model * model, const std::string & text, bool add_special, bool parse_special) { // upper limit for the number of tokens int n_tokens = text.length() + 2 * add_special; - std::vector result(n_tokens); - n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); + std::vector result(n_tokens); + n_tokens = jarvis_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); + int check = jarvis_tokenize(model, text.data(), text.length(), result.data(), result.size(), 
add_special, parse_special); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -1512,13 +1512,13 @@ std::vector common_tokenize( return result; } -std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { +std::string common_token_to_piece(const struct jarvis_context * ctx, jarvis_token token, bool special) { std::string piece; piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n' - const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special); + const int n_chars = jarvis_token_to_piece(jarvis_get_model(ctx), token, &piece[0], piece.size(), 0, special); if (n_chars < 0) { piece.resize(-n_chars); - int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special); + int check = jarvis_token_to_piece(jarvis_get_model(ctx), token, &piece[0], piece.size(), 0, special); GGML_ASSERT(check == -n_chars); } else { @@ -1528,13 +1528,13 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token return piece; } -std::string common_detokenize(llama_context * ctx, const std::vector & tokens, bool special) { +std::string common_detokenize(jarvis_context * ctx, const std::vector & tokens, bool special) { std::string text; text.resize(std::max(text.capacity(), tokens.size())); - int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); + int32_t n_chars = jarvis_detokenize(jarvis_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); if (n_chars < 0) { text.resize(-n_chars); - n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); + n_chars = jarvis_detokenize(jarvis_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization } @@ -1549,18 +1549,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector= 0; } -std::string common_chat_apply_template(const struct llama_model * model, +std::string common_chat_apply_template(const struct jarvis_model * model, const std::string & tmpl, const std::vector & msgs, bool add_ass) { int alloc_size = 0; bool fallback = false; // indicate if we must fallback to default chatml - std::vector chat; + std::vector chat; for (auto & msg : msgs) { chat.push_back({msg.role.c_str(), msg.content.c_str()}); alloc_size += (msg.role.size() + msg.content.size()) * 1.25; @@ -1570,17 +1570,17 @@ std::string common_chat_apply_template(const struct llama_model * model, std::vector buf(alloc_size); // run the first time to get the total output length - int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + int32_t res = jarvis_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); // error: chat template is not supported if (res < 0) { if (ptr_tmpl != nullptr) { // if the custom "tmpl" is not supported, we throw an error - // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template() + // this is a bit redundant (for good), since we're not sure if user validated the custom template with jarvis_chat_verify_template() throw 
std::runtime_error("this custom template is not supported"); } else { // If the built-in template is not supported, we default to chatml - res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + res = jarvis_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size()); fallback = true; } } @@ -1588,7 +1588,7 @@ std::string common_chat_apply_template(const struct llama_model * model, // if it turns out that our buffer is too small, we resize it if ((size_t) res > buf.size()) { buf.resize(res); - res = llama_chat_apply_template( + res = jarvis_chat_apply_template( fallback ? nullptr : model, fallback ? "chatml" : ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); @@ -1598,7 +1598,7 @@ std::string common_chat_apply_template(const struct llama_model * model, return formatted_chat; } -std::string common_chat_format_single(const struct llama_model * model, +std::string common_chat_format_single(const struct jarvis_model * model, const std::string & tmpl, const std::vector & past_msg, const common_chat_msg & new_msg, @@ -1618,7 +1618,7 @@ std::string common_chat_format_single(const struct llama_model * model, return ss.str(); } -std::string common_chat_format_example(const struct llama_model * model, +std::string common_chat_format_example(const struct jarvis_model * model, const std::string & tmpl) { std::vector msgs = { {"system", "You are a helpful assistant"}, @@ -1633,14 +1633,14 @@ std::string common_chat_format_example(const struct llama_model * model, // KV cache utils // -void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) { +void common_kv_cache_dump_view(const jarvis_kv_cache_view & view, int row_size) { static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+"; printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); - llama_kv_cache_view_cell * c_curr = view.cells; - llama_seq_id * cs_curr = view.cells_sequences; + jarvis_kv_cache_view_cell * c_curr = view.cells; + jarvis_seq_id * cs_curr = view.cells_sequences; for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) { if (i % row_size == 0) { @@ -1656,15 +1656,15 @@ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) { printf("\n=== Done dumping\n"); } -void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) { +void common_kv_cache_dump_view_seqs(const jarvis_kv_cache_view & view, int row_size) { static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; printf("=== Dumping KV cache. 
total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); - std::unordered_map seqs; - llama_kv_cache_view_cell * c_curr = view.cells; - llama_seq_id * cs_curr = view.cells_sequences; + std::unordered_map seqs; + jarvis_kv_cache_view_cell * c_curr = view.cells; + jarvis_seq_id * cs_curr = view.cells_sequences; for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) { for (int j = 0; j < view.n_seq_max; j++) { @@ -1949,12 +1949,12 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha } } -void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx, +void yaml_dump_non_result_info(FILE * stream, const common_params & params, const jarvis_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { const auto & sparams = params.sparams; - fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); - fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); + fprintf(stream, "build_commit: %s\n", JARVIS_COMMIT); + fprintf(stream, "build_number: %d\n", JARVIS_BUILD_NUMBER); fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false"); @@ -1985,7 +1985,7 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons #endif // NDEBUG fprintf(stream, "model_desc: %s\n", model_desc); - fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx))); + fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", jarvis_n_vocab(jarvis_get_model(lctx))); #ifdef __OPTIMIZE__ fprintf(stream, "optimize: true\n"); @@ -2087,7 +2087,7 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false"); fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); - const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices()); + const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + jarvis_max_devices()); yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector); fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); diff --git a/common/common.h b/common/common.h index 18b2121ed89b0..e3e41053ff3d6 100644 --- a/common/common.h +++ b/common/common.h @@ -2,7 +2,7 @@ #pragma once -#include "llama.h" +#include "jarvis.h" #include #include @@ -18,8 +18,8 @@ #define die_fmt(fmt, ...) 
do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) #define print_build_info() do { \ - fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \ - fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ + fprintf(stderr, "%s: build = %d (%s)\n", __func__, JARVIS_BUILD_NUMBER, JARVIS_COMMIT); \ + fprintf(stderr, "%s: built with %s for %s\n", __func__, JARVIS_COMPILER, JARVIS_BUILD_TARGET); \ } while(0) #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" @@ -30,14 +30,14 @@ struct common_lora_adapter_info { }; struct common_lora_adapter_container : common_lora_adapter_info { - struct llama_lora_adapter * adapter; + struct jarvis_lora_adapter * adapter; }; // build info -extern int LLAMA_BUILD_NUMBER; -extern char const * LLAMA_COMMIT; -extern char const * LLAMA_COMPILER; -extern char const * LLAMA_BUILD_TARGET; +extern int JARVIS_BUILD_NUMBER; +extern char const * JARVIS_COMMIT; +extern char const * JARVIS_COMPILER; +extern char const * JARVIS_BUILD_TARGET; struct common_control_vector_load_info; @@ -61,25 +61,25 @@ int32_t cpu_get_num_math(); // Common params // -enum llama_example { - LLAMA_EXAMPLE_COMMON, - LLAMA_EXAMPLE_SPECULATIVE, - LLAMA_EXAMPLE_MAIN, - LLAMA_EXAMPLE_INFILL, - LLAMA_EXAMPLE_EMBEDDING, - LLAMA_EXAMPLE_PERPLEXITY, - LLAMA_EXAMPLE_RETRIEVAL, - LLAMA_EXAMPLE_PASSKEY, - LLAMA_EXAMPLE_IMATRIX, - LLAMA_EXAMPLE_BENCH, - LLAMA_EXAMPLE_SERVER, - LLAMA_EXAMPLE_CVECTOR_GENERATOR, - LLAMA_EXAMPLE_EXPORT_LORA, - LLAMA_EXAMPLE_LLAVA, - LLAMA_EXAMPLE_LOOKUP, - LLAMA_EXAMPLE_PARALLEL, - - LLAMA_EXAMPLE_COUNT, +enum jarvis_example { + JARVIS_EXAMPLE_COMMON, + JARVIS_EXAMPLE_SPECULATIVE, + JARVIS_EXAMPLE_MAIN, + JARVIS_EXAMPLE_INFILL, + JARVIS_EXAMPLE_EMBEDDING, + JARVIS_EXAMPLE_PERPLEXITY, + JARVIS_EXAMPLE_RETRIEVAL, + JARVIS_EXAMPLE_PASSKEY, + JARVIS_EXAMPLE_IMATRIX, + JARVIS_EXAMPLE_BENCH, + JARVIS_EXAMPLE_SERVER, + JARVIS_EXAMPLE_CVECTOR_GENERATOR, + JARVIS_EXAMPLE_EXPORT_LORA, + JARVIS_EXAMPLE_LLAVA, + JARVIS_EXAMPLE_LOOKUP, + JARVIS_EXAMPLE_PARALLEL, + + JARVIS_EXAMPLE_COUNT, }; enum common_sampler_type { @@ -103,7 +103,7 @@ enum dimre_method { // sampler parameters struct common_sampler_params { - uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler + uint32_t seed = JARVIS_DEFAULT_SEED; // the seed used to initialize jarvis_sampler int32_t n_prev = 64; // number of previous tokens to remember int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
@@ -149,7 +149,7 @@ struct common_sampler_params { std::string grammar; // optional BNF-like grammar to constrain sampling - std::vector logit_bias; // logit biases to apply + std::vector logit_bias; // logit biases to apply // print the parameters into a string std::string print() const; @@ -192,10 +192,10 @@ struct common_params { ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; - enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs - enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; - enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings - enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings + enum jarvis_split_mode split_mode = JARVIS_SPLIT_MODE_LAYER; // how to split the model across GPUs + enum jarvis_rope_scaling_type rope_scaling_type = JARVIS_ROPE_SCALING_TYPE_UNSPECIFIED; + enum jarvis_pooling_type pooling_type = JARVIS_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings + enum jarvis_attention_type attention_type = JARVIS_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings struct common_sampler_params sparams; @@ -219,9 +219,9 @@ struct common_params { std::vector in_files; // all input files std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) - std::vector kv_overrides; + std::vector kv_overrides; - bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply) + bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using jarvis_lora_adapter_apply) std::vector lora_adapters; // lora adapter path with user defined scale std::vector control_vectors; // control vector with user defined scale @@ -377,15 +377,15 @@ bool set_process_priority(enum ggml_sched_priority prio); #ifdef __GNUC__ #ifdef __MINGW32__ -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#define JARVIS_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) #else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#define JARVIS_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) #endif #else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) +#define JARVIS_COMMON_ATTRIBUTE_FORMAT(...) 
#endif -LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) +JARVIS_COMMON_ATTRIBUTE_FORMAT(1, 2) std::string string_format(const char * fmt, ...); std::string string_strip(const std::string & str); @@ -424,13 +424,13 @@ std::vector string_split(const std::string & input, ch return parts; } -bool string_parse_kv_override(const char * data, std::vector & overrides); +bool string_parse_kv_override(const char * data, std::vector & overrides); void string_process_escapes(std::string & input); std::string string_from(bool value); std::string string_from(const std::vector & values); -std::string string_from(const struct llama_context * ctx, const std::vector & tokens); -std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch); +std::string string_from(const struct jarvis_context * ctx, const std::vector & tokens); +std::string string_from(const struct jarvis_context * ctx, const struct jarvis_batch & batch); // // Filesystem utils @@ -447,32 +447,32 @@ std::string fs_get_cache_file(const std::string & filename); // struct common_init_result { - struct llama_model * model = nullptr; - struct llama_context * context = nullptr; + struct jarvis_model * model = nullptr; + struct jarvis_context * context = nullptr; std::vector lora_adapters; }; struct common_init_result common_init_from_params(common_params & params); -struct llama_model_params common_model_params_to_llama (const common_params & params); -struct llama_context_params common_context_params_to_llama(const common_params & params); +struct jarvis_model_params common_model_params_to_jarvis (const common_params & params); +struct jarvis_context_params common_context_params_to_jarvis(const common_params & params); struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); -struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); -struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); +struct jarvis_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct jarvis_model_params & params); +struct jarvis_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct jarvis_model_params & params); // clear LoRA adapters from context, then apply new list of adapters -void common_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters); +void common_lora_adapters_apply(struct jarvis_context * ctx, std::vector & lora_adapters); // Batch utils -void common_batch_clear(struct llama_batch & batch); +void common_batch_clear(struct jarvis_batch & batch); void common_batch_add( - struct llama_batch & batch, - llama_token id, - llama_pos pos, - const std::vector & seq_ids, + struct jarvis_batch & batch, + jarvis_token id, + jarvis_pos pos, + const std::vector & seq_ids, bool logits); // @@ -481,14 +481,14 @@ void common_batch_add( // tokenizes a string into a vector of tokens // should work similar to Python's `tokenizer.encode` -std::vector common_tokenize( - const struct llama_context * ctx, +std::vector common_tokenize( + const struct jarvis_context * ctx, const std::string & text, bool add_special, bool parse_special = false); -std::vector common_tokenize( - const struct llama_model * model, +std::vector common_tokenize( + 
const struct jarvis_model * model, const std::string & text, bool add_special, bool parse_special = false); @@ -496,23 +496,23 @@ std::vector common_tokenize( // tokenizes a token into a piece, optionally renders special/control tokens // should work similar to Python's `tokenizer.id_to_piece` std::string common_token_to_piece( - const struct llama_context * ctx, - llama_token token, + const struct jarvis_context * ctx, + jarvis_token token, bool special = true); // detokenizes a vector of tokens into a string // should work similar to Python's `tokenizer.decode` // optionally renders special/control tokens std::string common_detokenize( - llama_context * ctx, - const std::vector & tokens, + jarvis_context * ctx, + const std::vector & tokens, bool special = true); // // Chat template utils // -// same with llama_chat_message, but uses std::string +// same with jarvis_chat_message, but uses std::string struct common_chat_msg { std::string role; std::string content; @@ -521,23 +521,23 @@ struct common_chat_msg { // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid bool common_chat_verify_template(const std::string & tmpl); -// CPP wrapper for llama_chat_apply_template +// CPP wrapper for jarvis_chat_apply_template // If the built-in template is not supported, we default to chatml // If the custom "tmpl" is not supported, we throw an error -std::string common_chat_apply_template(const struct llama_model * model, +std::string common_chat_apply_template(const struct jarvis_model * model, const std::string & tmpl, const std::vector & chat, bool add_ass); // Format single message, while taking into account the position of that message in chat history -std::string common_chat_format_single(const struct llama_model * model, +std::string common_chat_format_single(const struct jarvis_model * model, const std::string & tmpl, const std::vector & past_msg, const common_chat_msg & new_msg, bool add_ass); // Returns an example of formatted chat -std::string common_chat_format_example(const struct llama_model * model, +std::string common_chat_format_example(const struct jarvis_model * model, const std::string & tmpl); // @@ -545,10 +545,10 @@ std::string common_chat_format_example(const struct llama_model * model, // // Dump the KV cache view with the number of sequences per cell. -void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80); +void common_kv_cache_dump_view(const jarvis_kv_cache_view & view, int row_size = 80); // Dump the KV cache view showing individual sequences in each cell (long output). 
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40); +void common_kv_cache_dump_view_seqs(const jarvis_kv_cache_view & view, int row_size = 40); // // Embedding utils @@ -596,5 +596,5 @@ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data); void yaml_dump_non_result_info( - FILE * stream, const common_params & params, const llama_context * lctx, + FILE * stream, const common_params & params, const jarvis_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); diff --git a/common/console.cpp b/common/console.cpp index 078a8d678d933..d7c1d46d8dd09 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -435,7 +435,7 @@ namespace console { fputc('\n', out); has_more = !has_more; } else { - // llama will just eat the single space, it won't act as a space + // jarvis will just eat the single space, it won't act as a space if (line.length() == 1 && line.back() == ' ') { line.clear(); pop_cursor(); diff --git a/common/json.hpp b/common/json.hpp index a858728c4ceb8..a6f53f0b45aca 100644 --- a/common/json.hpp +++ b/common/json.hpp @@ -5336,7 +5336,7 @@ template class iteration_proxy }; // Structured Bindings Support -// For further reference see https://blog.tartanllama.xyz/structured-bindings/ +// For further reference see https://blog.tartanjarvis.xyz/structured-bindings/ // And see https://github.com/nlohmann/json/pull/1391 template = 0> auto get(const nlohmann::detail::iteration_proxy_value& i) -> decltype(i.key()) @@ -5344,7 +5344,7 @@ auto get(const nlohmann::detail::iteration_proxy_value& i) -> decl return i.key(); } // Structured Bindings Support -// For further reference see https://blog.tartanllama.xyz/structured-bindings/ +// For further reference see https://blog.tartanjarvis.xyz/structured-bindings/ // And see https://github.com/nlohmann/json/pull/1391 template = 0> auto get(const nlohmann::detail::iteration_proxy_value& i) -> decltype(i.value()) @@ -5357,7 +5357,7 @@ NLOHMANN_JSON_NAMESPACE_END // The Addition to the STD Namespace is required to add // Structured Bindings Support to the iteration_proxy_value class -// For further reference see https://blog.tartanllama.xyz/structured-bindings/ +// For further reference see https://blog.tartanjarvis.xyz/structured-bindings/ // And see https://github.com/nlohmann/json/pull/1391 namespace std { diff --git a/common/log.cpp b/common/log.cpp index 04c7c0ed10595..3b022ad9ff3c6 100644 --- a/common/log.cpp +++ b/common/log.cpp @@ -8,7 +8,7 @@ #include #include -int common_log_verbosity_thold = LOG_DEFAULT_LLAMA; +int common_log_verbosity_thold = LOG_DEFAULT_JARVIS; void common_log_set_verbosity_thold(int verbosity) { common_log_verbosity_thold = verbosity; diff --git a/common/log.h b/common/log.h index 66605cc69a314..37d7a0146f5d1 100644 --- a/common/log.h +++ b/common/log.h @@ -11,7 +11,7 @@ #endif #define LOG_DEFAULT_DEBUG 1 -#define LOG_DEFAULT_LLAMA 0 +#define LOG_DEFAULT_JARVIS 0 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower // set via common_log_set_verbosity() diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp index a9dfb67142528..c1576b136fccd 100644 --- a/common/ngram-cache.cpp +++ b/common/ngram-cache.cpp @@ -9,7 +9,7 @@ #include void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, - std::vector & inp, int nnew, bool print_progress) { + 
std::vector & inp, int nnew, bool print_progress) { const int64_t t_start_ms = ggml_time_ms(); const int64_t inp_size = inp.size(); @@ -21,7 +21,7 @@ void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, for (int64_t i = i_start; i < inp_size; ++i) { const int64_t ngram_start = i - ngram_size; common_ngram ngram(&inp[ngram_start], ngram_size); - const llama_token token = inp[i]; + const jarvis_token token = inp[i]; common_ngram_cache::iterator part_it = ngram_cache.find(ngram); if (part_it == ngram_cache.end()) { @@ -51,18 +51,18 @@ void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, } // Helper function to get a token from the combined, speculative sequence of inp and draft. -static llama_token get_token(const std::vector & inp, const std::vector & draft, const size_t i) { +static jarvis_token get_token(const std::vector & inp, const std::vector & draft, const size_t i) { return i < inp.size() ? inp[i] : draft[1 + i - inp.size()]; } // If sample size or percentage are below these thresholds the draft is aborted early: -constexpr int draft_min_sample_size_lax[LLAMA_NGRAM_MAX] = { 2, 2, 1, 1}; -constexpr int draft_min_percent_lax[LLAMA_NGRAM_MAX] = {66, 50, 50, 50}; -constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2}; -constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66}; +constexpr int draft_min_sample_size_lax[JARVIS_NGRAM_MAX] = { 2, 2, 1, 1}; +constexpr int draft_min_percent_lax[JARVIS_NGRAM_MAX] = {66, 50, 50, 50}; +constexpr int draft_min_sample_size_strict[JARVIS_NGRAM_MAX] = { 4, 3, 2, 2}; +constexpr int draft_min_percent_strict[JARVIS_NGRAM_MAX] = {75, 66, 66, 66}; // Helper function that tries to draft a token from only the static ngram cache: -static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) { +static jarvis_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) { common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); if (part_static_it == nc_static.end()) { return -1; @@ -71,10 +71,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram int max_count_static = 0; int sum_count_static = 0; - llama_token max_token = -1; + jarvis_token max_token = -1; - for (std::pair token_count_static : part_static) { - const llama_token token = token_count_static.first; + for (std::pair token_count_static : part_static) { + const jarvis_token token = token_count_static.first; const int32_t count_static = token_count_static.second; if (count_static > max_count_static) { @@ -84,21 +84,21 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram sum_count_static += count_static; } - if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) { + if (sum_count_static < draft_min_sample_size_lax[JARVIS_NGRAM_STATIC-1]) { return -1; } - if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) { + if (100*max_count_static < draft_min_percent_lax[JARVIS_NGRAM_STATIC-1]*sum_count_static) { return -1; } return max_token; } // Try to draft a token from primary cache (context/dynamic), validate with static cache: -static llama_token try_draft( +static jarvis_token try_draft( common_ngram_cache & nc_primary, const std::vector & ngrams_primary, common_ngram_cache_part & part_static, const int * min_sample_size, const int * min_percent) { - llama_token drafted_token = -1; + jarvis_token drafted_token = -1; for (int i = 
ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) { const common_ngram ngram_primary = ngrams_primary[i]; @@ -112,10 +112,10 @@ static llama_token try_draft( int max_count_primary = 0; int max_count_static = 0; int sum_count_primary = 0; - llama_token max_token = -1; + jarvis_token max_token = -1; - for (std::pair token_count_primary : part_primary) { - const llama_token token = token_count_primary.first; + for (std::pair token_count_primary : part_primary) { + const jarvis_token token = token_count_primary.first; common_ngram_cache_part::iterator token_count_static_it = part_static.find(token); @@ -143,22 +143,22 @@ static llama_token try_draft( } void common_ngram_cache_draft( - std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, + std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static ) { GGML_ASSERT(draft.size() == 1); const int inp_size = inp.size(); - if (inp_size < LLAMA_NGRAM_STATIC) { + if (inp_size < JARVIS_NGRAM_STATIC) { return; } while ((int) draft.size()-1 < n_draft) { - llama_token drafted_token = -1; + jarvis_token drafted_token = -1; - const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1; + const int ngram_start_static = inp_size-JARVIS_NGRAM_STATIC + draft.size()-1; common_ngram ngram_static; - for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) { + for (int j = ngram_start_static; j < ngram_start_static + JARVIS_NGRAM_STATIC; ++j) { ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j); } common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); @@ -207,12 +207,12 @@ void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & fil file_out.write(reinterpret_cast(&ngram), sizeof(common_ngram)); file_out.write(reinterpret_cast(&ntokens), sizeof(int32_t)); - for (std::pair item2 : token_counts) { - const llama_token token = item2.first; + for (std::pair item2 : token_counts) { + const jarvis_token token = item2.first; const int32_t count = item2.second; GGML_ASSERT(count > 0); - file_out.write(reinterpret_cast(&token), sizeof(llama_token)); + file_out.write(reinterpret_cast(&token), sizeof(jarvis_token)); file_out.write(reinterpret_cast(&count), sizeof(int32_t)); } } @@ -228,7 +228,7 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) { common_ngram ngram; int32_t ntokens; - llama_token token; + jarvis_token token; int32_t count; char * ngramc = reinterpret_cast(&ngram); @@ -243,7 +243,7 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) { for (int i = 0; i < ntokens; ++i) { GGML_ASSERT(!hashmap_file.eof()); - GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token))); + GGML_ASSERT(hashmap_file.read(tokenc, sizeof(jarvis_token))); GGML_ASSERT(!hashmap_file.eof()); GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t))); GGML_ASSERT(count > 0); @@ -268,8 +268,8 @@ void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ng continue; } - for (std::pair token_count : part) { - const llama_token token = token_count.first; + for (std::pair token_count : part) { + const jarvis_token token = token_count.first; const int32_t count = token_count.second; GGML_ASSERT(count > 0); diff --git a/common/ngram-cache.h b/common/ngram-cache.h index 09c2b0319f2c0..c3fb21c6ace95 100644 --- a/common/ngram-cache.h +++ b/common/ngram-cache.h @@ -1,34 +1,34 @@ #pragma once 
-#include "llama.h" +#include "jarvis.h" #include #include #include -#define LLAMA_NGRAM_MIN 1 -#define LLAMA_NGRAM_MAX 4 -#define LLAMA_NGRAM_STATIC 2 +#define JARVIS_NGRAM_MIN 1 +#define JARVIS_NGRAM_MAX 4 +#define JARVIS_NGRAM_STATIC 2 // Data structures to map n-grams to empirical token probabilities: struct common_ngram { - llama_token tokens[LLAMA_NGRAM_MAX]; + jarvis_token tokens[JARVIS_NGRAM_MAX]; common_ngram() { - for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) { tokens[i] = -1; } } - common_ngram(const llama_token * input, const int ngram_size) { - for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + common_ngram(const jarvis_token * input, const int ngram_size) { + for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) { tokens[i] = i < ngram_size ? input[i] : -1; } } bool operator==(const common_ngram & other) const { - for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) { if (tokens[i] != other.tokens[i]) { return false; } @@ -38,7 +38,7 @@ struct common_ngram { }; struct common_token_hash_function { - size_t operator()(const llama_token token) const { + size_t operator()(const jarvis_token token) const { // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ return token * 11400714819323198485llu; } @@ -47,7 +47,7 @@ struct common_token_hash_function { struct common_ngram_hash_function { size_t operator()(const common_ngram & ngram) const { size_t hash = common_token_hash_function{}(ngram.tokens[0]); - for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) { + for (int i = 1; i < JARVIS_NGRAM_MAX; ++i) { hash ^= common_token_hash_function{}(ngram.tokens[i]); } return hash; @@ -55,7 +55,7 @@ struct common_ngram_hash_function { }; // token -> number of times token has been seen -typedef std::unordered_map common_ngram_cache_part; +typedef std::unordered_map common_ngram_cache_part; // n-gram -> empirical distribution of following tokens typedef std::unordered_map common_ngram_cache; @@ -71,7 +71,7 @@ typedef std::unordered_map & inp_data, int nnew, bool print_progress); + common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp_data, int nnew, bool print_progress); // Try to draft tokens from ngram caches. // inp: the tokens generated so far. @@ -82,7 +82,7 @@ void common_ngram_cache_update( // nc_dynamic: ngram cache based on previous user generations. // nc_static: ngram cache generated from a large text corpus, used for validation. void common_ngram_cache_draft( - std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, + std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static); // Save an ngram cache to a file. 
diff --git a/common/sampling.cpp b/common/sampling.cpp index 48a9df8ba5b88..b6cad63334e7b 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -6,7 +6,7 @@ #include // the ring buffer works similarly to std::deque, but with a fixed capacity -// TODO: deduplicate with llama-impl.h +// TODO: deduplicate with jarvis-impl.h template struct ring_buffer { ring_buffer(size_t cap) : capacity(cap), data(cap) {} @@ -101,24 +101,24 @@ struct ring_buffer { struct common_sampler { common_sampler_params params; - struct llama_sampler * grmr; - struct llama_sampler * chain; + struct jarvis_sampler * grmr; + struct jarvis_sampler * chain; - ring_buffer prev; + ring_buffer prev; - std::vector cur; + std::vector cur; - llama_token_data_array cur_p; + jarvis_token_data_array cur_p; - void set_logits(struct llama_context * ctx, int idx) { - const auto * logits = llama_get_logits_ith(ctx, idx); + void set_logits(struct jarvis_context * ctx, int idx) { + const auto * logits = jarvis_get_logits_ith(ctx, idx); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int n_vocab = jarvis_n_vocab(jarvis_get_model(ctx)); cur.resize(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; + for (jarvis_token token_id = 0; token_id < n_vocab; token_id++) { + cur[token_id] = jarvis_token_data{token_id, logits[token_id], 0.0f}; } cur_p = { cur.data(), cur.size(), -1, false }; @@ -141,31 +141,31 @@ std::string common_sampler_params::print() const { return std::string(result); } -struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) { - llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); +struct common_sampler * common_sampler_init(const struct jarvis_model * model, const struct common_sampler_params & params) { + jarvis_sampler_chain_params lparams = jarvis_sampler_chain_default_params(); lparams.no_perf = params.no_perf; auto * result = new common_sampler { /* .params = */ params, - /* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"), - /* .chain = */ llama_sampler_chain_init(lparams), - /* .prev = */ ring_buffer(std::max(32, params.n_prev)), + /* .grmr = */ jarvis_sampler_init_grammar(model, params.grammar.c_str(), "root"), + /* .chain = */ jarvis_sampler_chain_init(lparams), + /* .prev = */ ring_buffer(std::max(32, params.n_prev)), /* .cur = */ {}, /* .cur_p = */ {}, }; - llama_sampler_chain_add(result->chain, - llama_sampler_init_logit_bias( - llama_n_vocab(model), + jarvis_sampler_chain_add(result->chain, + jarvis_sampler_init_logit_bias( + jarvis_n_vocab(model), params.logit_bias.size(), params.logit_bias.data())); - llama_sampler_chain_add(result->chain, - llama_sampler_init_penalties( - llama_n_vocab (model), - llama_token_eos(model), - llama_token_nl (model), + jarvis_sampler_chain_add(result->chain, + jarvis_sampler_init_penalties( + jarvis_n_vocab (model), + jarvis_token_eos(model), + jarvis_token_nl (model), params.penalty_last_n, params.penalty_repeat, params.penalty_freq, @@ -184,44 +184,44 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co c_breakers.push_back(str.c_str()); } - llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_dry 
(model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); } break; case COMMON_SAMPLER_TYPE_TOP_K: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_top_k (params.top_k)); break; case COMMON_SAMPLER_TYPE_TOP_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_top_p (params.top_p, params.min_keep)); break; case COMMON_SAMPLER_TYPE_MIN_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_min_p (params.min_p, params.min_keep)); break; case COMMON_SAMPLER_TYPE_XTC: - llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); break; case COMMON_SAMPLER_TYPE_TFS_Z: - llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_tail_free(params.tfs_z, params.min_keep)); break; case COMMON_SAMPLER_TYPE_TYPICAL_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_typical (params.typ_p, params.min_keep)); break; case COMMON_SAMPLER_TYPE_TEMPERATURE: - llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); break; case COMMON_SAMPLER_TYPE_INFILL: - llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_infill (model)); break; default: GGML_ASSERT(false && "unknown sampler type"); } } - llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_dist(params.seed)); } else if (params.mirostat == 1) { - llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp(params.temp)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_mirostat(jarvis_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); } else if (params.mirostat == 2) { - llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp(params.temp)); + jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); } else { GGML_ASSERT(false && "unknown mirostat version"); } @@ -231,53 +231,53 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co void common_sampler_free(struct 
common_sampler * gsmpl) { if (gsmpl) { - llama_sampler_free(gsmpl->grmr); + jarvis_sampler_free(gsmpl->grmr); - llama_sampler_free(gsmpl->chain); + jarvis_sampler_free(gsmpl->chain); delete gsmpl; } } -void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) { +void common_sampler_accept(struct common_sampler * gsmpl, jarvis_token token, bool accept_grammar) { if (accept_grammar) { - llama_sampler_accept(gsmpl->grmr, token); + jarvis_sampler_accept(gsmpl->grmr, token); } - llama_sampler_accept(gsmpl->chain, token); + jarvis_sampler_accept(gsmpl->chain, token); gsmpl->prev.push_back(token); } void common_sampler_reset(struct common_sampler * gsmpl) { - llama_sampler_reset(gsmpl->grmr); + jarvis_sampler_reset(gsmpl->grmr); - llama_sampler_reset(gsmpl->chain); + jarvis_sampler_reset(gsmpl->chain); } struct common_sampler * common_sampler_clone(common_sampler * gsmpl) { return new common_sampler { /* .params = */ gsmpl->params, - /* .grmr = */ llama_sampler_clone(gsmpl->grmr), - /* .chain = */ llama_sampler_clone(gsmpl->chain), + /* .grmr = */ jarvis_sampler_clone(gsmpl->grmr), + /* .chain = */ jarvis_sampler_clone(gsmpl->chain), /* .prev = */ gsmpl->prev, /* .cur = */ gsmpl->cur, /* .cur_p = */ gsmpl->cur_p, }; } -void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) { +void common_perf_print(const struct jarvis_context * ctx, const struct common_sampler * gsmpl) { // TODO: measure grammar performance if (gsmpl) { - llama_perf_sampler_print(gsmpl->chain); + jarvis_perf_sampler_print(gsmpl->chain); } if (ctx) { - llama_perf_context_print(ctx); + jarvis_perf_context_print(ctx); } } -llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) { +jarvis_token common_sampler_sample(struct common_sampler * gsmpl, struct jarvis_context * ctx, int idx, bool grammar_first) { gsmpl->set_logits(ctx, idx); auto & grmr = gsmpl->grmr; @@ -285,14 +285,14 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co auto & cur_p = gsmpl->cur_p; // initialized by set_logits if (grammar_first) { - llama_sampler_apply(grmr, &cur_p); + jarvis_sampler_apply(grmr, &cur_p); } - llama_sampler_apply(chain, &cur_p); + jarvis_sampler_apply(chain, &cur_p); GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration"); - const llama_token id = cur_p.data[cur_p.selected].id; + const jarvis_token id = cur_p.data[cur_p.selected].id; if (grammar_first) { return id; @@ -300,10 +300,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co // check if it the sampled token fits the grammar { - llama_token_data single_token_data = { id, 1.0f, 0.0f }; - llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false }; + jarvis_token_data single_token_data = { id, 1.0f, 0.0f }; + jarvis_token_data_array single_token_data_array = { &single_token_data, 1, -1, false }; - llama_sampler_apply(grmr, &single_token_data_array); + jarvis_sampler_apply(grmr, &single_token_data_array); const bool is_valid = single_token_data_array.data[0].logit != -INFINITY; if (is_valid) { @@ -315,8 +315,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain gsmpl->set_logits(ctx, idx); - llama_sampler_apply(grmr, &cur_p); - llama_sampler_apply(chain, &cur_p); + 
jarvis_sampler_apply(grmr, &cur_p); + jarvis_sampler_apply(chain, &cur_p); GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration"); @@ -324,31 +324,31 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co } uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { - return llama_sampler_get_seed(gsmpl->chain); + return jarvis_sampler_get_seed(gsmpl->chain); } // helpers -llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) { +jarvis_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) { return &gsmpl->cur_p; } -llama_token common_sampler_last(const struct common_sampler * gsmpl) { +jarvis_token common_sampler_last(const struct common_sampler * gsmpl) { return gsmpl->prev.rat(0); } std::string common_sampler_print(const struct common_sampler * gsmpl) { std::string result = "logits "; - for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) { - const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i); - result += std::string("-> ") + llama_sampler_name(smpl) + " "; + for (int i = 0; i < jarvis_sampler_chain_n(gsmpl->chain); i++) { + const auto * smpl = jarvis_sampler_chain_get(gsmpl->chain, i); + result += std::string("-> ") + jarvis_sampler_name(smpl) + " "; } return result; } -std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) { +std::string common_sampler_prev_str(common_sampler * gsmpl, jarvis_context * ctx_main, int n) { n = std::min(n, (int) gsmpl->prev.size()); if (n <= 0) { @@ -359,9 +359,9 @@ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_ result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab for (int i = n - 1; i >= 0; i--) { - const llama_token id = gsmpl->prev.rat(i); + const jarvis_token id = gsmpl->prev.rat(i); - GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen"); + GGML_ASSERT(id != JARVIS_TOKEN_NULL && "null token in the sampling history - should not happen"); result += common_token_to_piece(ctx_main, id); } diff --git a/common/sampling.h b/common/sampling.h index d37f25ad37c4a..9dc17ed24b69f 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -1,13 +1,13 @@ #pragma once -#include "llama.h" +#include "jarvis.h" #include "common.h" #include #include -// common_sampler extends llama_sampler with additional functionality: +// common_sampler extends jarvis_sampler with additional functionality: // // - grammar support // - custom sampler logic based on the parameters @@ -24,7 +24,7 @@ // grammar constraints are applied to the full vocabulary and the token is resampled. // // The common_sampler also maintains a container with the last accepted tokens. In the future, this can -// be moved into the core llama library. +// be moved into the core jarvis library. // // For convenience, the common_sampler also maintains a container with the current candidate tokens. // This can be used to access the probabilities of the rest of the non-sampled tokens. 
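The comments above describe the two-pass flow of common_sampler_sample(): by default the grammar is not applied to the full vocabulary; the chain samples first, the winner is checked against the grammar, and only on a violation is sampling repeated with the grammar applied before the chain. The following standalone sketch shows that control flow only; the two predicates are stand-ins, not the real jarvis_sampler API.

// sampler_flow_demo.cpp — illustrative control-flow sketch, not the real implementation.
#include <cstdint>
#include <cstdio>

typedef int32_t jarvis_token;  // stand-in for the real typedef

// Stand-in: pretend the chain proposes token 13 without constraints and token 8 with them.
static jarvis_token sample_with_chain(bool grammar_first) {
    return grammar_first ? 8 : 13;
}

// Stand-in: pretend the grammar only accepts even token ids.
static bool grammar_accepts(jarvis_token id) {
    return id % 2 == 0;
}

// Mirrors the fast/slow path split described in common/sampling.h:
//   fast path: sample without grammar work, then validate only the sampled token;
//   slow path: if that token violates the grammar, constrain the candidates first and resample.
static jarvis_token sample_token() {
    const jarvis_token id = sample_with_chain(/*grammar_first=*/false);
    if (grammar_accepts(id)) {
        return id;  // common case: no grammar evaluation over the full vocabulary
    }
    return sample_with_chain(/*grammar_first=*/true);
}

int main() {
    printf("sampled token id: %d\n", sample_token());  // prints 8 with the stand-ins above
    return 0;
}

The payoff noted in the header holds here as well: grammar constraints are only evaluated over the full vocabulary when the unconstrained sample actually violates them.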
@@ -34,19 +34,19 @@ struct common_sampler; -// llama_sampler API overloads +// jarvis_sampler API overloads -struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params); +struct common_sampler * common_sampler_init(const struct jarvis_model * model, const struct common_sampler_params & params); void common_sampler_free(struct common_sampler * gsmpl); // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar -void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar); +void common_sampler_accept(struct common_sampler * gsmpl, jarvis_token token, bool accept_grammar); void common_sampler_reset (struct common_sampler * gsmpl); struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl); // arguments can be nullptr to skip printing -void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl); +void common_perf_print(const struct jarvis_context * ctx, const struct common_sampler * gsmpl); // extended sampling implementation: // @@ -58,23 +58,23 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam // if grammar_first is true, the grammar is applied before the samplers (slower) // useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar // -llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); +jarvis_token common_sampler_sample(struct common_sampler * gsmpl, struct jarvis_context * ctx, int idx, bool grammar_first = false); uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl); // helpers // access the internal list of current candidate tokens -llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl); +jarvis_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl); // get the last accepted token -llama_token common_sampler_last(const struct common_sampler * gsmpl); +jarvis_token common_sampler_last(const struct common_sampler * gsmpl); // print the sampler chain into a string std::string common_sampler_print(const struct common_sampler * gsmpl); // get a string representation of the last accepted tokens -std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n); +std::string common_sampler_prev_str(common_sampler * gsmpl, jarvis_context * ctx, int n); char common_sampler_type_to_chr(enum common_sampler_type cnstr); std::string common_sampler_type_to_str(enum common_sampler_type cnstr); diff --git a/common/train.cpp b/common/train.cpp index 661ad8382eab6..c913f6dbd8521 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -34,7 +34,7 @@ struct train_state * init_train_state() { state->opt = new struct ggml_opt_context; state->opt->ctx = NULL; state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); - state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; + state->opt->params.graph_size = JARVIS_TRAIN_MAX_NODES; state->opt->loss_after = 0.0f; return state; @@ -213,7 +213,7 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6 } int64_t get_example_targets_batch( - struct llama_context * lctx, + struct jarvis_context * lctx, struct ggml_tensor * tokens_input, struct ggml_tensor * target_probs, int64_t example_id, @@ -221,7 +221,7 @@ int64_t get_example_targets_batch( const size_t * samples_begin, const size_t * samples_size, 
size_t samples_count, - const llama_token * train_data, + const jarvis_token * train_data, size_t n_train_data, bool separate_with_eos, bool separate_with_bos, @@ -241,8 +241,8 @@ int64_t get_example_targets_batch( int64_t used_samples = 0; ggml_set_f32(target_probs, 0.0f); - llama_token bos = llama_token_bos(llama_get_model(lctx)); - llama_token eos = llama_token_eos(llama_get_model(lctx)); + jarvis_token bos = jarvis_token_bos(jarvis_get_model(lctx)); + jarvis_token eos = jarvis_token_eos(jarvis_get_model(lctx)); // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); for (int k=0; k= sample_size && fill_with_next_samples) { if (!sample_separation_eos) { // insert eos token to separate samples @@ -281,7 +281,7 @@ int64_t get_example_targets_batch( } // note: no else-if here if (sample_offs < sample_size) { - token = clamp(train_data[sample_begin+sample_offs], 0, (llama_token) (n_vocab - 1)); + token = clamp(train_data[sample_begin+sample_offs], 0, (jarvis_token) (n_vocab - 1)); ++sample_offs; } ggml_set_f32_nd(target_probs, token, (int) i, (int) k, 0, +1.0f); @@ -712,12 +712,12 @@ void save_train_state_gguf(struct gguf_context * fctx, struct train_state * trai } -struct llama_file { +struct jarvis_file { // use FILE * so we don't have to re-open the file to mmap FILE * fp; size_t size; - llama_file(const char * fname, const char * mode) { + jarvis_file(const char * fname, const char * mode) { fp = std::fopen(fname, mode); if (fp == NULL) { size = 0; @@ -788,7 +788,7 @@ struct llama_file { write_raw(&val, sizeof(val)); } - ~llama_file() { + ~jarvis_file() { if (fp) { std::fclose(fp); } @@ -823,16 +823,16 @@ static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nu } size_t tokenize_file( - struct llama_context * lctx, + struct jarvis_context * lctx, const char * filename, const std::string & sample_start, bool include_sample_start, bool overlapping_samples, unsigned context_length, - std::vector & out_tokens, + std::vector & out_tokens, std::vector & out_samples_begin, std::vector & out_samples_size) { - struct llama_file f(filename, "rb"); + struct jarvis_file f(filename, "rb"); if (f.size == 0) { out_tokens.clear(); @@ -844,7 +844,7 @@ size_t tokenize_file( } // account for possible leading whitespace that will be added by tokenizer - // e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12] + // e.g. '\t' will be tokenized by jarvis spm tokenizer to [29871, 12] const int n_max_tokens_overhead = 1; std::vector buf; @@ -862,8 +862,8 @@ size_t tokenize_file( // tokenize all data at once out_tokens.resize(buf.size() + n_max_tokens_overhead); - int n_tokens = llama_tokenize( - llama_get_model(lctx), + int n_tokens = jarvis_tokenize( + jarvis_get_model(lctx), buf.data(), (int) buf.size(), out_tokens.data(), @@ -871,8 +871,8 @@ size_t tokenize_file( false, false); if (n_tokens < 0) { out_tokens.resize(-n_tokens); - n_tokens = llama_tokenize( - llama_get_model(lctx), + n_tokens = jarvis_tokenize( + jarvis_get_model(lctx), buf.data(), (int) buf.size(), out_tokens.data(), @@ -915,7 +915,7 @@ size_t tokenize_file( out_samples_size.resize(out_samples_begin.size(), 0); std::vector buf_sample; - std::vector tok_sample; + std::vector tok_sample; const size_t sample_begin_offset = (include_sample_start ? 
0 : sample_start.size()); size_t found_too_big_sample = 0; @@ -925,11 +925,11 @@ size_t tokenize_file( size_t found_max_sample_size = 0; size_t max_token_text_size = 0; - int n_vocab = llama_n_vocab(llama_get_model(lctx)); - for (llama_token token=0; token < n_vocab; ++token) { + int n_vocab = jarvis_n_vocab(jarvis_get_model(lctx)); + for (jarvis_token token=0; token < n_vocab; ++token) { max_token_text_size = std::max( max_token_text_size, - strlen(llama_token_get_text(llama_get_model(lctx), token))); + strlen(jarvis_token_get_text(jarvis_get_model(lctx), token))); } // upper bound of context byte length. @@ -957,7 +957,7 @@ size_t tokenize_file( } if (sample_size > 0) { - // llama_tokenize expects zero terminated string, + // jarvis_tokenize expects zero terminated string, // copy sample into buffer and zero terminate it. buf_sample.resize(sample_size); memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size); @@ -966,7 +966,7 @@ size_t tokenize_file( // tokenize the sample tok_sample.resize(buf_sample.size() + n_max_tokens_overhead); - int n_tokens = llama_tokenize(llama_get_model(lctx), + int n_tokens = jarvis_tokenize(jarvis_get_model(lctx), buf_sample.data(), (int) buf_sample.size(), tok_sample.data(), @@ -974,7 +974,7 @@ size_t tokenize_file( false, false); if (n_tokens < 0) { tok_sample.resize(-n_tokens); - n_tokens = llama_tokenize(llama_get_model(lctx), + n_tokens = jarvis_tokenize(jarvis_get_model(lctx), buf_sample.data(), (int) buf_sample.size(), tok_sample.data(), @@ -1365,7 +1365,7 @@ bool consume_common_train_arg( *invalid_param = true; return true; } - if (llama_supports_gpu_offload()) { + if (jarvis_supports_gpu_offload()) { params->n_gpu_layers = std::stoi(argv[i]); } else { fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); diff --git a/common/train.h b/common/train.h index 263d940c04298..82c4a24c5d3ee 100644 --- a/common/train.h +++ b/common/train.h @@ -7,9 +7,9 @@ #include #include "ggml.h" -#include "llama.h" +#include "jarvis.h" -#define LLAMA_TRAIN_MAX_NODES 16384 +#define JARVIS_TRAIN_MAX_NODES 16384 typedef std::string mt19937_state; @@ -92,9 +92,9 @@ struct train_opt_callback_data { struct train_state * train; save_train_files_callback save_cb; void * save_data; - struct llama_context * lctx; + struct jarvis_context * lctx; int last_save_iter; - llama_token * tokens_data; + jarvis_token * tokens_data; size_t tokens_size; size_t * samples_begin; size_t * samples_size; @@ -146,18 +146,18 @@ void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6 void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3); size_t tokenize_file( - struct llama_context * lctx, + struct jarvis_context * lctx, const char * filename, const std::string & sample_start, bool include_sample_start, bool overlapping_samples, unsigned context_length, - std::vector & out_tokens, + std::vector & out_tokens, std::vector & out_samples_begin, std::vector & out_samples_size); int64_t get_example_targets_batch( - struct llama_context * lctx, + struct jarvis_context * lctx, struct ggml_tensor * tokens_input, struct ggml_tensor * target_probs, int64_t example_id, @@ -165,7 +165,7 @@ int64_t get_example_targets_batch( const size_t * samples_begin, const size_t * samples_size, size_t samples_count, - const llama_token * train_data, + const jarvis_token * train_data, size_t n_train_data, bool separate_with_eos, bool separate_with_bos, diff --git a/convert_hf_to_gguf.py 
b/convert_hf_to_gguf.py index a34dabe235a34..bc25aab73df1f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -49,7 +49,7 @@ class Model: _model_classes: dict[str, type[Model]] = {} dir_model: Path - ftype: gguf.LlamaFileType + ftype: gguf.JarvisFileType fname_out: Path is_big_endian: bool endianess: gguf.GGUFEndian @@ -69,7 +69,7 @@ class Model: # subclasses should define this! model_arch: gguf.MODEL_ARCH - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False, + def __init__(self, dir_model: Path, ftype: gguf.JarvisFileType, fname_out: Path, is_big_endian: bool = False, use_temp_file: bool = False, eager: bool = False, metadata_override: Path | None = None, model_name: str | None = None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): @@ -96,15 +96,15 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type - if self.ftype == gguf.LlamaFileType.GUESSED: + if self.ftype == gguf.JarvisFileType.GUESSED: # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. _, first_tensor = next(self.get_tensors()) if first_tensor.dtype == torch.float16: logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})") - self.ftype = gguf.LlamaFileType.MOSTLY_F16 + self.ftype = gguf.JarvisFileType.MOSTLY_F16 else: logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})") - self.ftype = gguf.LlamaFileType.MOSTLY_BF16 + self.ftype = gguf.JarvisFileType.MOSTLY_BF16 # Configure GGUF Writer self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, @@ -308,7 +308,7 @@ def prepare_tensors(self): if n_dims <= 1 or new_name.endswith("_norm.weight"): data_qtype = gguf.GGMLQuantizationType.F32 - # Conditions should closely match those in llama_model_quantize_internal in llama.cpp + # Conditions should closely match those in jarvis_model_quantize_internal in jarvis.cpp # Some tensor types are always in float32 if data_qtype is False and ( any( @@ -337,25 +337,25 @@ def prepare_tensors(self): ) ): if self.ftype in ( - gguf.LlamaFileType.MOSTLY_TQ1_0, - gguf.LlamaFileType.MOSTLY_TQ2_0, + gguf.JarvisFileType.MOSTLY_TQ1_0, + gguf.JarvisFileType.MOSTLY_TQ2_0, ): # TODO: use Q4_K and Q6_K data_qtype = gguf.GGMLQuantizationType.F16 # No override (data_qtype is False), or wants to be quantized (data_qtype is True) if isinstance(data_qtype, bool): - if self.ftype == gguf.LlamaFileType.ALL_F32: + if self.ftype == gguf.JarvisFileType.ALL_F32: data_qtype = gguf.GGMLQuantizationType.F32 - elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: + elif self.ftype == gguf.JarvisFileType.MOSTLY_F16: data_qtype = gguf.GGMLQuantizationType.F16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + elif self.ftype == gguf.JarvisFileType.MOSTLY_BF16: data_qtype = gguf.GGMLQuantizationType.BF16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: + elif self.ftype == gguf.JarvisFileType.MOSTLY_Q8_0: data_qtype = gguf.GGMLQuantizationType.Q8_0 - elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0: + elif self.ftype == gguf.JarvisFileType.MOSTLY_TQ1_0: data_qtype = gguf.GGMLQuantizationType.TQ1_0 - elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0: + elif 
self.ftype == gguf.JarvisFileType.MOSTLY_TQ2_0: data_qtype = gguf.GGMLQuantizationType.TQ2_0 else: raise ValueError(f"Unknown file type: {self.ftype.name}") @@ -394,7 +394,7 @@ def prepare_metadata(self, vocab_only: bool): if self.metadata.size_label is None and total_params > 0: self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) - # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' + # Extract the encoding scheme from the file type name. e.g. 'gguf.JarvisFileType.MOSTLY_Q8_0' --> 'Q8_0' output_type: str = self.ftype.name.partition("_")[2] # Filename Output @@ -537,13 +537,13 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: # NOTE: this function is generated by convert_hf_to_gguf_update.py # do not modify it manually! - # ref: https://github.com/ggerganov/llama.cpp/pull/6920 + # ref: https://github.com/ggerganov/jarvis.cpp/pull/6920 # Marker: Start get_vocab_base_pre def get_vocab_base_pre(self, tokenizer) -> str: # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that # is specific for the BPE pre-tokenizer used by the model # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can - # use in llama.cpp to implement the same pre-tokenizer + # use in jarvis.cpp to implement the same pre-tokenizer chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' @@ -559,8 +559,8 @@ def get_vocab_base_pre(self, tokenizer) -> str: # or pull the latest version of the model from Huggingface # don't edit the hashes manually! 
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": - # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B - res = "llama-bpe" + # ref: https://huggingface.co/meta-jarvis/Meta-Jarvis-3-8B + res = "jarvis-bpe" if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base res = "deepseek-llm" @@ -616,7 +616,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de res = "jina-v2-de" if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": - # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct + # ref: https://huggingface.co/abacusai/Smaug-Jarvis-3-70B-Instruct res = "smaug-bpe" if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360": # ref: https://huggingface.co/LumiOpen/Poro-34B-chat @@ -666,7 +666,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") logger.warning("** - the pre-tokenization config has changed upstream") logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") - logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + logger.warning("** ref: https://github.com/ggerganov/jarvis.cpp/pull/6920") logger.warning("**") logger.warning(f"** chkhsh: {chkhsh}") logger.warning("**************************************************************************************") @@ -746,7 +746,7 @@ def _set_vocab_qwen(self): def _set_vocab_sentencepiece(self, add_to_gguf=True): tokens, scores, toktypes = self._create_vocab_sentencepiece() - self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_model("jarvis") self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) @@ -835,8 +835,8 @@ def _create_vocab_sentencepiece(self): return tokens, scores, toktypes - def _set_vocab_llama_hf(self): - vocab = gguf.LlamaHfVocab(self.dir_model) + def _set_vocab_jarvis_hf(self): + vocab = gguf.JarvisHfVocab(self.dir_model) tokens = [] scores = [] toktypes = [] @@ -848,7 +848,7 @@ def _set_vocab_llama_hf(self): assert len(tokens) == vocab.vocab_size - self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_model("jarvis") self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) @@ -857,7 +857,7 @@ def _set_vocab_llama_hf(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int): + def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "jarvis-spm"], vocab_size: int): tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf" logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") vocab_reader = gguf.GGUFReader(tokenizer_path, "r") @@ -875,7 +875,7 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab assert field # token list self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) - if model_name == "llama-spm": + if model_name == "jarvis-spm": field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES) assert field # token scores 
self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) @@ -884,7 +884,7 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab assert field # token types self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) - if model_name != "llama-spm": + if model_name != "jarvis-spm": field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES) assert field # token merges self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) @@ -1226,7 +1226,7 @@ def set_vocab(self): tokens.append(token_text) toktypes.append(toktype) - self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_model("jarvis") self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) @@ -1515,21 +1515,21 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed norms: {norms}") -@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") -class LlamaModel(Model): - model_arch = gguf.MODEL_ARCH.LLAMA +@Model.register("JARVISForCausalLM", "JarvisForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +class JarvisModel(Model): + model_arch = gguf.MODEL_ARCH.JARVIS def set_vocab(self): try: self._set_vocab_sentencepiece() except FileNotFoundError: try: - self._set_vocab_llama_hf() + self._set_vocab_jarvis_hf() except (FileNotFoundError, TypeError): - # Llama 3 + # Jarvis 3 self._set_vocab_gpt2() - # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) + # Apply to CodeJarvis only (and ignore for Jarvis 3 with a vocab size of 128256) if self.hparams.get("vocab_size", 32000) == 32016: special_vocab = gguf.SpecialVocab( self.dir_model, load_merges=False, @@ -1583,9 +1583,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_kv_head = self.hparams.get("num_key_value_heads") if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) + data_torch = JarvisModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head) # process the experts separately if name.find("block_sparse_moe.experts") != -1: @@ -1625,7 +1625,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": + if rope_scaling.get("rope_type", '').lower() == "jarvis3": base = self.hparams.get("rope_theta", 10000.0) dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) @@ -1793,7 +1793,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose # original implementation expects (n_expert, n_ff, n_embd) for all experts weights - # But llama.cpp moe graph works differently + # But jarvis.cpp moe graph works differently # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor 
exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} @@ -1842,7 +1842,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def set_vocab(self): - self._set_vocab_llama_hf() + self._set_vocab_jarvis_hf() def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: @@ -2188,7 +2188,7 @@ def set_vocab(self): if foken_data.get("special"): toktypes[token_id] = SentencePieceTokenTypes.CONTROL - self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_model("jarvis") self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) @@ -2456,7 +2456,7 @@ def set_vocab(self): if foken_data.get("special"): toktypes[token_id] = SentencePieceTokenTypes.CONTROL - self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_model("jarvis") self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) @@ -2468,7 +2468,7 @@ def set_vocab(self): if chat_eos_token_id is not None: # For the chat model, we replace the eos with '<|im_end|>'. # TODO: this is a hack, should be fixed - # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 + # https://github.com/ggerganov/jarvis.cpp/pull/6745#issuecomment-2067687048 special_vocab.special_token_ids["eos"] = chat_eos_token_id logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" " in chat mode so that the conversation can end normally.") @@ -2505,8 +2505,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1] # The model weights of q and k equire additional reshape. - q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) - k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) + q = JarvisModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) + k = JarvisModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) v = v.reshape((-1, v.shape[-1])) return [ @@ -2769,7 +2769,7 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # lm_head is not used in jarvis.cpp, while autoawq will include this tensor in model # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") @@ -2816,7 +2816,7 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # lm_head is not used in jarvis.cpp, while autoawq will include this tensor in model # To prevent errors, skip loading lm_head.weight. 
if name == "lm_head.weight": logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") @@ -2894,7 +2894,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(intermediate_size) self.gguf_writer.add_file_type(self.ftype) - # required by llama.cpp, unused + # required by jarvis.cpp, unused self.gguf_writer.add_head_count(0) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: @@ -3024,7 +3024,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_clamp_kqv(clip_qkv) # Same as super class, but permuting q_proj, k_proj - # Copied from: LlamaModel + # Copied from: JarvisModel def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -3032,9 +3032,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_kv_head = self.hparams.get("num_key_value_heads") if name.endswith("q_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) + data_torch = JarvisModel.permute(data_torch, n_head, n_head) if name.endswith("k_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head) return [(self.map_tensor_name(name), data_torch)] @@ -3174,12 +3174,12 @@ def __init__(self, *args, **kwargs): assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int) - # Uses the tokenizer from meta-llama/Llama-2-7b-hf + # Uses the tokenizer from meta-jarvis/Jarvis-2-7b-hf def set_vocab(self): try: self._set_vocab_sentencepiece() except FileNotFoundError: - self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"]) + self._set_vocab_builtin("jarvis-spm", self.hparams["vocab_size"]) def set_gguf_parameters(self): n_embd = self._n_embd @@ -3300,7 +3300,7 @@ def set_vocab(self): toktypes[token_id] = token_type scores[token_id] = token_score - self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_model("jarvis") self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) @@ -3322,9 +3322,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_kv_head = self.hparams.get("num_key_value_heads") if name.endswith("q_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) + data_torch = JarvisModel.permute(data_torch, n_head, n_head) if name.endswith("k_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head) # process the experts separately if name.find("block_sparse_moe.experts") != -1: @@ -3882,7 +3882,7 @@ def set_vocab_chatglm3(self): scores.append(score) toktypes.append(toktype) - self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_model("jarvis") # glm3 needs prefix and suffix formatted as: # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>" self.gguf_writer.add_tokenizer_pre("chatglm-spm") @@ -4087,7 +4087,7 @@ def set_gguf_parameters(self): def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": + if rope_scaling.get("rope_type", '').lower() == "jarvis3": base = 
self.hparams.get("rope_theta", 10000.0) dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) @@ -4116,12 +4116,12 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: @Model.register("GraniteForCausalLM") -class GraniteModel(LlamaModel): +class GraniteModel(JarvisModel): """Conversion for IBM's GraniteForCausalLM""" model_arch = gguf.MODEL_ARCH.GRANITE def set_gguf_parameters(self): - """Granite uses standard llama parameters with the following differences: + """Granite uses standard jarvis parameters with the following differences: - No head_dim support - New multiplier params: @@ -4196,9 +4196,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter hidden_dim = self.hparams.get("hidden_size") if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) + data_torch = JarvisModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head) if name.endswith(("q_norm.weight", "q_norm.bias")): data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim) if name.endswith(("k_norm.weight", "k_norm.bias")): @@ -4379,14 +4379,14 @@ def main() -> None: logger.error(f'Error: {args.model} is not a directory') sys.exit(1) - ftype_map: dict[str, gguf.LlamaFileType] = { - "f32": gguf.LlamaFileType.ALL_F32, - "f16": gguf.LlamaFileType.MOSTLY_F16, - "bf16": gguf.LlamaFileType.MOSTLY_BF16, - "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, - "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0, - "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0, - "auto": gguf.LlamaFileType.GUESSED, + ftype_map: dict[str, gguf.JarvisFileType] = { + "f32": gguf.JarvisFileType.ALL_F32, + "f16": gguf.JarvisFileType.MOSTLY_F16, + "bf16": gguf.JarvisFileType.MOSTLY_BF16, + "q8_0": gguf.JarvisFileType.MOSTLY_Q8_0, + "tq1_0": gguf.JarvisFileType.MOSTLY_TQ1_0, + "tq2_0": gguf.JarvisFileType.MOSTLY_TQ2_0, + "auto": gguf.JarvisFileType.GUESSED, } is_split = args.split_max_tensors > 0 or args.split_max_size != "0" diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 28cd02e5a7f66..b4324a3cd1922 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -5,10 +5,10 @@ # generates the get_vocab_base_pre() function for convert_hf_to_gguf.py # # This is necessary in order to analyze the type of pre-tokenizer used by the model and -# provide the necessary information to llama.cpp via the GGUF header in order to implement +# provide the necessary information to jarvis.cpp via the GGUF header in order to implement # the same pre-tokenizer. 
# -# ref: https://github.com/ggerganov/llama.cpp/pull/6920 +# ref: https://github.com/ggerganov/jarvis.cpp/pull/6920 # # Instructions: # @@ -18,9 +18,9 @@ # python3 convert_hf_to_gguf_update.py # # - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py -# - Update llama.cpp with the new pre-tokenizer if necessary +# - Update jarvis.cpp with the new pre-tokenizer if necessary # -# TODO: generate tokenizer tests for llama.cpp +# TODO: generate tokenizer tests for jarvis.cpp # import logging @@ -65,8 +65,8 @@ class TOKENIZER_TYPE(IntEnum): # TODO: add models here, base models preferred models = [ - {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", }, - {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", }, + {"name": "jarvis-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-jarvis/Jarvis-2-7b-hf", }, + {"name": "jarvis-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-jarvis/Meta-Jarvis-3-8B", }, {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", }, {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", }, {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", }, @@ -86,7 +86,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM! {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", }, {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, - {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, + {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Jarvis-3-70B-Instruct", }, {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", }, {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B @@ -215,7 +215,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that # is specific for the BPE pre-tokenizer used by the model # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can - # use in llama.cpp to implement the same pre-tokenizer + # use in jarvis.cpp to implement the same pre-tokenizer chktxt = {repr(CHK_TXT)} @@ -239,7 +239,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") logger.warning("** - the pre-tokenization config has changed upstream") logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") - logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + logger.warning("** ref: https://github.com/ggerganov/jarvis.cpp/pull/6920") logger.warning("**") logger.warning(f"** chkhsh: {{chkhsh}}") 
logger.warning("**************************************************************************************") @@ -311,7 +311,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: "3333333", "33333333", "333333333", - "Cửa Việt", # llama-bpe fails on this + "Cửa Việt", # jarvis-bpe fails on this " discards", CHK_TXT, ] diff --git a/convert_llama_ggml_to_gguf.py b/convert_jarvis_ggml_to_gguf.py old mode 100755 new mode 100644 similarity index 96% rename from convert_llama_ggml_to_gguf.py rename to convert_jarvis_ggml_to_gguf.py index 29b14e98dd237..788a595cc8549 --- a/convert_llama_ggml_to_gguf.py +++ b/convert_jarvis_ggml_to_gguf.py @@ -223,13 +223,13 @@ def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param" logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}') self.n_kv_head = n_kv_head - self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer) + self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.JARVIS, ggml_model.hyperparameters.n_layer) def save(self): logger.info('* Preparing to save GGUF file') gguf_writer = gguf.GGUFWriter( self.cfg.output, - gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], + gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.JARVIS], use_temp_file = False) self.add_params(gguf_writer) self.add_vocab(gguf_writer) @@ -286,7 +286,7 @@ def add_params(self, gguf_writer): def add_vocab(self, gguf_writer): hp = self.model.hyperparameters - gguf_writer.add_tokenizer_model('llama') + gguf_writer.add_tokenizer_model('jarvis') gguf_writer.add_tokenizer_pre('default') tokens = [] scores = [] @@ -358,7 +358,7 @@ def add_tensors(self, gguf_writer): def handle_metadata(cfg, hp): - import examples.convert_legacy_llama as convert + import examples.convert_legacy_jarvis as convert assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory' hf_config_path = cfg.model_metadata_dir / "config.json" @@ -396,11 +396,11 @@ def handle_args(): parser.add_argument('--desc', help = 'Set model description') parser.add_argument('--gqa', type = int, default = 1, - help = 'grouped-query attention factor (use 8 for LLaMA2 70B)') + help = 'grouped-query attention factor (use 8 for JARVIS2 70B)') parser.add_argument('--eps', default = '5.0e-06', - help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2') + help = 'RMS norm eps: Use 1e-6 for JARVIS1 and OpenJARVIS, use 1e-5 for JARVIS2') parser.add_argument('--context-length', '-c', type=int, default = 2048, - help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096') + help = 'Default max context length: JARVIS1 is typically 2048, JARVIS2 is typically 4096') parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory') parser.add_argument("--vocab-dir", type=Path, @@ -417,7 +417,7 @@ def main(): logger.info(f'* Using config: {cfg}') logger.warning('=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===') if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'): - logger.info('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".') + logger.info('- Note: If converting JARVIS2, specifying "--eps 1e-5" is required. 
70B models also need "--gqa 8".') data = np.memmap(cfg.input, mode = 'r') model = GGMLModel() logger.info('* Scanning GGML input file') diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index bc68f68afb768..f0eabf62bf2a3 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -271,12 +271,12 @@ def parse_args() -> argparse.Namespace: args = parse_args() logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) - ftype_map: dict[str, gguf.LlamaFileType] = { - "f32": gguf.LlamaFileType.ALL_F32, - "f16": gguf.LlamaFileType.MOSTLY_F16, - "bf16": gguf.LlamaFileType.MOSTLY_BF16, - "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, - "auto": gguf.LlamaFileType.GUESSED, + ftype_map: dict[str, gguf.JarvisFileType] = { + "f32": gguf.JarvisFileType.ALL_F32, + "f16": gguf.JarvisFileType.MOSTLY_F16, + "bf16": gguf.JarvisFileType.MOSTLY_BF16, + "q8_0": gguf.JarvisFileType.MOSTLY_Q8_0, + "auto": gguf.JarvisFileType.GUESSED, } ftype = ftype_map[args.outtype] @@ -372,9 +372,9 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: dest = list(super().modify_tensors(data_torch, name, bid)) # some archs may have the same tensor for lm_head and output (tie word embeddings) - # in this case, adapters targeting lm_head will fail when using llama-export-lora + # in this case, adapters targeting lm_head will fail when using jarvis-export-lora # therefore, we ignore them for now - # see: https://github.com/ggerganov/llama.cpp/issues/9065 + # see: https://github.com/ggerganov/jarvis.cpp/issues/9065 if name == "lm_head.weight" and len(dest) == 0: raise ValueError("lm_head is present in adapter, but is ignored in base model") for dest_name, dest_data in dest: diff --git a/docs/android.md b/docs/android.md index 320b62240382f..e4a071396921d 100644 --- a/docs/android.md +++ b/docs/android.md @@ -5,14 +5,14 @@ [Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid. -With Termux, you can install and run `llama.cpp` as if the environment were Linux. Once in the Termux shell: +With Termux, you can install and run `jarvis.cpp` as if the environment were Linux. Once in the Termux shell: ``` $ apt update && apt upgrade -y $ apt install git cmake ``` -Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake. +Then, follow the [build instructions](https://github.com/ggerganov/jarvis.cpp/blob/master/docs/build.md), specifically for CMake. Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance: @@ -20,22 +20,22 @@ Once the binaries are built, download your model of choice (e.g., from Hugging F $ curl -L {model-url} -o ~/{model}.gguf ``` -Then, if you are not already in the repo directory, `cd` into `llama.cpp` and: +Then, if you are not already in the repo directory, `cd` into `jarvis.cpp` and: ``` -$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}" +$ ./build/bin/jarvis-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}" ``` -Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. 
Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal. +Here, we show `jarvis-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal. To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone: https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4 ## Cross-compile using Android NDK -It's possible to build `llama.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.) +It's possible to build `jarvis.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.) -Once you're ready and have cloned `llama.cpp`, invoke the following in the project directory: +Once you're ready and have cloned `jarvis.cpp`, invoke the following in the project directory: ``` $ cmake \ @@ -45,15 +45,15 @@ $ cmake \ -DCMAKE_C_FLAGS="-march=armv8.7a" \ -DCMAKE_CXX_FLAGS="-march=armv8.7a" \ -DGGML_OPENMP=OFF \ - -DGGML_LLAMAFILE=OFF \ + -DGGML_JARVISFILE=OFF \ -B build-android ``` Notes: - While later versions of Android NDK ship with OpenMP, it must still be installed by CMake as a dependency, which is not supported at this time - - `llamafile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/llamafile/issues/325) + - `jarvisfile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/jarvisfile/issues/325) -The above command should configure `llama.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `llama.cpp` includes runtime checks for available CPU features it can use. +The above command should configure `jarvis.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `jarvis.cpp` includes runtime checks for available CPU features it can use. Feel free to adjust the Android ABI for your target. Once the project is configured: @@ -65,17 +65,17 @@ $ cmake --install build-android --prefix {install-dir} --config Release After installing, go ahead and download the model of your choice to your host system. 
Then: ``` -$ adb shell "mkdir /data/local/tmp/llama.cpp" -$ adb push {install-dir} /data/local/tmp/llama.cpp/ -$ adb push {model}.gguf /data/local/tmp/llama.cpp/ +$ adb shell "mkdir /data/local/tmp/jarvis.cpp" +$ adb push {install-dir} /data/local/tmp/jarvis.cpp/ +$ adb push {model}.gguf /data/local/tmp/jarvis.cpp/ $ adb shell ``` In the `adb shell`: ``` -$ cd /data/local/tmp/llama.cpp -$ LD_LIBRARY_PATH=lib ./bin/llama-simple -m {model}.gguf -c {context-size} -p "{your-prompt}" +$ cd /data/local/tmp/jarvis.cpp +$ LD_LIBRARY_PATH=lib ./bin/jarvis-simple -m {model}.gguf -c {context-size} -p "{your-prompt}" ``` That's it! diff --git a/docs/backend/BLIS.md b/docs/backend/BLIS.md index 35d06bd0f303d..7e9048135a2de 100644 --- a/docs/backend/BLIS.md +++ b/docs/backend/BLIS.md @@ -25,13 +25,13 @@ sudo make install We recommend using openmp since it's easier to modify the cores being used. -### llama.cpp compilation +### jarvis.cpp compilation Makefile: ```bash make GGML_BLIS=1 -j -# make GGML_BLIS=1 llama-benchmark-matmult +# make GGML_BLIS=1 jarvis-benchmark-matmult ``` CMake: @@ -43,7 +43,7 @@ cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME .. make -j ``` -### llama.cpp execution +### jarvis.cpp execution According to the BLIS documentation, we could set the following environment variables to modify the behavior of openmp: diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md index 6bdd9d2daab90..ee92299473de0 100644 --- a/docs/backend/CANN.md +++ b/docs/backend/CANN.md @@ -1,4 +1,4 @@ -# llama.cpp for CANN +# jarvis.cpp for CANN - [Background](#background) - [News](#news) @@ -17,9 +17,9 @@ **CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform. -**Llama.cpp + CANN** +**Jarvis.cpp + CANN** -The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly. +The jarvis.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly. ## News @@ -78,11 +78,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi | GritLM-7B | √ | √ | √ | | internlm2_5-7b-chat | √ | √ | √ | | koala-7B-HF | √ | √ | √ | -| Llama-2-7b-chat-hf | √ | √ | √ | -| Llama-3-Smaug-8B | √ | √ | √ | -| Llama2-Chinese-7b-Chat | √ | √ | √ | -| Llama3-8B | √ | √ | √ | -| Llama3-8b-chinese | √ | √ | √ | +| Jarvis-2-7b-chat-hf | √ | √ | √ | +| Jarvis-3-Smaug-8B | √ | √ | √ | +| Jarvis2-Chinese-7b-Chat | √ | √ | √ | +| Jarvis3-8B | √ | √ | √ | +| Jarvis3-8b-chinese | √ | √ | √ | | mamba-130m-hf | √ | √ | √ | | Mistral-7B-Instruct-v0.2 | √ | √ | √ | | Mixtral-8x7B-Instruct-v0.1 | x | √ | √ | @@ -120,9 +120,9 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi ## Docker ### Build Images -You can get a image with llama.cpp in one command. +You can get a image with jarvis.cpp in one command. 
```sh -docker build -t llama-cpp-cann -f .devops/llama-cli-cann.Dockerfile . +docker build -t jarvis-cpp-cann -f .devops/jarvis-cli-cann.Dockerfile . ``` ### Run container @@ -133,7 +133,7 @@ npu-smi info # Select the cards that you want to use, make sure these cards are not used by someone. # Following using cards of device0. -docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:" +docker run --name jarviscpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it jarvis-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:" ``` *Notes:* @@ -208,7 +208,7 @@ docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager Upon a successful installation, CANN is enabled for the available ascend devices. -### II. Build llama.cpp +### II. Build jarvis.cpp ```sh cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release @@ -242,13 +242,13 @@ cmake --build build --config release - Use device 0: ```sh - ./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 + ./build/bin/jarvis-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 ``` - Use multiple devices: ```sh - ./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer + ./build/bin/jarvis-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer ``` ### **GitHub contribution**: diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index ea34182e41a4c..541fe043b23cb 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -1,4 +1,4 @@ -# llama.cpp for SYCL +# jarvis.cpp for SYCL - [Background](#background) - [Recommended Release](#recommended-release) @@ -24,9 +24,9 @@ - **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs. - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets. -### Llama.cpp + SYCL +### Jarvis.cpp + SYCL -The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it also supports other vendor GPUs: Nvidia and AMD. +The jarvis.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it also supports other vendor GPUs: Nvidia and AMD. 
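Both the CANN and SYCL docs above drive device selection through the same `jarvis-cli` flags: `-sm none -mg N` pins a run to one device, while `-sm layer` splits the model across all visible devices. For scripted comparisons of the two modes, a small wrapper can assemble that command line. The sketch below is a minimal illustration, not part of this patch; it assumes a `jarvis-cli` binary at the relative path used in the examples above, and the model path in the usage comment is hypothetical.

```python
import subprocess

def run_jarvis_cli(model_path, prompt, n_predict=400, n_gpu_layers=33, main_gpu=None):
    """Run jarvis-cli pinned to one device (-sm none -mg N) or split by layer (-sm layer)."""
    cmd = ["./build/bin/jarvis-cli", "-m", model_path, "-p", prompt,
           "-n", str(n_predict), "-e", "-ngl", str(n_gpu_layers)]
    if main_gpu is None:
        cmd += ["-sm", "layer"]                       # spread layers over all visible devices
    else:
        cmd += ["-sm", "none", "-mg", str(main_gpu)]  # use a single device
    return subprocess.run(cmd, capture_output=True, text=True, check=True).stdout

# Example usage (hypothetical model path):
# print(run_jarvis_cli("models/jarvis-2-7b.Q4_0.gguf",
#                      "Building a website can be done in 10 simple steps:", main_gpu=0))
```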
## Recommended Release @@ -36,7 +36,7 @@ The following release is verified with good quality: |Commit ID|Tag|Release|Verified Platform| |-|-|-|-| -|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1| +|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[jarvis-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/jarvis.cpp/releases/download/b3038/jarvis-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1| ## News @@ -46,7 +46,7 @@ The following release is verified with good quality: - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs. - 2024.5 - - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770. + - Performance is increased: 34 -> 37 tokens/s of jarvis-2-7b.Q4_0 on Arc770. - Arch Linux is verified successfully. - 2024.4 @@ -54,8 +54,8 @@ The following release is verified with good quality: - 2024.3 - Release binary files of Windows. - - A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd). - - New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437). + - A blog is published: **Run LLM on all Intel GPUs Using jarvis.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-jarvis-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-jarvis-cpp-fd2e2dcbd9bd). + - New base line is ready: [tag b2437](https://github.com/ggerganov/jarvis.cpp/tree/b2437). - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing. - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE. - Support detecting all GPUs with level-zero and same top **Max compute units**. @@ -100,9 +100,9 @@ SYCL backend supports Intel GPU Family: *Notes:* - **Memory** - - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`. + - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/jarvis-cli`. - - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU. + - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *jarvis-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU. - **Execution Unit (EU)** - If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use. @@ -130,14 +130,14 @@ The docker build option is currently limited to *intel GPU* targets. ### Build image ```sh # Using FP16 -docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile . +docker build -t jarvis-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/jarvis-cli-intel.Dockerfile . ``` *Notes*: To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command. -You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative. +You can also use the `.devops/jarvis-server-intel.Dockerfile`, which builds the *"server"* alternative. ### Run container @@ -145,7 +145,7 @@ You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the * # First, find all the DRI cards ls -la /dev/dri # Then, pick the card that you want to use (here for e.g. /dev/dri/card1). 
-docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 +docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 jarvis-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 ``` *Notes:* @@ -276,7 +276,7 @@ For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]: [hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9] ``` -### II. Build llama.cpp +### II. Build jarvis.cpp #### Intel GPU @@ -309,7 +309,7 @@ export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR -# Build LLAMA with Nvidia BLAS acceleration through SYCL +# Build JARVIS with Nvidia BLAS acceleration through SYCL # Option 1: Use FP32 (recommended for better performance in most cases) cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx @@ -329,7 +329,7 @@ export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR -# Build LLAMA with rocBLAS acceleration through SYCL +# Build JARVIS with rocBLAS acceleration through SYCL ## AMD # Use FP32, FP16 is not supported @@ -344,7 +344,7 @@ cmake --build build --config Release -j -v #### Retrieve and prepare model -You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. +You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [jarvis-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Jarvis-2-7B-GGUF/blob/main/jarvis-2-7b.Q4_0.gguf) model as example. ##### Check device @@ -359,7 +359,7 @@ source /opt/intel/oneapi/setvars.sh Similar to the native `sycl-ls`, available SYCL devices can be queried as follow: ```sh -./build/bin/llama-ls-sycl-device +./build/bin/jarvis-ls-sycl-device ``` This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following: @@ -390,12 +390,12 @@ Choose one of following methods to run. - Use device 0: ```sh -./examples/sycl/run-llama2.sh 0 +./examples/sycl/run-jarvis2.sh 0 ``` - Use multiple devices: ```sh -./examples/sycl/run-llama2.sh +./examples/sycl/run-jarvis2.sh ``` 2. 
Command line @@ -418,13 +418,13 @@ Examples: - Use device 0: ```sh -ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 +ZES_ENABLE_SYSMAN=1 ./build/bin/jarvis-cli -m models/jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 ``` - Use multiple devices: ```sh -ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer +ZES_ENABLE_SYSMAN=1 ./build/bin/jarvis-cli -m models/jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer ``` *Notes:* @@ -492,7 +492,7 @@ a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/) -### II. Build llama.cpp +### II. Build jarvis.cpp You could download the release package for Windows directly, which including binary files and depended oneAPI dll files. @@ -506,7 +506,7 @@ Choose one of following methods to build from source code. 2. CMake -On the oneAPI command line window, step into the llama.cpp main directory and run the following: +On the oneAPI command line window, step into the jarvis.cpp main directory and run the following: ``` @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force @@ -524,34 +524,34 @@ Or, use CMake presets to build: ```sh cmake --preset x64-windows-sycl-release -cmake --build build-x64-windows-sycl-release -j --target llama-cli +cmake --build build-x64-windows-sycl-release -j --target jarvis-cli cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release -cmake --build build-x64-windows-sycl-release -j --target llama-cli +cmake --build build-x64-windows-sycl-release -j --target jarvis-cli cmake --preset x64-windows-sycl-debug -cmake --build build-x64-windows-sycl-debug -j --target llama-cli +cmake --build build-x64-windows-sycl-debug -j --target jarvis-cli ``` 3. Visual Studio -You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project. +You can use Visual Studio to open jarvis.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project. *Notes:* -- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`. +- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target jarvis-cli`. ### III. Run the inference #### Retrieve and prepare model -You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. +You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [jarvis-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Jarvis-2-7B-GGUF/blob/main/jarvis-2-7b.Q4_0.gguf) model as example. ##### Check device 1. 
Enable oneAPI running environment -On the oneAPI command line window, run the following and step into the llama.cpp directory: +On the oneAPI command line window, run the following and step into the jarvis.cpp directory: ``` "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 ``` @@ -561,7 +561,7 @@ On the oneAPI command line window, run the following and step into the llama.cpp Similar to the native `sycl-ls`, available SYCL devices can be queried as follow: ``` -build\bin\llama-ls-sycl-device.exe +build\bin\jarvis-ls-sycl-device.exe ``` This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following: @@ -589,7 +589,7 @@ Choose one of following methods to run. 1. Script ``` -examples\sycl\win-run-llama2.bat +examples\sycl\win-run-jarvis2.bat ``` 2. Command line @@ -613,13 +613,13 @@ Examples: - Use device 0: ``` -build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0 +build\bin\jarvis-cli.exe -m models\jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0 ``` - Use multiple devices: ``` -build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer +build\bin\jarvis-cli.exe -m models\jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer ``` @@ -682,13 +682,13 @@ use 1 SYCL GPUs: [0] with Max compute units:512 ``` Otherwise, please double-check the GPU driver installation steps. -- Can I report Ollama issue on Intel GPU to llama.cpp SYCL backend? +- Can I report an Ollama issue on Intel GPU to the jarvis.cpp SYCL backend? - No. We can't support Ollama issue directly, because we aren't familiar with Ollama. + No. We can't support Ollama issues directly, because we aren't familiar with Ollama. - Sugguest reproducing on llama.cpp and report similar issue to llama.cpp. We will surpport it. + We suggest reproducing the issue on jarvis.cpp and reporting a similar issue to jarvis.cpp. We will support it. - It's same for other projects including llama.cpp SYCL backend. + The same applies to other projects that include the jarvis.cpp SYCL backend. - Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer` diff --git a/docs/build.md b/docs/build.md index 4e362ebc78fa3..5fd1e051a1a01 100644 --- a/docs/build.md +++ b/docs/build.md @@ -1,13 +1,13 @@ -# Build llama.cpp locally +# Build jarvis.cpp locally **To get the Code:** ```bash -git clone https://github.com/ggerganov/llama.cpp -cd llama.cpp +git clone https://github.com/ggerganov/jarvis.cpp +cd jarvis.cpp ``` -In order to build llama.cpp you have four different options. +In order to build jarvis.cpp, you have four different options. - Using `make`: - On Linux or MacOS: @@ -21,17 +21,17 @@ In order to build llama.cpp you have four different options. 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). 2. Extract `w64devkit` on your pc. 3. Run `w64devkit.exe`. - 4. Use the `cd` command to reach the `llama.cpp` folder. + 4. Use the `cd` command to reach the `jarvis.cpp` folder. 5. 
From here you can run: ```bash make ``` - Notes: - - For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`. + - For `Q4_0_4_4` quantization type build, add the `GGML_NO_JARVISFILE=1` flag. For example, use `make GGML_NO_JARVISFILE=1`. - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel. - For faster repeated compilation, install [ccache](https://ccache.dev/). - - For debug builds, run `make LLAMA_DEBUG=1` + - For debug builds, run `make JARVIS_DEBUG=1` - Using `CMake`: @@ -42,7 +42,7 @@ In order to build llama.cpp you have four different options. **Notes**: - - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`. + - For `Q4_0_4_4` quantization type build, add the `-DGGML_JARVISFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_JARVISFILE=OFF`. - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel. - For faster repeated compilation, install [ccache](https://ccache.dev/). - For debug builds, there are two cases: @@ -118,7 +118,7 @@ This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS i 4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`. 5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`. 6. Run `w64devkit.exe`. - 7. Use the `cd` command to reach the `llama.cpp` folder. + 7. Use the `cd` command to reach the `jarvis.cpp` folder. 8. From here you can run: ```bash @@ -140,13 +140,13 @@ Check [BLIS.md](./backend/BLIS.md) for more information. SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators. -llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU). +jarvis.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU). -For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md). +For detailed info, please refer to [jarvis.cpp for SYCL](./backend/SYCL.md). ### Intel oneMKL -Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md). +Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [jarvis.cpp for SYCL](./backend/SYCL.md). - Using manual oneAPI installation: By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. 
Otherwise please install oneAPI and follow the below steps: @@ -159,7 +159,7 @@ Building through oneAPI compilers will make avx_vnni instruction set available f - Using oneAPI docker image: If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above. -Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information. +Check [Optimizing and Running JARVIS2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-jarvis2-on-intel-cpu.html) for more information. ### CUDA @@ -300,7 +300,7 @@ Libs: -lvulkan-1 EOF ``` -Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`. +Switch into the `jarvis.cpp` directory and run `make GGML_VULKAN=1`. #### MSYS2 Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies. @@ -311,7 +311,7 @@ Install [MSYS2](https://www.msys2.org/) and then run the following commands in a mingw-w64-ucrt-x86_64-vulkan-devel \ mingw-w64-ucrt-x86_64-shaderc ``` -Switch into `llama.cpp` directory and build using CMake. +Switch into `jarvis.cpp` directory and build using CMake. ```sh cmake -B build -DGGML_VULKAN=ON cmake --build build --config Release @@ -323,10 +323,10 @@ You don't need to install Vulkan SDK. It will be installed inside the container. ```sh # Build the image -docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile . +docker build -t jarvis-cpp-vulkan -f .devops/jarvis-cli-vulkan.Dockerfile . # Then, use it: -docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 +docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 jarvis-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 ``` **Without docker**: @@ -348,13 +348,13 @@ Alternatively your package manager might be able to provide the appropriate libr For example for Ubuntu 22.04 you can install `libvulkan-dev` instead. For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages. -Then, build llama.cpp using the cmake command below: +Then, build jarvis.cpp using the cmake command below: ```bash cmake -B build -DGGML_VULKAN=1 cmake --build build --config Release # Test the output binary (with "-ngl 33" to offload all layers to GPU) -./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4 +./bin/jarvis-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4 # You should see in the output, ggml_vulkan detected your GPU. For example: # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32 @@ -367,7 +367,7 @@ For more information about Ascend NPU in [Ascend Community](https://www.hiascend Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann) -Go to `llama.cpp` directory and build using CMake. +Go to `jarvis.cpp` directory and build using CMake. 
```bash cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release cmake --build build --config release @@ -375,15 +375,15 @@ cmake --build build --config release You can test with: -`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32` +`./build/jarvis-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32` -If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`: +If the following info is output on screen, you are using `jarvis.cpp by CANN backend`: ```bash llm_load_tensors: CANN buffer size = 13313.00 MiB -llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB +jarvis_new_context_with_model: CANN compute buffer size = 1260.81 MiB ``` -For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md). +For detailed info, such as supported models/devices and CANN installation, please refer to [jarvis.cpp for CANN](./backend/CANN.md). ### Android @@ -391,6 +391,6 @@ To read documentation for how to build on Android, [click here](./android.md) ### Arm CPU optimized mulmat kernels -Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats. +Jarvis.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats. -To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`). +To support `Q4_0_4_4`, you must build with `GGML_NO_JARVISFILE=1` (`make`) or `-DGGML_JARVISFILE=OFF` (`cmake`). diff --git a/docs/development/HOWTO-add-model.md b/docs/development/HOWTO-add-model.md index 04c5ccbbe60c3..d72c70b30e5e0 100644 --- a/docs/development/HOWTO-add-model.md +++ b/docs/development/HOWTO-add-model.md @@ -1,9 +1,9 @@ -# Add a new model architecture to `llama.cpp` +# Add a new model architecture to `jarvis.cpp` Adding a model requires few steps: 1. Convert the model to GGUF -2. Define the model architecture in `llama.cpp` +2. Define the model architecture in `jarvis.cpp` 3. Build the GGML graph implementation After following these steps, you can open PR. 
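For step 1, a new Hugging Face architecture usually only needs a converter class registered in convert_hf_to_gguf.py, following the same pattern as `GraniteModel` earlier in this patch. The skeleton below is a minimal sketch, not part of this patch: `MyNewForCausalLM`, `MyNewModel` and the `intermediate_size` lookup are illustrative assumptions, and the snippet presumes the imports and base classes already defined in convert_hf_to_gguf.py.

```python
# Hypothetical converter skeleton for convert_hf_to_gguf.py (illustrative only).
# Assumes Model, JarvisModel, gguf, Tensor and Iterable are already in scope there.

@Model.register("MyNewForCausalLM")          # HF architecture string from config.json (made up here)
class MyNewModel(JarvisModel):               # reuse the JARVIS tensor layout and graph
    model_arch = gguf.MODEL_ARCH.JARVIS      # a genuinely new layout needs its own enum (step 2)

    def set_gguf_parameters(self):
        super().set_gguf_parameters()        # standard hparams handled by the base class
        # write any extra, architecture-specific metadata
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # map HF tensor names to GGUF names; permute q/k like JarvisModel where needed
        return [(self.map_tensor_name(name), data_torch)]
```

Reusing `gguf.MODEL_ARCH.JARVIS` is only valid when the tensor layout actually matches; otherwise the `llm_arch` additions described in step 2 below are also required.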
@@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M ### 1. Convert the model to GGUF This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library. -Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format). +Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_jarvis.py](/examples/convert_legacy_jarvis.py) (for `jarvis/jarvis2` models in `.pth` format). The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors. @@ -81,26 +81,26 @@ Depending on the model configuration, tokenizer, code and tensors layout, you wi NOTE: Tensor names must end with `.weight` suffix, that is the convention and several tools like `quantize` expect this to proceed the weights. -### 2. Define the model architecture in `llama.cpp` +### 2. Define the model architecture in `jarvis.cpp` -The model params and tensors layout must be defined in `llama.cpp`: +The model params and tensors layout must be defined in `jarvis.cpp`: 1. Define a new `llm_arch` 2. Define the tensors layout in `LLM_TENSOR_NAMES` 3. Add any non standard metadata in `llm_load_hparams` 4. Create the tensors for inference in `llm_load_tensors` -5. If the model has a RoPE operation, add the rope type in `llama_rope_type` +5. If the model has a RoPE operation, add the rope type in `jarvis_rope_type` NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions. ### 3. Build the GGML graph implementation -This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`. +This is the funniest part: you have to provide the inference graph implementation of the new model architecture in `jarvis_build_graph`. -Have a look at existing implementation like `build_llama`, `build_dbrx` or `build_bert`. +Have a look at existing implementations like `build_jarvis`, `build_dbrx` or `build_bert`. When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR. -Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/). +Note: to debug the inference graph, you can use [jarvis-eval-callback](/examples/eval-callback/). 
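The reversed dimension order noted above is a common source of confusion when mapping checkpoints. The toy check below is illustrative only (not part of this patch, with arbitrary small sizes); it restates the expert-tensor comment from convert_hf_to_gguf.py earlier in this patch in runnable form.

```python
import torch

# PyTorch shape order, as used by the HF checkpoints: (n_expert, n_ff, n_embd).
# Sizes here are arbitrary placeholders.
n_expert, n_ff, n_embd = 4, 16, 8
experts = torch.zeros(n_expert, n_ff, n_embd)

# ggml lists dimensions in ne[] starting from the fastest-varying one, so the
# same tensor is described as {n_embd, n_ff, n_expert}: the PyTorch shape reversed.
ggml_ne = tuple(reversed(experts.shape))
assert ggml_ne == (n_embd, n_ff, n_expert)
```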
## GGUF specification @@ -108,12 +108,12 @@ https://github.com/ggerganov/ggml/blob/master/docs/gguf.md ## Resources -- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268 -- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009 -- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283 -- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406 -- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423 -- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204 -- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491 -- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515 -- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948 +- YaRN RoPE scaling https://github.com/ggerganov/jarvis.cpp/pull/2268 +- support Baichuan serial models https://github.com/ggerganov/jarvis.cpp/pull/3009 +- support attention bias https://github.com/ggerganov/jarvis.cpp/pull/4283 +- Mixtral support https://github.com/ggerganov/jarvis.cpp/pull/4406 +- BERT embeddings https://github.com/ggerganov/jarvis.cpp/pull/5423 +- Grok-1 support https://github.com/ggerganov/jarvis.cpp/pull/6204 +- Command R Plus support https://github.com/ggerganov/jarvis.cpp/pull/6491 +- support arch DBRX https://github.com/ggerganov/jarvis.cpp/pull/6515 +- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/jarvis.cpp/discussions/2948 diff --git a/docs/development/debugging-tests.md b/docs/development/debugging-tests.md index 18407f688f9db..38b6767622c85 100644 --- a/docs/development/debugging-tests.md +++ b/docs/development/debugging-tests.md @@ -51,7 +51,7 @@ rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug Setup and trigger a build under debug mode. You may adapt the arguments as needed, but in this case these are sane defaults. ```bash -cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON .. +cmake -DCMAKE_BUILD_TYPE=Debug -DJARVIS_CUDA=1 -DJARVIS_FATAL_WARNINGS=ON .. make -j ``` @@ -71,12 +71,12 @@ This may return output similar to below (focusing on key lines to pay attention ```bash ... -1: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf" +1: Test command: ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf" 1: Working Directory: . Labels: main - Test #1: test-tokenizer-0-llama-spm + Test #1: test-tokenizer-0-jarvis-spm ... -4: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-falcon.gguf" +4: Test command: ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-falcon.gguf" 4: Working Directory: . 
Labels: main Test #4: test-tokenizer-0-falcon @@ -86,8 +86,8 @@ Labels: main #### Step 4: Identify Test Command for Debugging So for test #1 above we can tell these two pieces of relevant information: -* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0` -* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf` +* Test Binary: `~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0` +* Test GGUF Model: `~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf` #### Step 5: Run GDB on test command @@ -100,5 +100,5 @@ gdb --args ${Test Binary} ${Test GGUF Model} Example: ```bash -gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf" +gdb --args ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf" ``` diff --git a/docs/development/llama-star/idea-arch.key b/docs/development/jarvis-star/idea-arch.key old mode 100755 new mode 100644 similarity index 100% rename from docs/development/llama-star/idea-arch.key rename to docs/development/jarvis-star/idea-arch.key diff --git a/docs/development/llama-star/idea-arch.pdf b/docs/development/jarvis-star/idea-arch.pdf similarity index 100% rename from docs/development/llama-star/idea-arch.pdf rename to docs/development/jarvis-star/idea-arch.pdf diff --git a/docs/development/token_generation_performance_tips.md b/docs/development/token_generation_performance_tips.md index 41b7232c976b3..62aeb11789fdb 100644 --- a/docs/development/token_generation_performance_tips.md +++ b/docs/development/token_generation_performance_tips.md @@ -1,23 +1,23 @@ # Token generation performance troubleshooting ## Verifying that the model is running on the GPU with CUDA -Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: +Make sure you compiled jarvis with the correct env variables according to [this guide](/docs/build.md#cuda), so that jarvis accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running jarvis, you may configure `N` to be very large, and jarvis will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: ```shell -./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " +./jarvis-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " ``` -When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines: +When running jarvis, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines: ```shell -llama_model_load_internal: [cublas] offloading 60 layers to GPU -llama_model_load_internal: [cublas] offloading output layer to GPU -llama_model_load_internal: [cublas] total VRAM used: 17223 MB +jarvis_model_load_internal: [cublas] offloading 60 layers to GPU +jarvis_model_load_internal: [cublas] offloading output layer to GPU +jarvis_model_load_internal: [cublas] total VRAM used: 17223 MB ... rest of inference ``` If you see these lines, then the GPU is being used. 
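A quick way to check for these lines without reading the whole log is to filter the diagnostic output, for example (a sketch; the exact log prefixes may differ between builds):

```bash
# Run a single-token generation and keep only the offload/VRAM diagnostics
./jarvis-cli -m "path/to/model.gguf" -ngl 200000 -p "test" -n 1 2>&1 | grep -iE "offloading|VRAM"
```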
## Verifying that the CPU is not oversaturated -llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down. +jarvis accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down. # Example of runtime flags effect on inference speed benchmark These runs were tested on the following machine: @@ -27,7 +27,7 @@ RAM: 32GB Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML) -Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]` +Run command: `./jarvis-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]` Result: diff --git a/docs/docker.md b/docs/docker.md index 8d90e6ded5738..4015000245953 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -2,26 +2,26 @@ ## Prerequisites * Docker must be installed and running on your system. -* Create a folder to store big models & intermediate files (ex. /llama/models) +* Create a folder to store big models & intermediate files (ex. /jarvis/models) ## Images We have three Docker images available for this project: -1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`) -2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`) -3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`) +1. `ghcr.io/ggerganov/jarvis.cpp:full`: This image includes both the main executable file and the tools to convert JARVIS models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`) +2. `ghcr.io/ggerganov/jarvis.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`) +3. `ghcr.io/ggerganov/jarvis.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`) Additionally, there the following images, similar to the above: -- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. 
(platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) -- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) -- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) -- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/jarvis.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/jarvis.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/jarvis.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/jarvis.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggerganov/jarvis.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggerganov/jarvis.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggerganov/jarvis.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/jarvis.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/jarvis.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`) The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now). @@ -32,25 +32,25 @@ The easiest way to download the models, convert them to ggml and optimize them i Replace `/path/to/models` below with the actual path where you downloaded the models. ```bash -docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B +docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:full --all-in-one "/models/" 7B ``` On completion, you are ready to play! 
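Before moving on to the run commands below, it can be worth confirming that the all-in-one step actually produced the quantized file those commands expect (the file name here follows the examples in this document; your output names may differ):

```bash
# The run examples below reference the 4-bit file produced by the all-in-one step
ls -lh /path/to/models/7B/ggml-model-q4_0.gguf
```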
```bash -docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 +docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 ``` or with a light image: ```bash -docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 +docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 ``` or with a server image: ```bash -docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 +docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/jarvis.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 ``` ## Docker With CUDA @@ -60,9 +60,9 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia ## Building Docker locally ```bash -docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile . -docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile . -docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile . +docker build -t local/jarvis.cpp:full-cuda -f .devops/full-cuda.Dockerfile . +docker build -t local/jarvis.cpp:light-cuda -f .devops/jarvis-cli-cuda.Dockerfile . +docker build -t local/jarvis.cpp:server-cuda -f .devops/jarvis-server-cuda.Dockerfile . ``` You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture. @@ -74,18 +74,18 @@ The defaults are: The resulting images, are essentially the same as the non-CUDA images: -1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. -2. `local/llama.cpp:light-cuda`: This image only includes the main executable file. -3. `local/llama.cpp:server-cuda`: This image only includes the server executable file. +1. `local/jarvis.cpp:full-cuda`: This image includes both the main executable file and the tools to convert JARVIS models into ggml and convert into 4-bit quantization. +2. `local/jarvis.cpp:light-cuda`: This image only includes the main executable file. +3. `local/jarvis.cpp:server-cuda`: This image only includes the server executable file. ## Usage After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag. 
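Before launching the containers shown below, you may want to confirm that Docker can see the GPU at all. This is a generic nvidia-container-toolkit check, not something defined by this project, and the CUDA image tag is only an example; pick one that matches your driver:

```bash
# Should print the nvidia-smi table from inside a container if the toolkit is set up correctly
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
```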
```bash -docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 -docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 -docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 +docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 ``` ## Docker With MUSA @@ -95,9 +95,9 @@ Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/ ## Building Docker locally ```bash -docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile . -docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile . -docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile . +docker build -t local/jarvis.cpp:full-musa -f .devops/full-musa.Dockerfile . +docker build -t local/jarvis.cpp:light-musa -f .devops/jarvis-cli-musa.Dockerfile . +docker build -t local/jarvis.cpp:server-musa -f .devops/jarvis-server-musa.Dockerfile . ``` You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture. @@ -108,16 +108,16 @@ The defaults are: The resulting images, are essentially the same as the non-MUSA images: -1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. -2. `local/llama.cpp:light-musa`: This image only includes the main executable file. -3. `local/llama.cpp:server-musa`: This image only includes the server executable file. +1. `local/jarvis.cpp:full-musa`: This image includes both the main executable file and the tools to convert JARVIS models into ggml and convert into 4-bit quantization. +2. `local/jarvis.cpp:light-musa`: This image only includes the main executable file. +3. `local/jarvis.cpp:server-musa`: This image only includes the server executable file. ## Usage After building locally, Usage is similar to the non-MUSA examples, but you'll need to set `mthreads` as default Docker runtime. This can be done by executing `(cd /usr/bin/musa && sudo ./docker setup $PWD)` and verifying the changes by executing `docker info | grep mthreads` on the host machine. You will also want to use the `--n-gpu-layers` flag. 
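Similarly, before running the MUSA containers below you can double-check that the `mthreads` runtime was registered and set as the default, following the verification command already mentioned above (the `--format` query is an assumption about your Docker version):

```bash
docker info | grep mthreads                   # the runtime should be listed
docker info --format '{{.DefaultRuntime}}'    # ideally prints: mthreads
```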
```bash -docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 -docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 -docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 +docker run -v /path/to/models:/models local/jarvis.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run -v /path/to/models:/models local/jarvis.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run -v /path/to/models:/models local/jarvis.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 ``` diff --git a/docs/install.md b/docs/install.md index 10a568506835b..e5baee4a7f495 100644 --- a/docs/install.md +++ b/docs/install.md @@ -1,39 +1,39 @@ -# Install pre-built version of llama.cpp +# Install pre-built version of jarvis.cpp ## Homebrew On Mac and Linux, the homebrew package manager can be used via ```sh -brew install llama.cpp +brew install jarvis.cpp ``` -The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668 +The formula is automatically updated with new `jarvis.cpp` releases. More info: https://github.com/ggerganov/jarvis.cpp/discussions/7668 ## Nix On Mac and Linux, the Nix package manager can be used via ```sh -nix profile install nixpkgs#llama-cpp +nix profile install nixpkgs#jarvis-cpp ``` For flake enabled installs. Or ```sh -nix-env --file '' --install --attr llama-cpp +nix-env --file '' --install --attr jarvis-cpp ``` For non-flake enabled installs. -This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164). +This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/jarvis-cpp/package.nix#L164). ## Flox -On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via +On Mac and Linux, Flox can be used to install jarvis.cpp within a Flox environment via ```sh -flox install llama-cpp +flox install jarvis-cpp ``` -Flox follows the nixpkgs build of llama.cpp. +Flox follows the nixpkgs build of jarvis.cpp. 
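Whichever package manager you choose, a simple sanity check that the installed binaries are reachable might look like this; the binary names follow the renaming in this PR, and the exact `--version` output format is not guaranteed:

```bash
# Confirm the CLI and server binaries are on PATH and report a version
command -v jarvis-cli jarvis-server
jarvis-cli --version
```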
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index ead630661c8e2..5755f879a45d0 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -13,10 +13,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() add_subdirectory(cvector-generator) - add_subdirectory(baby-llama) + add_subdirectory(baby-jarvis) add_subdirectory(batched-bench) add_subdirectory(batched) - add_subdirectory(convert-llama2c-to-ggml) + add_subdirectory(convert-jarvis2c-to-ggml) add_subdirectory(embedding) add_subdirectory(eval-callback) add_subdirectory(export-lora) @@ -27,7 +27,7 @@ else() add_subdirectory(gritlm) add_subdirectory(imatrix) add_subdirectory(infill) - add_subdirectory(llama-bench) + add_subdirectory(jarvis-bench) add_subdirectory(llava) add_subdirectory(lookahead) add_subdirectory(lookup) @@ -41,7 +41,7 @@ else() if (GGML_RPC) add_subdirectory(rpc) endif() - if (LLAMA_BUILD_SERVER) + if (JARVIS_BUILD_SERVER) add_subdirectory(server) endif() if (GGML_SYCL) diff --git a/examples/Miku.sh b/examples/Miku.sh index 0f6c8c8787107..1725dbf0099aa 100755 --- a/examples/Miku.sh +++ b/examples/Miku.sh @@ -2,7 +2,7 @@ set -e AI_NAME="${AI_NAME:-Miku}" -MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}" +MODEL="${MODEL:-./models/jarvis-2-7b-chat.ggmlv3.q4_K_M.bin}" USER_NAME="${USER_NAME:-Anon}" # Uncomment and adjust to the number of CPU cores you want to use. @@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then GEN_OPTIONS+=(--threads "$N_THREAD") fi -./llama-cli "${GEN_OPTIONS[@]}" \ +./jarvis-cli "${GEN_OPTIONS[@]}" \ --model "$MODEL" \ --in-prefix " " \ --in-suffix "${AI_NAME}:" \ diff --git a/examples/baby-jarvis/CMakeLists.txt b/examples/baby-jarvis/CMakeLists.txt new file mode 100644 index 0000000000000..a0703600b3d7a --- /dev/null +++ b/examples/baby-jarvis/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET jarvis-baby-jarvis) +add_executable(${TARGET} baby-jarvis.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-jarvis/baby-jarvis.cpp similarity index 96% rename from examples/baby-llama/baby-llama.cpp rename to examples/baby-jarvis/baby-jarvis.cpp index 3ce91070b4ed7..03f22bac8461c 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-jarvis/baby-jarvis.cpp @@ -11,8 +11,8 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -#ifdef LLAMA_DEFAULT_RMS_EPS -constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; +#ifdef JARVIS_DEFAULT_RMS_EPS +constexpr float rms_norm_eps = JARVIS_DEFAULT_RMS_EPS; #else constexpr float rms_norm_eps = 5e-6f; #endif @@ -71,7 +71,7 @@ static struct ggml_tensor * randomize_tensor( return tensor; } -struct llama_hparams { +struct jarvis_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; // this is provided as user input? 
uint32_t n_embd = 4096; @@ -80,17 +80,17 @@ struct llama_hparams { uint32_t n_layer = 32; uint32_t n_rot = 64; - bool operator!=(const llama_hparams & other) const { - return memcmp(this, &other, sizeof(llama_hparams)); + bool operator!=(const jarvis_hparams & other) const { + return memcmp(this, &other, sizeof(jarvis_hparams)); } }; -static uint32_t get_n_ff(const struct llama_hparams* hparams) { +static uint32_t get_n_ff(const struct jarvis_hparams* hparams) { const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; return n_ff; } -struct llama_hparams_lora { +struct jarvis_hparams_lora { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; // this is provided as user input? uint32_t n_embd = 4096; @@ -100,12 +100,12 @@ struct llama_hparams_lora { uint32_t n_rot = 64; uint32_t n_lora = 64; - bool operator!=(const llama_hparams_lora & other) const { - return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0; + bool operator!=(const jarvis_hparams_lora & other) const { + return memcmp(this, &other, sizeof(jarvis_hparams_lora)) != 0; } }; -struct llama_layer { +struct jarvis_layer { // normalization struct ggml_tensor * attention_norm; @@ -124,7 +124,7 @@ struct llama_layer { struct ggml_tensor * w3; }; -struct llama_layer_lora { +struct jarvis_layer_lora { // normalization struct ggml_tensor * attention_norm; @@ -148,34 +148,34 @@ struct llama_layer_lora { }; -struct llama_kv_cache { +struct jarvis_kv_cache { struct ggml_context * ctx = NULL; struct ggml_tensor * k; struct ggml_tensor * v; - // llama_ctx_buffer buf; + // jarvis_ctx_buffer buf; int n; // number of tokens currently in the cache }; -struct llama_model { +struct jarvis_model { struct ggml_context * ctx = NULL; - llama_hparams hparams; + jarvis_hparams hparams; struct ggml_tensor * tok_embeddings; struct ggml_tensor * norm; struct ggml_tensor * output; - std::vector layers; + std::vector layers; }; -struct llama_model_lora { +struct jarvis_model_lora { struct ggml_context * ctx = NULL; - llama_hparams_lora hparams; + jarvis_hparams_lora hparams; struct ggml_tensor * tok_embeddings; @@ -183,10 +183,10 @@ struct llama_model_lora { struct ggml_tensor * outputa; struct ggml_tensor * outputb; - std::vector layers; + std::vector layers; }; -static void init_model(struct llama_model * model) { +static void init_model(struct jarvis_model * model) { const auto & hparams = model->hparams; const uint32_t n_embd = hparams.n_embd; @@ -223,7 +223,7 @@ static void init_model(struct llama_model * model) { } -static void init_model_lora(struct llama_model_lora * model) { +static void init_model_lora(struct jarvis_model_lora * model) { const auto & hparams = model->hparams; const uint32_t n_embd = hparams.n_embd; @@ -266,7 +266,7 @@ static void init_model_lora(struct llama_model_lora * model) { } } -static void set_param_model(struct llama_model * model) { +static void set_param_model(struct jarvis_model * model) { const auto& hparams = model->hparams; const uint32_t n_layer = hparams.n_layer; @@ -292,7 +292,7 @@ static void set_param_model(struct llama_model * model) { } } -static void set_param_model_lora(struct llama_model_lora * model) { +static void set_param_model_lora(struct jarvis_model_lora * model) { const auto& hparams = model->hparams; const uint32_t n_layer = hparams.n_layer; @@ -323,7 +323,7 @@ static void set_param_model_lora(struct llama_model_lora * model) { } } -static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) { +static 
void randomize_model(struct jarvis_model * model, int seed, float mean, float std, float min, float max) { const auto & hparams = model->hparams; const uint32_t n_layer = hparams.n_layer; @@ -355,7 +355,7 @@ static void randomize_model(struct llama_model * model, int seed, float mean, fl static void randomize_model_lora( - struct llama_model_lora * model, int seed, float mean, float std, float min, float max + struct jarvis_model_lora * model, int seed, float mean, float std, float min, float max ) { const auto & hparams = model->hparams; @@ -391,7 +391,7 @@ static void randomize_model_lora( free_random_normal_distribution(rnd); } -static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) { +static void init_kv_cache(struct jarvis_kv_cache* cache, struct jarvis_model * model, int n_batch) { const auto & hparams = model->hparams; const uint32_t n_ctx = hparams.n_ctx; @@ -425,7 +425,7 @@ static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * mod cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); } -static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) { +static bool init_kv_cache_lora(struct jarvis_kv_cache* cache, struct jarvis_model_lora * model, int n_batch) { const auto & hparams = model->hparams; const uint32_t n_ctx = hparams.n_ctx; @@ -462,8 +462,8 @@ static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_ } static struct ggml_tensor * forward( - struct llama_model * model, - struct llama_kv_cache * cache, + struct jarvis_model * model, + struct jarvis_kv_cache * cache, struct ggml_context * ctx0, struct ggml_cgraph * gf, struct ggml_tensor * tokens_input, @@ -472,7 +472,7 @@ static struct ggml_tensor * forward( ) { const int N = n_tokens; - struct llama_kv_cache& kv_self = *cache; + struct jarvis_kv_cache& kv_self = *cache; const auto & hparams = model->hparams; const int n_ctx = hparams.n_ctx; const int n_embd = hparams.n_embd; @@ -692,8 +692,8 @@ static struct ggml_tensor * forward( } static struct ggml_tensor * forward_batch( - struct llama_model * model, - struct llama_kv_cache * cache, + struct jarvis_model * model, + struct jarvis_kv_cache * cache, struct ggml_context * ctx0, struct ggml_cgraph * gf, struct ggml_tensor * tokens_input, @@ -703,7 +703,7 @@ static struct ggml_tensor * forward_batch( ) { const int N = n_tokens; - struct llama_kv_cache& kv_self = *cache; + struct jarvis_kv_cache& kv_self = *cache; const auto & hparams = model->hparams; const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; @@ -989,8 +989,8 @@ static struct ggml_tensor * forward_batch( } static struct ggml_tensor * forward_lora( - struct llama_model_lora * model, - struct llama_kv_cache * cache, + struct jarvis_model_lora * model, + struct jarvis_kv_cache * cache, struct ggml_context * ctx0, struct ggml_cgraph * gf, struct ggml_tensor * tokens_input, @@ -999,7 +999,7 @@ static struct ggml_tensor * forward_lora( ) { const int N = n_tokens; - struct llama_kv_cache& kv_self = *cache; + struct jarvis_kv_cache& kv_self = *cache; const auto & hparams = model->hparams; const int n_ctx = hparams.n_ctx; @@ -1444,7 +1444,7 @@ int main(int argc, char ** argv) { lcparams.mem_buffer = NULL; lcparams.no_alloc = false; - struct llama_model model; + struct jarvis_model model; model.hparams.n_vocab = 8; model.hparams.n_ctx = 8; model.hparams.n_embd = 32; @@ -1467,7 +1467,7 @@ int main(int argc, char ** argv) { randomize_model(&model, 1337, 0.0f, 
1.0f, -1.0f, +1.0f); /* - struct llama_model_lora model_lora; + struct jarvis_model_lora model_lora; // model.hparams.n_vocab = 6; // model.hparams.n_ctx = 64; // model.hparams.n_embd = 128; @@ -1501,7 +1501,7 @@ int main(int argc, char ** argv) { */ int n_batch = 8; // key + value cache for the self attention - struct llama_kv_cache kv_self; + struct jarvis_kv_cache kv_self; printf("init_kv_cache\n"); kv_self.ctx = model.ctx; init_kv_cache(&kv_self, &model, n_batch); @@ -1533,7 +1533,7 @@ int main(int argc, char ** argv) { int n_past = 0; struct ggml_cgraph * gf = NULL; - gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true); + gf = ggml_new_graph_custom(ctx0, JARVIS_TRAIN_MAX_NODES, true); get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets); @@ -1601,7 +1601,7 @@ int main(int argc, char ** argv) { struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph * gf = NULL; - gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true); + gf = ggml_new_graph_custom(ctx0, JARVIS_TRAIN_MAX_NODES, true); int n_past = 0; struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past); diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt deleted file mode 100644 index 71b82105c8863..0000000000000 --- a/examples/baby-llama/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-baby-llama) -add_executable(${TARGET} baby-llama.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/base-translate.sh b/examples/base-translate.sh index 103a52f55e6db..1db10dfd59036 100755 --- a/examples/base-translate.sh +++ b/examples/base-translate.sh @@ -5,7 +5,7 @@ # # Usage: # -# cd llama.cpp +# cd jarvis.cpp # make -j # # ./examples/base-translate.sh "" [extra-main-args] @@ -21,7 +21,7 @@ if [ $# -gt 2 ]; then eargs="${@:3}" fi -ftmp="__llama.cpp_example_tmp__.txt" +ftmp="__jarvis.cpp_example_tmp__.txt" trap "rm -f $ftmp" EXIT echo "Translate from English to French: @@ -58,4 +58,4 @@ echo "$2 model=$1 # generate the most likely continuation until the string "===" is found -./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs +./jarvis-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs diff --git a/examples/batched-bench/CMakeLists.txt b/examples/batched-bench/CMakeLists.txt index 959acaeeebc38..f84e368f22422 100644 --- a/examples/batched-bench/CMakeLists.txt +++ b/examples/batched-bench/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-batched-bench) +set(TARGET jarvis-batched-bench) add_executable(${TARGET} batched-bench.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md index df67c47e378cf..b8d3152666d0d 100644 --- a/examples/batched-bench/README.md +++ b/examples/batched-bench/README.md @@ -1,6 +1,6 @@ -# llama.cpp/example/batched-bench +# jarvis.cpp/example/batched-bench -Benchmark the batched decoding performance of `llama.cpp` +Benchmark the batched decoding performance of `jarvis.cpp` ## Usage @@ -10,16 +10,16 @@ There are 2 modes of operation: - `prompt is shared` - there is a common prompt of 
size `PP` used by all batches (i.e. `N_KV = PP + B*TG`) ```bash -./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps] +./jarvis-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps] -# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared -./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99 +# JARVIS 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared +./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared -./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps +# JARVIS 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared +./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps # custom set of batches -./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 +./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 ``` ## Sample results diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index a3b21ad6bce44..349f16aade71e 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -1,7 +1,7 @@ #include "arg.h" #include "common.h" #include "log.h" -#include "llama.h" +#include "jarvis.h" #include #include @@ -17,7 +17,7 @@ static void print_usage(int, char ** argv) { int main(int argc, char ** argv) { common_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_BENCH, print_usage)) { return 1; } @@ -31,42 +31,42 @@ int main(int argc, char ** argv) { // init LLM - llama_backend_init(); - llama_numa_init(params.numa); + jarvis_backend_init(); + jarvis_numa_init(params.numa); // initialize the model - llama_model_params model_params = common_model_params_to_llama(params); + jarvis_model_params model_params = common_model_params_to_jarvis(params); - llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); + jarvis_model * model = jarvis_load_model_from_file(params.model.c_str(), model_params); if (model == NULL) { fprintf(stderr , "%s: error: unable to load model\n" , __func__); return 1; } - llama_context_params ctx_params = common_context_params_to_llama(params); + jarvis_context_params ctx_params = common_context_params_to_jarvis(params); // ensure enough sequences are available ctx_params.n_seq_max = n_pl.empty() ? 
1 : *std::max_element(n_pl.begin(), n_pl.end()); - llama_context * ctx = llama_new_context_with_model(model, ctx_params); + jarvis_context * ctx = jarvis_new_context_with_model(model, ctx_params); if (ctx == NULL) { - fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + fprintf(stderr , "%s: error: failed to create the jarvis_context\n" , __func__); return 1; } - const int32_t n_kv_max = llama_n_ctx(ctx); + const int32_t n_kv_max = jarvis_n_ctx(ctx); - llama_batch batch = llama_batch_init(n_kv_max, 0, 1); + jarvis_batch batch = jarvis_batch_init(n_kv_max, 0, 1); // decode in batches of ctx_params.n_batch tokens - auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) { + auto decode_helper = [](jarvis_context * ctx, jarvis_batch & batch, int32_t n_batch) { for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); - llama_batch batch_view = { + jarvis_batch batch_view = { n_tokens, batch.token + i, nullptr, @@ -76,13 +76,13 @@ int main(int argc, char ** argv) { batch.logits + i, }; - const int ret = llama_decode(ctx, batch_view); + const int ret = jarvis_decode(ctx, batch_view); if (ret != 0) { LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); return false; } - llama_synchronize(ctx); + jarvis_synchronize(ctx); } return true; @@ -95,7 +95,7 @@ int main(int argc, char ** argv) { } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: jarvis_decode() failed\n", __func__); return 1; } } @@ -132,16 +132,16 @@ int main(int argc, char ** argv) { const auto t_pp_start = ggml_time_us(); - llama_kv_cache_clear(ctx); + jarvis_kv_cache_clear(ctx); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: jarvis_decode() failed\n", __func__); return 1; } if (is_pp_shared) { for (int32_t i = 1; i < pl; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + jarvis_kv_cache_seq_cp(ctx, 0, i, -1, -1); } } @@ -157,7 +157,7 @@ int main(int argc, char ** argv) { } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: jarvis_decode() failed\n", __func__); return 1; } } @@ -189,14 +189,14 @@ int main(int argc, char ** argv) { } LOG("\n"); - llama_perf_context_print(ctx); + jarvis_perf_context_print(ctx); - llama_batch_free(batch); + jarvis_batch_free(batch); - llama_free(ctx); - llama_free_model(model); + jarvis_free(ctx); + jarvis_free_model(model); - llama_backend_free(); + jarvis_backend_free(); LOG("\n\n"); diff --git a/examples/batched.swift/Makefile b/examples/batched.swift/Makefile index 1f9156e583fdd..f6efa6ed62536 100755 --- a/examples/batched.swift/Makefile +++ b/examples/batched.swift/Makefile @@ -1,6 +1,6 @@ .PHONY: build build: - xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build - rm -f ./llama-batched-swift - ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift + xcodebuild -scheme jarvis-batched-swift -destination "generic/platform=macOS" -derivedDataPath build + rm -f ./jarvis-batched-swift + ln -s ./build/Build/Products/Debug/jarvis-batched-swift ./jarvis-batched-swift diff --git a/examples/batched.swift/Package.swift b/examples/batched.swift/Package.swift index 7e8afd0843c5b..8130a77e66ebd 100644 --- a/examples/batched.swift/Package.swift +++ 
b/examples/batched.swift/Package.swift @@ -4,17 +4,17 @@ import PackageDescription let package = Package( - name: "llama-batched-swift", + name: "jarvis-batched-swift", platforms: [.macOS(.v12)], dependencies: [ - .package(name: "llama", path: "../../"), + .package(name: "jarvis", path: "../../"), ], targets: [ // Targets are the basic building blocks of a package, defining a module or a test suite. // Targets can depend on other targets in this package and products from dependencies. .executableTarget( - name: "llama-batched-swift", - dependencies: ["llama"], + name: "jarvis-batched-swift", + dependencies: ["jarvis"], path: "Sources", linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")] ), diff --git a/examples/batched.swift/README.md b/examples/batched.swift/README.md index 7f2e2fcdcf4a7..03ec340ab0522 100644 --- a/examples/batched.swift/README.md +++ b/examples/batched.swift/README.md @@ -1,4 +1,4 @@ This is a swift clone of `examples/batched`. $ `make` -$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]` +$ `./jarvis-batched-swift MODEL_PATH [PROMPT] [PARALLEL]` diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 10f2e7fd117a1..92eedbac7f6e8 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -1,5 +1,5 @@ import Foundation -import llama +import jarvis let arguments = CommandLine.arguments @@ -17,56 +17,56 @@ let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(argu let n_len: Int = 32 // init LLM -llama_backend_init() +jarvis_backend_init() defer { - llama_backend_free() + jarvis_backend_free() } -let model_params = llama_model_default_params() -guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else { +let model_params = jarvis_model_default_params() +guard let model = jarvis_load_model_from_file(modelPath.cString(using: .utf8), model_params) else { print("Failed to load model") exit(1) } defer { - llama_free_model(model) + jarvis_free_model(model) } var tokens = tokenize(text: prompt, add_bos: true) let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel) -var context_params = llama_context_default_params() +var context_params = jarvis_context_default_params() context_params.n_ctx = n_kv_req context_params.n_batch = UInt32(max(n_len, n_parallel)) context_params.n_threads = 8 context_params.n_threads_batch = 8 -let context = llama_new_context_with_model(model, context_params) +let context = jarvis_new_context_with_model(model, context_params) guard context != nil else { print("Failed to initialize context") exit(1) } defer { - llama_free(context) + jarvis_free(context) } -var sparams = llama_sampler_chain_default_params() +var sparams = jarvis_sampler_chain_default_params() -let smpl = llama_sampler_chain_init(sparams) +let smpl = jarvis_sampler_chain_init(sparams) guard smpl != nil else { print("Failed to initialize sampling") exit(1) } defer { - llama_sampler_free(smpl) + jarvis_sampler_free(smpl) } -llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40)); -llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1)); -llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4)); -llama_sampler_chain_add(smpl, llama_sampler_init_dist (1234)); +jarvis_sampler_chain_add(smpl, jarvis_sampler_init_top_k(40)); +jarvis_sampler_chain_add(smpl, jarvis_sampler_init_top_p(0.9, 1)); +jarvis_sampler_chain_add(smpl, jarvis_sampler_init_temp (0.4)); 
+jarvis_sampler_chain_add(smpl, jarvis_sampler_init_dist (1234)); -let n_ctx = llama_n_ctx(context) +let n_ctx = jarvis_n_ctx(context) print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n") @@ -76,15 +76,15 @@ if n_kv_req > n_ctx { } var buffer: [CChar] = [] -for id: llama_token in tokens { +for id: jarvis_token in tokens { print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "") } print("\n") -var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0, 1) +var batch = jarvis_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0, 1) defer { - llama_batch_free(batch) + jarvis_batch_free(batch) } // evaluate the initial prompt @@ -102,16 +102,16 @@ for (i, token) in tokens.enumerated() { batch.logits[i] = 0 } -// llama_decode will output logits only for the last token of the prompt +// jarvis_decode will output logits only for the last token of the prompt batch.logits[Int(batch.n_tokens) - 1] = 1 -if llama_decode(context, batch) != 0 { - print("llama_decode() failed") +if jarvis_decode(context, batch) != 0 { + print("jarvis_decode() failed") exit(1) } for i in 1 ..< n_parallel { - llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens) + jarvis_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens) } if n_parallel > 1 { @@ -138,10 +138,10 @@ while n_cur <= n_len { continue } - let new_token_id = llama_sampler_sample(smpl, context, i_batch[i]) + let new_token_id = jarvis_sampler_sample(smpl, context, i_batch[i]) // is it an end of stream? -> mark the stream as finished - if llama_token_is_eog(model, new_token_id) || n_cur == n_len { + if jarvis_token_is_eog(model, new_token_id) || n_cur == n_len { i_batch[i] = -1 // print("") if n_parallel > 1 { @@ -183,8 +183,8 @@ while n_cur <= n_len { n_cur += 1 // evaluate the current batch with the transformer model - if llama_decode(context, batch) != 0 { - print("llama_decode() failed") + if jarvis_decode(context, batch) != 0 { + print("jarvis_decode() failed") exit(1) } } @@ -200,15 +200,15 @@ let t_main_end = ggml_time_us() print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n") -llama_perf_sampler_print(smpl) -llama_perf_context_print(context) +jarvis_perf_sampler_print(smpl) +jarvis_perf_context_print(context) -private func tokenize(text: String, add_bos: Bool) -> [llama_token] { +private func tokenize(text: String, add_bos: Bool) -> [jarvis_token] { let utf8Count = text.utf8.count let n_tokens = utf8Count + (add_bos ? 1 : 0) - let tokens = UnsafeMutablePointer.allocate(capacity: n_tokens) - let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false) - var swiftTokens: [llama_token] = [] + let tokens = UnsafeMutablePointer.allocate(capacity: n_tokens) + let tokenCount = jarvis_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false) + var swiftTokens: [jarvis_token] = [] for i in 0 ..< tokenCount { swiftTokens.append(tokens[Int(i)]) } @@ -216,13 +216,13 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] { return swiftTokens } -private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? { +private func token_to_piece(token: jarvis_token, buffer: inout [CChar]) -> String? 
{ var result = [CChar](repeating: 0, count: 8) - let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false) + let nTokens = jarvis_token_to_piece(model, token, &result, Int32(result.count), 0, false) if nTokens < 0 { let actualTokensCount = -Int(nTokens) result = .init(repeating: 0, count: actualTokensCount) - let check = llama_token_to_piece( + let check = jarvis_token_to_piece( model, token, &result, diff --git a/examples/batched/CMakeLists.txt b/examples/batched/CMakeLists.txt index 77e33343b6673..9c78d7f13544d 100644 --- a/examples/batched/CMakeLists.txt +++ b/examples/batched/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-batched) +set(TARGET jarvis-batched) add_executable(${TARGET} batched.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/batched/README.md b/examples/batched/README.md index 6013aab01fddc..ebc3ebdab319d 100644 --- a/examples/batched/README.md +++ b/examples/batched/README.md @@ -1,9 +1,9 @@ -# llama.cpp/example/batched +# jarvis.cpp/example/batched The example demonstrates batched generation from a given prompt ```bash -./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 +./jarvis-batched -m ./models/jarvis-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 ... @@ -36,9 +36,9 @@ Hello my name is Cody. I am a 3 year old neutered male. I am a very friendly cat main: decoded 108 tokens in 3.57 s, speed: 30.26 t/s -llama_print_timings: load time = 587.00 ms -llama_print_timings: sample time = 2.56 ms / 112 runs ( 0.02 ms per token, 43664.72 tokens per second) -llama_print_timings: prompt eval time = 4089.11 ms / 118 tokens ( 34.65 ms per token, 28.86 tokens per second) -llama_print_timings: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) -llama_print_timings: total time = 4156.04 ms +jarvis_print_timings: load time = 587.00 ms +jarvis_print_timings: sample time = 2.56 ms / 112 runs ( 0.02 ms per token, 43664.72 tokens per second) +jarvis_print_timings: prompt eval time = 4089.11 ms / 118 tokens ( 34.65 ms per token, 28.86 tokens per second) +jarvis_print_timings: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +jarvis_print_timings: total time = 4156.04 ms ``` diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 3b554033e7ee4..d651730b2c582 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -1,7 +1,7 @@ #include "arg.h" #include "common.h" #include "log.h" -#include "llama.h" +#include "jarvis.h" #include #include @@ -20,7 +20,7 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; params.n_predict = 32; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_COMMON, print_usage)) { return 1; } @@ -34,14 +34,14 @@ int main(int argc, char ** argv) { // init LLM - llama_backend_init(); - llama_numa_init(params.numa); + jarvis_backend_init(); + jarvis_numa_init(params.numa); // initialize the model - llama_model_params model_params = common_model_params_to_llama(params); + jarvis_model_params model_params = common_model_params_to_jarvis(params); - llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); + jarvis_model * model = 
jarvis_load_model_from_file(params.model.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: error: unable to load model\n" , __func__); @@ -50,35 +50,35 @@ int main(int argc, char ** argv) { // tokenize the prompt - std::vector tokens_list; + std::vector tokens_list; tokens_list = common_tokenize(model, params.prompt, true); const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel; // initialize the context - llama_context_params ctx_params = common_context_params_to_llama(params); + jarvis_context_params ctx_params = common_context_params_to_jarvis(params); ctx_params.n_ctx = n_kv_req; ctx_params.n_batch = std::max(n_predict, n_parallel); - llama_context * ctx = llama_new_context_with_model(model, ctx_params); + jarvis_context * ctx = jarvis_new_context_with_model(model, ctx_params); - auto sparams = llama_sampler_chain_default_params(); + auto sparams = jarvis_sampler_chain_default_params(); - llama_sampler * smpl = llama_sampler_chain_init(sparams); + jarvis_sampler * smpl = jarvis_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k)); - llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep)); - llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp)); - llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed)); + jarvis_sampler_chain_add(smpl, jarvis_sampler_init_top_k(params.sparams.top_k)); + jarvis_sampler_chain_add(smpl, jarvis_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep)); + jarvis_sampler_chain_add(smpl, jarvis_sampler_init_temp (params.sparams.temp)); + jarvis_sampler_chain_add(smpl, jarvis_sampler_init_dist (params.sparams.seed)); if (ctx == NULL) { - LOG_ERR("%s: error: failed to create the llama_context\n" , __func__); + LOG_ERR("%s: error: failed to create the jarvis_context\n" , __func__); return 1; } - const int n_ctx = llama_n_ctx(ctx); + const int n_ctx = jarvis_n_ctx(ctx); LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); @@ -97,11 +97,11 @@ int main(int argc, char ** argv) { LOG("%s", common_token_to_piece(ctx, id).c_str()); } - // create a llama_batch + // create a jarvis_batch // we use this object to submit token data for decoding - llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel); + jarvis_batch batch = jarvis_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel); - std::vector seq_ids(n_parallel, 0); + std::vector seq_ids(n_parallel, 0); for (int32_t i = 0; i < n_parallel; ++i) { seq_ids[i] = i; } @@ -112,33 +112,33 @@ int main(int argc, char ** argv) { } GGML_ASSERT(batch.n_tokens == (int) tokens_list.size()); - if (llama_model_has_encoder(model)) { - if (llama_encode(ctx, batch)) { + if (jarvis_model_has_encoder(model)) { + if (jarvis_encode(ctx, batch)) { LOG_ERR("%s : failed to eval\n", __func__); return 1; } - llama_token decoder_start_token_id = llama_model_decoder_start_token(model); + jarvis_token decoder_start_token_id = jarvis_model_decoder_start_token(model); if (decoder_start_token_id == -1) { - decoder_start_token_id = llama_token_bos(model); + decoder_start_token_id = jarvis_token_bos(model); } common_batch_clear(batch); common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false); } - // llama_decode will output logits only for the last token of the prompt + // 
jarvis_decode will output logits only for the last token of the prompt batch.logits[batch.n_tokens - 1] = true; - if (llama_decode(ctx, batch) != 0) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + if (jarvis_decode(ctx, batch) != 0) { + LOG_ERR("%s: jarvis_decode() failed\n", __func__); return 1; } //// assign the system KV cache to all parallel sequences //// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them //for (int32_t i = 1; i < n_parallel; ++i) { - // llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + // jarvis_kv_cache_seq_cp(ctx, 0, i, -1, -1); //} if (n_parallel > 1) { @@ -170,10 +170,10 @@ int main(int argc, char ** argv) { continue; } - const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]); + const jarvis_token new_token_id = jarvis_sampler_sample(smpl, ctx, i_batch[i]); // is it an end of generation? -> mark the stream as finished - if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { + if (jarvis_token_is_eog(model, new_token_id) || n_cur == n_predict) { i_batch[i] = -1; LOG("\n"); if (n_parallel > 1) { @@ -206,7 +206,7 @@ int main(int argc, char ** argv) { n_cur += 1; // evaluate the current batch with the transformer model - if (llama_decode(ctx, batch)) { + if (jarvis_decode(ctx, batch)) { LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); return 1; } @@ -226,18 +226,18 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG("\n"); - llama_perf_sampler_print(smpl); - llama_perf_context_print(ctx); + jarvis_perf_sampler_print(smpl); + jarvis_perf_context_print(ctx); fprintf(stderr, "\n"); - llama_batch_free(batch); + jarvis_batch_free(batch); - llama_sampler_free(smpl); - llama_free(ctx); - llama_free_model(model); + jarvis_sampler_free(smpl); + jarvis_free(ctx); + jarvis_free_model(model); - llama_backend_free(); + jarvis_backend_free(); return 0; } diff --git a/examples/chat-13B.bat b/examples/chat-13B.bat index c5c8ac6efa81a..e398912f0f69b 100644 --- a/examples/chat-13B.bat +++ b/examples/chat-13B.bat @@ -10,7 +10,7 @@ if not "%errorlevel%"=="0" ( if not defined MODEL set "MODEL=models\13B\ggml-model-q4_0.bin" if not defined USER_NAME set "USER_NAME=User" -if not defined AI_NAME set "AI_NAME=ChatLLaMa" +if not defined AI_NAME set "AI_NAME=ChatJarvis" rem Adjust to the number of CPU cores you want to use. rem if not defined N_THREAD set "N_THREAD=8" rem Number of tokens to predict (made it larger than default because we want a long interaction) diff --git a/examples/chat-13B.sh b/examples/chat-13B.sh index 1828903c31670..96785bd4b2ccc 100755 --- a/examples/chat-13B.sh +++ b/examples/chat-13B.sh @@ -7,7 +7,7 @@ cd "$(dirname "$0")/.." || exit MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}" PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt} USER_NAME="${USER_NAME:-USER}" -AI_NAME="${AI_NAME:-ChatLLaMa}" +AI_NAME="${AI_NAME:-ChatJarvis}" # Adjust to the number of CPU cores you want to use. 
N_THREAD="${N_THREAD:-8}" @@ -15,13 +15,13 @@ N_THREAD="${N_THREAD:-8}" N_PREDICTS="${N_PREDICTS:-2048}" # Note: you can also override the generation options by specifying them on the command line: -# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024 +# For example, override the context size by doing: ./chatJarvis --ctx_size 1024 GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}" DATE_TIME=$(date +%H:%M) DATE_YEAR=$(date +%Y) -PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt) +PROMPT_FILE=$(mktemp -t jarviscpp_prompt.XXXXXXX.txt) sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \ -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \ @@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \ $PROMPT_TEMPLATE > $PROMPT_FILE # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS -./llama-cli $GEN_OPTIONS \ +./jarvis-cli $GEN_OPTIONS \ --model "$MODEL" \ --threads "$N_THREAD" \ --n_predict "$N_PREDICTS" \ diff --git a/examples/chat-persistent.sh b/examples/chat-persistent.sh index d9cab9836482e..016e6d06f58e0 100755 --- a/examples/chat-persistent.sh +++ b/examples/chat-persistent.sh @@ -9,10 +9,10 @@ if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then exit 1 fi -MODEL="${MODEL:-./models/llama-13b/ggml-model-q4_0.gguf}" +MODEL="${MODEL:-./models/jarvis-13b/ggml-model-q4_0.gguf}" PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}" USER_NAME="${USER_NAME:-User}" -AI_NAME="${AI_NAME:-ChatLLaMa}" +AI_NAME="${AI_NAME:-ChatJarvis}" DATE_TIME="$(date +%H:%M)" DATE_YEAR="$(date +%Y)" @@ -62,7 +62,7 @@ fi if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then echo 'Prompt cache does not exist, building...' # Default batch_size to 64 here for better user feedback during initial prompt processing - ./llama-cli 2>>"$LOG" \ + ./jarvis-cli 2>>"$LOG" \ --batch_size 64 \ "${OPTS[@]}" \ --prompt-cache "$PROMPT_CACHE_FILE" \ @@ -109,13 +109,13 @@ while read -e line; do printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE" - ./llama-cli 2>>"$LOG" "${OPTS[@]}" \ + ./jarvis-cli 2>>"$LOG" "${OPTS[@]}" \ --prompt-cache "$CUR_PROMPT_CACHE" \ --prompt-cache-all \ --file "$CUR_PROMPT_FILE" \ --reverse-prompt "${USER_NAME}:" \ --n_predict "$n_predict" | - skip_bytes 1 | # skip BOS token added by ./llama-cli + skip_bytes 1 | # skip BOS token added by ./jarvis-cli tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file skip_bytes "$n_prompt_len_pre" # print generation @@ -133,7 +133,7 @@ while read -e line; do # TODO get both messages in one go if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" || ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then - echo >&2 "Couldn't get number of tokens from ./llama-cli output!" + echo >&2 "Couldn't get number of tokens from ./jarvis-cli output!" 
exit 1 fi @@ -144,7 +144,7 @@ while read -e line; do fi # Update cache for next prompt in background, ideally during user input - ./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \ + ./jarvis-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \ --prompt-cache "$NEXT_PROMPT_CACHE" \ --file "$NEXT_PROMPT_FILE" \ --n_predict 1 & diff --git a/examples/chat-vicuna.sh b/examples/chat-vicuna.sh index ffdd200849503..2d059adac0338 100755 --- a/examples/chat-vicuna.sh +++ b/examples/chat-vicuna.sh @@ -15,13 +15,13 @@ N_THREAD="${N_THREAD:-8}" N_PREDICTS="${N_PREDICTS:-2048}" # Note: you can also override the generation options by specifying them on the command line: -# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024 +# For example, override the context size by doing: ./chatJarvis --ctx_size 1024 GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}" DATE_TIME=$(date +%H:%M) DATE_YEAR=$(date +%Y) -PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt) +PROMPT_FILE=$(mktemp -t jarviscpp_prompt.XXXXXXX.txt) sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \ -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \ @@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \ $PROMPT_TEMPLATE > $PROMPT_FILE # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS -./bin/llama-cli $GEN_OPTIONS \ +./bin/jarvis-cli $GEN_OPTIONS \ --model "$MODEL" \ --threads "$N_THREAD" \ --n_predict "$N_PREDICTS" \ diff --git a/examples/chat.sh b/examples/chat.sh index 9f85d1e265d00..0eb4b2e21bbce 100755 --- a/examples/chat.sh +++ b/examples/chat.sh @@ -11,6 +11,6 @@ cd .. # # "--keep 48" is based on the contents of prompts/chat-with-bob.txt # -./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \ +./jarvis-cli -m ./models/jarvis-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \ --repeat_penalty 1.0 --color -i \ -r "User:" -f prompts/chat-with-bob.txt diff --git a/examples/convert-jarvis2c-to-ggml/CMakeLists.txt b/examples/convert-jarvis2c-to-ggml/CMakeLists.txt new file mode 100644 index 0000000000000..f88ca32c7d617 --- /dev/null +++ b/examples/convert-jarvis2c-to-ggml/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET jarvis-convert-jarvis2c-to-ggml) +add_executable(${TARGET} convert-jarvis2c-to-ggml.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/convert-jarvis2c-to-ggml/README.md b/examples/convert-jarvis2c-to-ggml/README.md new file mode 100644 index 0000000000000..0cb1cbbe7cebb --- /dev/null +++ b/examples/convert-jarvis2c-to-ggml/README.md @@ -0,0 +1,28 @@ +## Convert jarvis2.c model to ggml + +This example reads weights from project [jarvis2.c](https://github.com/karpathy/jarvis2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default. 
+ +To convert the model first download the models from the [jarvis2.c](https://github.com/karpathy/jarvis2.c) repository: + +`$ make -j` + +After successful compilation, following usage options are available: +``` +usage: ./jarvis-convert-jarvis2c-to-ggml [options] + +options: + -h, --help show this help message and exit + --copy-vocab-from-model FNAME path of gguf jarvis model or jarvis2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf') + --jarvis2c-model FNAME [REQUIRED] model path from which to load Karpathy's jarvis2.c model + --jarvis2c-output-model FNAME model path to save the converted jarvis2.c model (default ak_jarvis_model.bin') +``` + +An example command using a model from [karpathy/tinyjarviss](https://huggingface.co/karpathy/tinyjarviss) is as follows: + +`$ ./jarvis-convert-jarvis2c-to-ggml --copy-vocab-from-model jarvis-2-7b-chat.gguf.q2_K.bin --jarvis2c-model stories42M.bin --jarvis2c-output-model stories42M.gguf.bin` + +Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyjarviss/stories260K](https://huggingface.co/karpathy/tinyjarviss/tree/main/stories260K). + +Now you can use the model with a command like: + +`$ ./jarvis-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256` diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-jarvis2c-to-ggml/convert-jarvis2c-to-ggml.cpp similarity index 87% rename from examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp rename to examples/convert-jarvis2c-to-ggml/convert-jarvis2c-to-ggml.cpp index 988a584c99a25..6eb760a0939e3 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-jarvis2c-to-ggml/convert-jarvis2c-to-ggml.cpp @@ -1,5 +1,5 @@ #include "ggml.h" -#include "llama.h" +#include "jarvis.h" #include "common.h" #include "log.h" @@ -33,14 +33,14 @@ #define KV_TOKENIZER_PAD_ID "tokenizer.ggml.padding_token_id" #define KV_TOKENIZER_HF_JSON "tokenizer.huggingface.json" -#define KV_CONTEXT_LENGTH "llama.context_length" -#define KV_EMBEDDING_LENGTH "llama.embedding_length" -#define KV_BLOCK_COUNT "llama.block_count" -#define KV_FEED_FORWARD_LENGTH "llama.feed_forward_length" -#define KV_ATTENTION_HEAD_COUNT "llama.attention.head_count" -#define KV_ATTENTION_HEAD_COUNT_KV "llama.attention.head_count_kv" -#define KV_ATTENTION_LAYERNORM_RMS_EPS "llama.attention.layer_norm_rms_epsilon" -#define KV_ROPE_DIMENSION_COUNT "llama.rope.dimension_count" +#define KV_CONTEXT_LENGTH "jarvis.context_length" +#define KV_EMBEDDING_LENGTH "jarvis.embedding_length" +#define KV_BLOCK_COUNT "jarvis.block_count" +#define KV_FEED_FORWARD_LENGTH "jarvis.feed_forward_length" +#define KV_ATTENTION_HEAD_COUNT "jarvis.attention.head_count" +#define KV_ATTENTION_HEAD_COUNT_KV "jarvis.attention.head_count_kv" +#define KV_ATTENTION_LAYERNORM_RMS_EPS "jarvis.attention.layer_norm_rms_epsilon" +#define KV_ROPE_DIMENSION_COUNT "jarvis.rope.dimension_count" #define TN_TOKEN_EMBD "token_embd.weight" #define TN_OUTPUT_NORM "output_norm.weight" @@ -59,15 +59,15 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' -#define LLAMA_FILE_VERSION_GGJT_V3 3 +#define JARVIS_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' +#define JARVIS_FILE_VERSION_GGJT_V3 3 -#define TOKENIZER_NAME "llama" +#define TOKENIZER_NAME "jarvis" #define UNKNOWN_TOKEN_ID 0 #define BOS_TOKEN_ID 1 #define EOS_TOKEN_ID 2 
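The `KV_*` macros above are ordinary GGUF metadata keys; further down in this file the converter writes them into the output header with `gguf_set_val_str` / `gguf_set_val_u32` before serializing the tensors. The sketch below shows only that header-writing mechanism in isolation, assuming the `gguf_*` API declared in `ggml.h`; key strings whose macro definitions are not visible in this hunk follow the usual GGUF naming, and the output filename is illustrative.

```cpp
// Sketch: how the KV_* metadata keys end up in a GGUF header, in isolation.
// Assumes the gguf_* API from ggml.h; key strings not shown in the hunk above
// follow standard GGUF naming, and "demo.gguf" is an illustrative filename.
#include "ggml.h"

int main() {
    struct gguf_context * ctx = gguf_init_empty();

    gguf_set_val_str(ctx, "general.architecture", "jarvis");   // KV_GENERAL_ARCHITECTURE
    gguf_set_val_str(ctx, "tokenizer.ggml.model",  "jarvis");  // KV_TOKENIZER_MODEL = TOKENIZER_NAME

    // the special-token ids defined right above
    gguf_set_val_u32(ctx, "tokenizer.ggml.unknown_token_id", 0); // UNKNOWN_TOKEN_ID
    gguf_set_val_u32(ctx, "tokenizer.ggml.bos_token_id",     1); // BOS_TOKEN_ID
    gguf_set_val_u32(ctx, "tokenizer.ggml.eos_token_id",     2); // EOS_TOKEN_ID

    gguf_write_to_file(ctx, "demo.gguf", /*only_meta =*/ true);
    gguf_free(ctx);

    return 0;
}
```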
-//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc. +//////////////////////////////////////// jarvis2.c model structs and functions to load models, alloc memory etc. typedef struct { int dim; // transformer dimension int hidden_dim; // for ffn layers @@ -201,10 +201,10 @@ static void print_sample_weights(TransformerWeights *w){ //////////////////////////////////////// ggml structs and functions required to load models, configs and save the model. -struct my_llama_vocab { +struct my_jarvis_vocab { using id = int32_t; using token = std::string; - using ttype = llama_token_type; + using ttype = jarvis_token_type; struct token_data { token text; @@ -216,7 +216,7 @@ struct my_llama_vocab { std::vector id_to_token; }; -struct my_llama_hparams { +struct my_jarvis_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; // this is provided as user input? uint32_t n_embd = 4096; @@ -227,12 +227,12 @@ struct my_llama_hparams { uint32_t n_layer = 32; uint32_t n_rot = 64; - bool operator!=(const my_llama_hparams& other) const { - return memcmp(this, &other, sizeof(my_llama_hparams)); + bool operator!=(const my_jarvis_hparams& other) const { + return memcmp(this, &other, sizeof(my_jarvis_hparams)); } }; -struct my_llama_layer { +struct my_jarvis_layer { // normalization struct ggml_tensor * attention_norm; @@ -251,19 +251,19 @@ struct my_llama_layer { struct ggml_tensor * w3; }; -struct my_llama_model { +struct my_jarvis_model { struct ggml_context * ctx = NULL; std::string name; - my_llama_hparams hparams; + my_jarvis_hparams hparams; struct ggml_tensor * tok_embeddings; struct ggml_tensor * norm; struct ggml_tensor * output; - std::vector layers; + std::vector layers; uint32_t train_its = 0; uint32_t train_samples = 0; @@ -272,8 +272,8 @@ struct my_llama_model { struct train_params { const char * fn_vocab_model; - const char * fn_llama2c_model; - const char * fn_llama2c_output_model; + const char * fn_jarvis2c_model; + const char * fn_jarvis2c_output_model; const char * fn_train_data; const char * fn_checkpoint_in; const char * fn_checkpoint_out; @@ -318,7 +318,7 @@ struct train_params { int mem_compute1_gb; }; -static void print_params(struct my_llama_hparams * params) { +static void print_params(struct my_jarvis_hparams * params) { LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab); LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx); LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd); @@ -345,7 +345,7 @@ static void print_tensor_info(const struct ggml_context * ctx) { } } -static void init_model(struct my_llama_model * model) { +static void init_model(struct my_jarvis_model * model) { const auto & hparams = model->hparams; const uint32_t n_embd = hparams.n_embd; @@ -434,12 +434,12 @@ static void print_matrix(struct ggml_tensor * probs) { } } -struct llama_file { +struct jarvis_file { // use FILE * so we don't have to re-open the file to mmap FILE * fp; size_t size; - llama_file(const char * fname, const char * mode) { + jarvis_file(const char * fname, const char * mode) { fp = std::fopen(fname, mode); if (fp == NULL) { size = 0; @@ -500,7 +500,7 @@ struct llama_file { return std::string(chars.data(), len); } - ~llama_file() { + ~jarvis_file() { if (fp) { std::fclose(fp); } @@ -508,7 +508,7 @@ struct llama_file { }; static bool is_ggml_file(const char * filename) { - llama_file file(filename, "rb"); + jarvis_file file(filename, "rb"); if (file.size < 4) { return false; } @@ -516,7 +516,7 @@ static bool is_ggml_file(const char * 
filename) { return magic == GGUF_MAGIC; } -static std::string llama_escape_whitespaces(const std::string & text) { +static std::string jarvis_escape_whitespaces(const std::string & text) { std::ostringstream out; for (char c : text) { if (c == ' ') out << "\xe2\x96\x81"; @@ -525,7 +525,7 @@ static std::string llama_escape_whitespaces(const std::string & text) { return out.str(); } -static void load_vocab(const char * filename, const Config * config, struct my_llama_vocab * vocab) { +static void load_vocab(const char * filename, const Config * config, struct my_jarvis_vocab * vocab) { if (is_ggml_file(filename)) { LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename); struct ggml_context * ctx_data = NULL; @@ -556,7 +556,7 @@ static void load_vocab(const char * filename, const Config * config, struct my_l const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx); if (n_vocab != static_cast(config->vocab_size)) { - die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size); + die_fmt("vocab size mismatch: (gguf) %u != (jarvis2c) %d", n_vocab, config->vocab_size); } vocab->id_to_token.resize(n_vocab); @@ -569,45 +569,45 @@ static void load_vocab(const char * filename, const Config * config, struct my_l auto & token_data = vocab->id_to_token[i]; token_data.text = std::move(word); token_data.score = scores[i]; - token_data.type = (llama_token_type) toktypes[i]; + token_data.type = (jarvis_token_type) toktypes[i]; } ggml_free(ctx_data); gguf_free(ctx); } else { - // assume llama2.c vocabulary - LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); - llama_file file(filename, "rb"); + // assume jarvis2.c vocabulary + LOG_INF("%s: Assuming jarvis2.c vocabulary since %s is not a gguf file\n", __func__, filename); + jarvis_file file(filename, "rb"); if (!file.fp) { die_fmt("%s: %s", strerror(errno), filename); } const int n_vocab = config->vocab_size; /* uint32_t max_token_length = */ file.read_u32(); // unused vocab->id_to_token.resize(n_vocab); - for (my_llama_vocab::id id=0; id", &byte_val) == 1) { // Text of byte tokens is already in the expected format. - type = LLAMA_TOKEN_TYPE_BYTE; + type = JARVIS_TOKEN_TYPE_BYTE; } else { - type = LLAMA_TOKEN_TYPE_NORMAL; + type = JARVIS_TOKEN_TYPE_NORMAL; } - text = llama_escape_whitespaces(text); + text = jarvis_escape_whitespaces(text); vocab->id_to_token[id].text = text; vocab->id_to_token[id].score = score; @@ -630,8 +630,8 @@ static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const floa } } -static void save_as_llama_model( - struct my_llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename +static void save_as_jarvis_model( + struct my_jarvis_vocab * vocab, struct my_jarvis_model * model, TransformerWeights* w, const char * filename ) { // convert AK weights into GG weights one by one. 
// w->token_embedding_table -> model->tok_embeddings @@ -670,8 +670,8 @@ static void save_as_llama_model( std::vector tokens; std::vector scores; - std::vector token_types; - for (const my_llama_vocab::token_data & token_data : vocab->id_to_token) { + std::vector token_types; + for (const my_jarvis_vocab::token_data & token_data : vocab->id_to_token) { tokens.push_back(token_data.text.c_str()); scores.push_back(token_data.score); token_types.push_back(token_data.type); @@ -682,8 +682,8 @@ static void save_as_llama_model( gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME); - gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama"); - gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama"); + gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "jarvis"); + gguf_set_val_str(ctx, KV_GENERAL_NAME, "jarvis"); // special tokens gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID); @@ -750,7 +750,7 @@ static void save_as_llama_model( static struct train_params get_default_train_params() { struct train_params params; params.fn_vocab_model = "models/7B/ggml-model-f16.gguf"; - params.fn_llama2c_output_model = "ak_llama_model.bin"; + params.fn_jarvis2c_output_model = "ak_jarvis_model.bin"; params.fn_train_data = "shakespeare.txt"; params.fn_checkpoint_in = "checkpoint.bin"; params.fn_checkpoint_out = "checkpoint.bin"; @@ -802,9 +802,9 @@ static void print_usage(int /*argc*/, char ** argv, const struct train_params * fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model); - fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n"); - fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model); + fprintf(stderr, " --copy-vocab-from-model FNAME path of gguf jarvis model or jarvis2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model); + fprintf(stderr, " --jarvis2c-model FNAME [REQUIRED] model path from which to load Karpathy's jarvis2.c model\n"); + fprintf(stderr, " --jarvis2c-output-model FNAME model path to save the converted jarvis2.c model (default %s')\n", params->fn_jarvis2c_output_model); fprintf(stderr, "\n"); } @@ -827,19 +827,19 @@ static bool params_parse(int argc, char ** argv, struct train_params * params) { break; } params->fn_vocab_model = argv[i]; - } else if (arg == "--llama2c-model") { + } else if (arg == "--jarvis2c-model") { if (++i >= argc) { invalid_param = true; break; } reqd_param_found = true; - params->fn_llama2c_model = argv[i]; - } else if (arg == "--llama2c-output-model") { + params->fn_jarvis2c_model = argv[i]; + } else if (arg == "--jarvis2c-output-model") { if (++i >= argc) { invalid_param = true; break; } - params->fn_llama2c_output_model = argv[i]; + params->fn_jarvis2c_output_model = argv[i]; } else if (arg == "-h" || arg == "--help") { print_usage(argc, argv, &default_params); exit(0); @@ -855,7 +855,7 @@ static bool params_parse(int argc, char ** argv, struct train_params * params) { exit(1); } if (!reqd_param_found){ - fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n"); + fprintf(stderr, "error: please specify a jarvis2.c .bin file to be converted with argument --jarvis2c-model\n"); print_usage(argc, argv, 
&default_params); exit(1); } @@ -882,15 +882,15 @@ int main(int argc, char ** argv) { Config config; TransformerWeights weights = {}; { - LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); - FILE * file = fopen(params.fn_llama2c_model, "rb"); + LOG_INF("%s: Loading jarvis2c model from %s\n", __func__, params.fn_jarvis2c_model); + FILE * file = fopen(params.fn_jarvis2c_model, "rb"); if (!file) { - LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); + LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_jarvis2c_model); return 1; } // read in the config header if (fread(&config, sizeof(Config), 1, file) != 1) { - LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); + LOG_ERR("%s: Unable to read jarvis2c config from %s!\n",__func__,params.fn_jarvis2c_model); return 1; } auto shared_weights = config.vocab_size > 0; @@ -899,17 +899,17 @@ int main(int argc, char ** argv) { // read in the Transformer weights alloc_weights(&weights, &config, shared_weights); if (checkpoint_init_weights(&weights, &config, file, shared_weights)) { - LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); + LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_jarvis2c_model); return 1; } fclose(file); } - struct my_llama_vocab vocab; + struct my_jarvis_vocab vocab; load_vocab(params.fn_vocab_model, &config, &vocab); - struct my_llama_model model; - model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); + struct my_jarvis_model model; + model.hparams.n_vocab = config.vocab_size; //jarvis_n_vocab(lctx); model.hparams.n_ctx = params.n_ctx; model.hparams.n_embd = config.dim; //params.n_embd; model.hparams.n_ff = config.hidden_dim; @@ -929,10 +929,10 @@ int main(int argc, char ** argv) { model.ctx = ggml_init(lcparams); init_model(&model); - model.name = basename(params.fn_llama2c_model); - save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model); + model.name = basename(params.fn_jarvis2c_model); + save_as_jarvis_model(&vocab, &model, &weights, params.fn_jarvis2c_output_model); - LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); + LOG_INF("%s: Saving jarvis.c model file %s in ggml format at %s\n", __func__, params.fn_jarvis2c_model, params.fn_jarvis2c_output_model); ggml_free(model.ctx); return 0; diff --git a/examples/convert-llama2c-to-ggml/CMakeLists.txt b/examples/convert-llama2c-to-ggml/CMakeLists.txt deleted file mode 100644 index a6790e617217e..0000000000000 --- a/examples/convert-llama2c-to-ggml/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-convert-llama2c-to-ggml) -add_executable(${TARGET} convert-llama2c-to-ggml.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md deleted file mode 100644 index 5774ac83c32c8..0000000000000 --- a/examples/convert-llama2c-to-ggml/README.md +++ /dev/null @@ -1,28 +0,0 @@ -## Convert llama2.c model to ggml - -This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default. 
- -To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository: - -`$ make -j` - -After successful compilation, following usage options are available: -``` -usage: ./llama-convert-llama2c-to-ggml [options] - -options: - -h, --help show this help message and exit - --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf') - --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model - --llama2c-output-model FNAME model path to save the converted llama2.c model (default ak_llama_model.bin') -``` - -An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows: - -`$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin` - -Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K). - -Now you can use the model with a command like: - -`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256` diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py index 9ab9ab06edf8f..df8508790211a 100755 --- a/examples/convert_legacy_llama.py +++ b/examples/convert_legacy_llama.py @@ -33,7 +33,7 @@ sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py')) import gguf -from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab +from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, JarvisHfVocab if TYPE_CHECKING: from typing_extensions import Self, TypeAlias @@ -45,7 +45,7 @@ NDArray: TypeAlias = 'np.ndarray[Any, Any]' -ARCH = gguf.MODEL_ARCH.LLAMA +ARCH = gguf.MODEL_ARCH.JARVIS DEFAULT_CONCURRENCY = 8 @@ -130,8 +130,8 @@ def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]: 'I32': DT_I32, } -# TODO: match this with `llama_ftype` -# TODO: rename to LLAMAFileType +# TODO: match this with `jarvis_ftype` +# TODO: rename to JARVISFileType # TODO: move to `gguf.py` @@ -276,7 +276,7 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: rope_finetuned = rope_finetuned, ) - # LLaMA v2 70B params.json + # JARVIS v2 70B params.json # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} @staticmethod def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: @@ -288,18 +288,18 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: f_rope_freq_base = None n_ff = None - # hack to determine LLaMA v1 vs v2 vs CodeLlama + # hack to determine JARVIS v1 vs v2 vs CodeJarvis if config.get("moe"): # Mixtral n_ctx = 32768 elif config.get("rope_theta") == 1000000: - # CodeLlama + # CodeJarvis n_ctx = 16384 elif config["norm_eps"] == 1e-05: - # LLaMA v2 + # JARVIS v2 n_ctx = 4096 else: - # LLaMA v1 + # JARVIS v1 n_ctx = 2048 if "layers.0.feed_forward.w1.weight" in model: @@ -467,7 +467,7 @@ class ModelPlus: def merge_sharded(models: list[LazyModel]) -> LazyModel: - # Original LLaMA models have each file contain one part of each tensor. + # Original JARVIS models have each file contain one part of each tensor. # Use a dict instead of a set to preserve order. 
names = {name: None for model in models for name in model} @@ -772,14 +772,14 @@ def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian. def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None: # Metadata About The Model And Its Provenence - name = "LLaMA" + name = "JARVIS" if metadata is not None and metadata.name is not None: name = metadata.name elif params.path_model is not None: name = params.path_model.name elif params.n_ctx == 4096: - # Heuristic detection of LLaMA v2 model - name = "LLaMA v2" + # Heuristic detection of JARVIS v2 model + name = "JARVIS v2" self.gguf.add_name(name) @@ -1199,7 +1199,7 @@ def load_some_model(path: Path) -> ModelPlus: class VocabFactory: - _VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab] + _VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, JarvisHfVocab] def __init__(self, path: Path): self.path = path @@ -1289,7 +1289,7 @@ def main(args_in: list[str] | None = None) -> None: if np.uint32(1) == np.uint32(1).newbyteorder("<"): # We currently only support Q8_0 output on little endian systems. output_choices.append("q8_0") - parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file") + parser = argparse.ArgumentParser(description="Convert a JARVIS model to a GGML compatible file") parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") @@ -1366,8 +1366,8 @@ def main(args_in: list[str] | None = None) -> None: msg = """\ The model doesn't have a context size, and you didn't specify one with --ctx Please specify one with --ctx: - - LLaMA v1: --ctx 2048 - - LLaMA v2: --ctx 4096""" + - JARVIS v1: --ctx 2048 + - JARVIS v2: --ctx 4096""" parser.error(textwrap.dedent(msg)) params.n_ctx = args.ctx diff --git a/examples/cvector-generator/CMakeLists.txt b/examples/cvector-generator/CMakeLists.txt index 0a559d60c2a6d..ed3bb6abba599 100644 --- a/examples/cvector-generator/CMakeLists.txt +++ b/examples/cvector-generator/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-cvector-generator) +set(TARGET jarvis-cvector-generator) add_executable(${TARGET} cvector-generator.cpp pca.hpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/cvector-generator/README.md b/examples/cvector-generator/README.md index be4dd5250f15f..e7a4f734761e6 100644 --- a/examples/cvector-generator/README.md +++ b/examples/cvector-generator/README.md @@ -3,24 +3,24 @@ This example demonstrates how to generate a control vector using gguf models. 
Related PRs: -- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970) -- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880) -- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514) +- [Add support for control vectors](https://github.com/ggerganov/jarvis.cpp/pull/5970) +- (Issue) [Generate control vector using jarvis.cpp](https://github.com/ggerganov/jarvis.cpp/issues/6880) +- [Add cvector-generator example](https://github.com/ggerganov/jarvis.cpp/pull/7514) ## Examples ```sh # CPU only -./cvector-generator -m ./llama-3.Q4_K_M.gguf +./cvector-generator -m ./jarvis-3.Q4_K_M.gguf # With GPU -./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 +./cvector-generator -m ./jarvis-3.Q4_K_M.gguf -ngl 99 # With advanced options -./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100 +./cvector-generator -m ./jarvis-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100 # Using mean value instead of PCA -./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean +./cvector-generator -m ./jarvis-3.Q4_K_M.gguf --method mean # To see help message ./cvector-generator -h @@ -36,10 +36,10 @@ If you have multiple lines per prompt, you can escape the newline character (cha <|im_start|>system\nYou are in a very good mood today<|im_end|> ``` -Example to use output file with `llama-cli`: +Example to use output file with `jarvis-cli`: (Tips: The control vector works better when apply to layers higher than 10) ```sh -./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31 +./jarvis-cli -m ./jarvis-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31 ``` diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index d1731bba64e1b..e09304aed1058 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -1,6 +1,6 @@ #include "arg.h" #include "common.h" -#include "llama.h" +#include "jarvis.h" #include "ggml.h" #include "pca.hpp" #include "mean.hpp" @@ -28,7 +28,7 @@ // utils template -static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { +static std::string tokens_to_str(jarvis_context * ctx, Iter begin, Iter end) { std::string ret; for (; begin != end; ++begin) { ret += common_token_to_piece(ctx, *begin); @@ -39,10 +39,10 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { static void print_usage(int, char ** argv) { printf("\nexample usage:\n"); - printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]); - printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]); - printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]); - printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]); + printf("\n CPU only: %s -m ./jarvis-3.Q4_K_M.gguf\n", argv[0]); + printf("\n with GPU: %s -m 
./jarvis-3.Q4_K_M.gguf -ngl 99\n", argv[0]); + printf("\n advanced: %s -m ./jarvis-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]); + printf("\n using mean: %s -m ./jarvis-3.Q4_K_M.gguf --method mean\n", argv[0]); printf("\n"); } @@ -266,12 +266,12 @@ struct train_context { }; struct tokenized_prompt { - std::vector tokens_pos; - std::vector tokens_neg; + std::vector tokens_pos; + std::vector tokens_neg; size_t max_seq_len; - tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { - const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); + tokenized_prompt(jarvis_context * ctx, std::string pos, std::string neg) { + const bool add_bos = jarvis_add_bos_token(jarvis_get_model(ctx)); tokens_pos = common_tokenize(ctx, pos, add_bos, true); tokens_neg = common_tokenize(ctx, neg, add_bos, true); max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); @@ -279,10 +279,10 @@ struct tokenized_prompt { padding_seq(ctx, tokens_neg, max_seq_len); } - void padding_seq(llama_context * ctx, std::vector & tokens, size_t len) { + void padding_seq(jarvis_context * ctx, std::vector & tokens, size_t len) { // TODO: customize padding token - std::vector pad_tokens = common_tokenize(ctx, " ", false); - llama_token pad_tok = pad_tokens.back(); + std::vector pad_tokens = common_tokenize(ctx, " ", false); + jarvis_token pad_tok = pad_tokens.back(); while (tokens.size() < len) { tokens.push_back(pad_tok); } @@ -337,9 +337,9 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { return true; } -static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { - llama_kv_cache_clear(ctx); - if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { +static bool get_hidden_layers(jarvis_context * ctx, std::vector & tokens) { + jarvis_kv_cache_clear(ctx); + if (jarvis_decode(ctx, jarvis_batch_get_one(tokens.data(), tokens.size()))) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; } @@ -390,7 +390,7 @@ static int prepare_entries(common_params & params, train_context & ctx_train) { int main(int argc, char ** argv) { common_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { return 1; } @@ -409,21 +409,21 @@ int main(int argc, char ** argv) { params.warmup = false; print_build_info(); - llama_backend_init(); - llama_numa_init(params.numa); + jarvis_backend_init(); + jarvis_numa_init(params.numa); // load the model to get hparams - common_init_result llama_init = common_init_from_params(params); + common_init_result jarvis_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + jarvis_model * model = jarvis_init.model; + jarvis_context * ctx = jarvis_init.context; - // int n_ctx = llama_n_ctx(ctx); - int n_layers = llama_n_layer(model); - int n_embd = llama_n_embd(model); + // int n_ctx = jarvis_n_ctx(ctx); + int n_layers = jarvis_n_layer(model); + int n_embd = jarvis_n_embd(model); // get model hint param (a.k.a model arch name) char model_hint[128]; - llama_model_meta_val_str(model, "general.architecture", model_hint, 128); + jarvis_model_meta_val_str(model, "general.architecture", model_hint, 128); // init train_context train_context ctx_train(n_embd, n_layers); @@ -474,8 +474,8 @@ int main(int argc, char ** argv) { // done with the model, we can now free it to make gain some memory 
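// (at this point get_hidden_layers() has decoded every positive/negative prompt once
//  and the cb_eval callback has copied the per-layer hidden states into ctx_train, so
//  the model and context are not needed for the PCA / mean reduction that follows;
//  freeing them here lowers peak memory before that step)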
printf("Done evaluate prompts, unload model...\n"); - llama_free(ctx); - llama_free_model(model); + jarvis_free(ctx); + jarvis_free_model(model); bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA; @@ -497,7 +497,7 @@ int main(int argc, char ** argv) { // write output vectors to gguf export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint); - llama_backend_free(); + jarvis_backend_free(); return 0; } diff --git a/examples/cvector-generator/mean.hpp b/examples/cvector-generator/mean.hpp index 16be5ce3eecf1..f95fb2dcce6cf 100644 --- a/examples/cvector-generator/mean.hpp +++ b/examples/cvector-generator/mean.hpp @@ -1,5 +1,5 @@ #include "common.h" -#include "llama.h" +#include "jarvis.h" #include "ggml.h" #include diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index f6e307fbc4970..3ea5dc4738570 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -1,5 +1,5 @@ #include "common.h" -#include "llama.h" +#include "jarvis.h" #include "ggml.h" #ifdef GGML_USE_CUDA @@ -290,7 +290,7 @@ static void power_iteration( ggml_gallocr_free(allocr); // TODO @ngxson : The output vector is randomly inverted - // Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171 + // Solution: https://github.com/ggerganov/jarvis.cpp/pull/8069#issuecomment-2185328171 } static void run_pca( diff --git a/examples/deprecation-warning/README.md b/examples/deprecation-warning/README.md index 59918ec2bbf72..2790c72fb7052 100644 --- a/examples/deprecation-warning/README.md +++ b/examples/deprecation-warning/README.md @@ -1,7 +1,7 @@ # Migration notice for binary filenames > [!IMPORTANT] -[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809) +[2024 Jun 12] Binaries have been renamed w/ a `jarvis-` prefix. `main` is now `jarvis-cli`, `server` is `jarvis-server`, etc (https://github.com/ggerganov/jarvis.cpp/pull/7809) This migration was important, but it is a breaking change that may not always be immediately obvious to users. @@ -9,41 +9,41 @@ Please update all scripts and workflows to use the new binary names. 
| Old Filename | New Filename | | ---- | ---- | -| main | llama-cli | -| server | llama-server | -| llama-bench | llama-bench | -| embedding | llama-embedding | -| quantize | llama-quantize | -| tokenize | llama-tokenize | -| export-lora | llama-export-lora | +| main | jarvis-cli | +| server | jarvis-server | +| jarvis-bench | jarvis-bench | +| embedding | jarvis-embedding | +| quantize | jarvis-quantize | +| tokenize | jarvis-tokenize | +| export-lora | jarvis-export-lora | | libllava.a | libllava.a | -| baby-llama | llama-baby-llama | -| batched | llama-batched | -| batched-bench | llama-batched-bench | -| benchmark-matmult | llama-benchmark-matmult | -| convert-llama2c-to-ggml | llama-convert-llama2c-to-ggml | -| eval-callback | llama-eval-callback | -| gbnf-validator | llama-gbnf-validator | -| gguf | llama-gguf | -| gguf-split | llama-gguf-split | -| gritlm | llama-gritlm | -| imatrix | llama-imatrix | -| infill | llama-infill | -| llava-cli | llama-llava-cli | -| lookahead | llama-lookahead | -| lookup | llama-lookup | -| lookup-create | llama-lookup-create | -| lookup-merge | llama-lookup-merge | -| lookup-stats | llama-lookup-stats | -| parallel | llama-parallel | -| passkey | llama-passkey | -| perplexity | llama-perplexity | -| q8dot | llama-q8dot | -| quantize-stats | llama-quantize-stats | -| retrieval | llama-retrieval | -| save-load-state | llama-save-load-state | -| simple | llama-simple | -| speculative | llama-speculative | -| vdot | llama-vdot | +| baby-jarvis | jarvis-baby-jarvis | +| batched | jarvis-batched | +| batched-bench | jarvis-batched-bench | +| benchmark-matmult | jarvis-benchmark-matmult | +| convert-jarvis2c-to-ggml | jarvis-convert-jarvis2c-to-ggml | +| eval-callback | jarvis-eval-callback | +| gbnf-validator | jarvis-gbnf-validator | +| gguf | jarvis-gguf | +| gguf-split | jarvis-gguf-split | +| gritlm | jarvis-gritlm | +| imatrix | jarvis-imatrix | +| infill | jarvis-infill | +| llava-cli | jarvis-llava-cli | +| lookahead | jarvis-lookahead | +| lookup | jarvis-lookup | +| lookup-create | jarvis-lookup-create | +| lookup-merge | jarvis-lookup-merge | +| lookup-stats | jarvis-lookup-stats | +| parallel | jarvis-parallel | +| passkey | jarvis-passkey | +| perplexity | jarvis-perplexity | +| q8dot | jarvis-q8dot | +| quantize-stats | jarvis-quantize-stats | +| retrieval | jarvis-retrieval | +| save-load-state | jarvis-save-load-state | +| simple | jarvis-simple | +| speculative | jarvis-speculative | +| vdot | jarvis-vdot | | tests/test-c.o | tests/test-c.o | diff --git a/examples/deprecation-warning/deprecation-warning.cpp b/examples/deprecation-warning/deprecation-warning.cpp index 11b35d2c22500..088364cd4105c 100644 --- a/examples/deprecation-warning/deprecation-warning.cpp +++ b/examples/deprecation-warning/deprecation-warning.cpp @@ -17,18 +17,18 @@ int main(int argc, char** argv) { filename = filename.substr(pos+1); } - // Append "llama-" to the beginning of filename to get the replacemnt filename - auto replacement_filename = "llama-" + filename; + // Append "jarvis-" to the beginning of filename to get the replacemnt filename + auto replacement_filename = "jarvis-" + filename; - // The exception is if the filename is "main", then our replacement filename is "llama-cli" + // The exception is if the filename is "main", then our replacement filename is "jarvis-cli" if (filename == "main") { - replacement_filename = "llama-cli"; + replacement_filename = "jarvis-cli"; } fprintf(stdout, "\n"); fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", 
filename.c_str()); fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str()); - fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n"); + fprintf(stdout, " See https://github.com/ggerganov/jarvis.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n"); fprintf(stdout, "\n"); return EXIT_FAILURE; diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 8256e789ad33a..3c43d82e38f4f 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-embedding) +set(TARGET jarvis-embedding) add_executable(${TARGET} embedding.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/embedding/README.md b/examples/embedding/README.md index 12b372bf1df42..40589f6ce4f81 100644 --- a/examples/embedding/README.md +++ b/examples/embedding/README.md @@ -1,6 +1,6 @@ -# llama.cpp/example/embedding +# jarvis.cpp/example/embedding -This example demonstrates generate high-dimensional embedding vector of a given text with llama.cpp. +This example demonstrates generate high-dimensional embedding vector of a given text with jarvis.cpp. ## Quick Start @@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor ### Unix-based systems (Linux, macOS, etc.): ```bash -./llama-embedding -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>/dev/null +./jarvis-embedding -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>/dev/null ``` ### Windows: ```powershell -llama-embedding.exe -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>$null +jarvis-embedding.exe -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>$null ``` The above command will output space-separated float values. @@ -50,11 +50,11 @@ The above command will output space-separated float values. 
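For readers following the `embedding.cpp` changes further down rather than the CLI: after decoding, the example pulls either one embedding per token or one pooled embedding per sequence, depending on the pooling mode. Below is a rough sketch of that retrieval step, assuming the renamed functions exactly as they appear in the `batch_decode` hunk below; batch setup, normalization and error handling are omitted.

```cpp
// Sketch of the embedding-retrieval step from batch_decode() in embedding.cpp
// below. Assumes ctx and batch were prepared as in that file; normalization
// and error handling are omitted.
#include "jarvis.h"

static const float * get_embedding(jarvis_context * ctx, const jarvis_batch & batch, int i) {
    const enum jarvis_pooling_type pooling_type = jarvis_pooling_type(ctx);

    if (pooling_type == JARVIS_POOLING_TYPE_NONE) {
        // no pooling: one embedding vector per token position
        return jarvis_get_embeddings_ith(ctx, i);
    }

    // pooled (mean, cls, rank, ...): one embedding vector per sequence
    return jarvis_get_embeddings_seq(ctx, batch.seq_id[i][0]);
}
```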
### Unix-based systems (Linux, macOS, etc.): ```bash -./llama-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null +./jarvis-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null ``` ### Windows: ```powershell -llama-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null +jarvis-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null ``` diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 3f18fc6a70878..77dafad011a79 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,7 +1,7 @@ #include "arg.h" #include "common.h" #include "log.h" -#include "llama.h" +#include "jarvis.h" #include @@ -25,30 +25,30 @@ static std::vector split_lines(const std::string & s, const std::st return lines; } -static void batch_add_seq(llama_batch & batch, const std::vector & tokens, llama_seq_id seq_id) { +static void batch_add_seq(jarvis_batch & batch, const std::vector & tokens, jarvis_seq_id seq_id) { size_t n_tokens = tokens.size(); for (size_t i = 0; i < n_tokens; i++) { common_batch_add(batch, tokens[i], i, { seq_id }, true); } } -static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) { - const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); - const struct llama_model * model = llama_get_model(ctx); +static void batch_decode(jarvis_context * ctx, jarvis_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) { + const enum jarvis_pooling_type pooling_type = jarvis_pooling_type(ctx); + const struct jarvis_model * model = jarvis_get_model(ctx); // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); + jarvis_kv_cache_clear(ctx); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); - if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) { + if (jarvis_model_has_encoder(model) && !jarvis_model_has_decoder(model)) { // encoder-only model - if (llama_encode(ctx, batch) < 0) { + if (jarvis_encode(ctx, batch) < 0) { LOG_ERR("%s : failed to encode\n", __func__); } - } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) { + } else if (!jarvis_model_has_encoder(model) && jarvis_model_has_decoder(model)) { // decoder-only model - if (llama_decode(ctx, batch) < 0) { + if (jarvis_decode(ctx, batch) < 0) { LOG_ERR("%s : failed to decode\n", __func__); } } @@ -61,14 +61,14 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu const float * embd = nullptr; int embd_pos = 0; - if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + if (pooling_type == JARVIS_POOLING_TYPE_NONE) { // try to get token embeddings - embd = llama_get_embeddings_ith(ctx, i); + embd = jarvis_get_embeddings_ith(ctx, i); embd_pos = i; GGML_ASSERT(embd != NULL && "failed to get token embeddings"); } else { // try to get 
sequence embeddings - supported only when pooling_type is not NONE - embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + embd = jarvis_get_embeddings_seq(ctx, batch.seq_id[i][0]); embd_pos = batch.seq_id[i][0]; GGML_ASSERT(embd != NULL && "failed to get sequence embeddings"); } @@ -81,7 +81,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu int main(int argc, char ** argv) { common_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_EMBEDDING)) { return 1; } @@ -91,25 +91,25 @@ int main(int argc, char ** argv) { // For non-causal models, batch size must be equal to ubatch size params.n_ubatch = params.n_batch; - llama_backend_init(); - llama_numa_init(params.numa); + jarvis_backend_init(); + jarvis_numa_init(params.numa); // load the model - common_init_result llama_init = common_init_from_params(params); + common_init_result jarvis_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + jarvis_model * model = jarvis_init.model; + jarvis_context * ctx = jarvis_init.context; if (model == NULL) { LOG_ERR("%s: unable to load model\n", __func__); return 1; } - const int n_ctx_train = llama_n_ctx_train(model); - const int n_ctx = llama_n_ctx(ctx); + const int n_ctx_train = jarvis_n_ctx_train(model); + const int n_ctx = jarvis_n_ctx(ctx); - const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); + const enum jarvis_pooling_type pooling_type = jarvis_pooling_type(ctx); - if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) { + if (jarvis_model_has_encoder(model) && jarvis_model_has_decoder(model)) { LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__); return 1; } @@ -147,7 +147,7 @@ int main(int argc, char ** argv) { // check if the last token is SEP // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true' for (auto & inp : inputs) { - if (inp.empty() || inp.back() != llama_token_sep(model)) { + if (inp.empty() || inp.back() != jarvis_token_sep(model)) { LOG_WRN("%s: last token in the prompt is not SEP\n", __func__); LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); } @@ -167,11 +167,11 @@ int main(int argc, char ** argv) { // initialize batch const int n_prompts = prompts.size(); - struct llama_batch batch = llama_batch_init(n_batch, 0, 1); + struct jarvis_batch batch = jarvis_batch_init(n_batch, 0, 1); // count number of embeddings int n_embd_count = 0; - if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + if (pooling_type == JARVIS_POOLING_TYPE_NONE) { for (int k = 0; k < n_prompts; k++) { n_embd_count += inputs[k].size(); } @@ -180,7 +180,7 @@ int main(int argc, char ** argv) { } // allocate output - const int n_embd = llama_n_embd(model); + const int n_embd = jarvis_n_embd(model); std::vector embeddings(n_embd_count * n_embd, 0); float * emb = embeddings.data(); @@ -197,7 +197,7 @@ int main(int argc, char ** argv) { if (batch.n_tokens + n_toks > n_batch) { float * out = emb + e * n_embd; batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); - e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s; + e += pooling_type == JARVIS_POOLING_TYPE_NONE ? 
batch.n_tokens : s; s = 0; common_batch_clear(batch); } @@ -214,7 +214,7 @@ int main(int argc, char ** argv) { if (params.embd_out.empty()) { LOG("\n"); - if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + if (pooling_type == JARVIS_POOLING_TYPE_NONE) { for (int j = 0; j < n_embd_count; j++) { LOG("embedding %d: ", j); for (int i = 0; i < std::min(3, n_embd); i++) { @@ -234,7 +234,7 @@ int main(int argc, char ** argv) { } LOG("\n"); } - } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) { + } else if (pooling_type == JARVIS_POOLING_TYPE_RANK) { for (int j = 0; j < n_embd_count; j++) { // NOTE: if you change this log - update the tests in ci/run.sh LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]); @@ -312,13 +312,13 @@ int main(int argc, char ** argv) { } LOG("\n"); - llama_perf_context_print(ctx); + jarvis_perf_context_print(ctx); // clean up - llama_batch_free(batch); - llama_free(ctx); - llama_free_model(model); - llama_backend_free(); + jarvis_batch_free(batch); + jarvis_free(ctx); + jarvis_free_model(model); + jarvis_backend_free(); return 0; } diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt index a48753d38e16e..46b47b90b94ba 100644 --- a/examples/eval-callback/CMakeLists.txt +++ b/examples/eval-callback/CMakeLists.txt @@ -1,9 +1,9 @@ -set(TARGET llama-eval-callback) +set(TARGET jarvis-eval-callback) add_executable(${TARGET} eval-callback.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) set(TEST_TARGET test-eval-callback) -add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) +add_test(NAME ${TEST_TARGET} COMMAND jarvis-eval-callback --hf-repo ggml-org/models --hf-file tinyjarviss/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) diff --git a/examples/eval-callback/README.md b/examples/eval-callback/README.md index 63a57ad6b68e5..df7946f3abc3a 100644 --- a/examples/eval-callback/README.md +++ b/examples/eval-callback/README.md @@ -1,4 +1,4 @@ -# llama.cpp/examples/eval-callback +# jarvis.cpp/examples/eval-callback A simple example which demonstrates how to use callback during the inference. It simply prints to the console all operations and tensor data. @@ -6,7 +6,7 @@ It simply prints to the console all operations and tensor data. Usage: ```shell -llama-eval-callback \ +jarvis-eval-callback \ --hf-repo ggml-org/models \ --hf-file phi-2/ggml-model-q4_0.gguf \ --model phi-2-q4_0.gguf \ @@ -20,12 +20,12 @@ Will print: ```shell llm_load_tensors: offloaded 33/33 layers to GPU ... -llama_new_context_with_model: n_ctx = 512 +jarvis_new_context_with_model: n_ctx = 512 ... 
-llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB -llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB -llama_new_context_with_model: graph nodes = 1225 -llama_new_context_with_model: graph splits = 2 +jarvis_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB +jarvis_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB +jarvis_new_context_with_model: graph nodes = 1225 +jarvis_new_context_with_model: graph splits = 2 ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1} [ [ diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index c08e3e5f675ed..a4cb2d6131438 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -1,7 +1,7 @@ #include "arg.h" #include "common.h" #include "log.h" -#include "llama.h" +#include "jarvis.h" #include "ggml.h" #include @@ -126,12 +126,12 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { return true; } -static bool run(llama_context * ctx, const common_params & params) { - const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); +static bool run(jarvis_context * ctx, const common_params & params) { + const bool add_bos = jarvis_add_bos_token(jarvis_get_model(ctx)); - std::vector tokens = common_tokenize(ctx, params.prompt, add_bos); + std::vector tokens = common_tokenize(ctx, params.prompt, add_bos); - if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { + if (jarvis_decode(ctx, jarvis_batch_get_one(tokens.data(), tokens.size()))) { LOG_ERR("%s : failed to eval\n", __func__); return false; } @@ -144,14 +144,14 @@ int main(int argc, char ** argv) { common_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_COMMON)) { return 1; } common_init(); - llama_backend_init(); - llama_numa_init(params.numa); + jarvis_backend_init(); + jarvis_numa_init(params.numa); // pass the callback to the backend scheduler // it will be executed for each node during the graph computation @@ -160,10 +160,10 @@ int main(int argc, char ** argv) { params.warmup = false; // init - common_init_result llama_init = common_init_from_params(params); + common_init_result jarvis_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + jarvis_model * model = jarvis_init.model; + jarvis_context * ctx = jarvis_init.context; if (model == nullptr || ctx == nullptr) { LOG_ERR("%s : failed to init\n", __func__); return 1; @@ -182,12 +182,12 @@ int main(int argc, char ** argv) { } LOG("\n"); - llama_perf_context_print(ctx); + jarvis_perf_context_print(ctx); - llama_free(ctx); - llama_free_model(model); + jarvis_free(ctx); + jarvis_free_model(model); - llama_backend_free(); + jarvis_backend_free(); return 0; } diff --git a/examples/export-lora/CMakeLists.txt b/examples/export-lora/CMakeLists.txt index 1cef6e71694e2..babb850e94ede 100644 --- a/examples/export-lora/CMakeLists.txt +++ b/examples/export-lora/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-export-lora) +set(TARGET jarvis-export-lora) add_executable(${TARGET} export-lora.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} 
PRIVATE cxx_std_11) diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md index 7dce99c9a9e61..7df4426e973d2 100644 --- a/examples/export-lora/README.md +++ b/examples/export-lora/README.md @@ -3,7 +3,7 @@ Apply LORA adapters to base model and export the resulting model. ``` -usage: llama-export-lora [options] +usage: jarvis-export-lora [options] options: -m, --model model path from which to load base model (default '') @@ -16,16 +16,16 @@ options: For example: ```bash -./bin/llama-export-lora \ - -m open-llama-3b-v2.gguf \ - -o open-llama-3b-v2-english2tokipona-chat.gguf \ - --lora lora-open-llama-3b-v2-english2tokipona-chat-LATEST.gguf +./bin/jarvis-export-lora \ + -m open-jarvis-3b-v2.gguf \ + -o open-jarvis-3b-v2-english2tokipona-chat.gguf \ + --lora lora-open-jarvis-3b-v2-english2tokipona-chat-LATEST.gguf ``` Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters: ```bash -./bin/llama-export-lora \ +./bin/jarvis-export-lora \ -m your_base_model.gguf \ -o your_merged_model.gguf \ --lora-scaled lora_task_A.gguf 0.5 \ diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 67662313d075c..d024a7e85d574 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -186,10 +186,10 @@ struct lora_merge_ctx { // prepare metadata gguf_set_kv(ctx_out, base_model.ctx_gguf); // output is forced to f16 for now - gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16); + gguf_set_val_u32(ctx_out, "general.file_type", JARVIS_FTYPE_MOSTLY_F16); // check if all lora adapters have the same tensors - // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777 + // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/jarvis.cpp/pull/8607#discussion_r1686027777 static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. 
Please merge the adapter one-by-one instead of merging all at once."; if (adapters.size() > 1) { for (size_t i = 1; i < adapters.size(); ++i) { @@ -402,7 +402,7 @@ static void print_usage(int, char ** argv) { int main(int argc, char ** argv) { common_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_EXPORT_LORA, print_usage)) { return 1; } diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt index 4edd6ec7394c5..870d93220a544 100644 --- a/examples/gbnf-validator/CMakeLists.txt +++ b/examples/gbnf-validator/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-gbnf-validator) +set(TARGET jarvis-gbnf-validator) add_executable(${TARGET} gbnf-validator.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp index 7493af9d3aec3..bc4e028e3342d 100644 --- a/examples/gbnf-validator/gbnf-validator.cpp +++ b/examples/gbnf-validator/gbnf-validator.cpp @@ -1,5 +1,5 @@ #include "unicode.h" -#include "llama-grammar.h" +#include "jarvis-grammar.h" #include #include @@ -8,17 +8,17 @@ #include #include -static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) { +static bool jarvis_grammar_validate(struct jarvis_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) { const auto cpts = unicode_cpts_from_utf8(input_str); - const llama_grammar_rules & rules = llama_grammar_get_rules (grammar); - llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar); + const jarvis_grammar_rules & rules = jarvis_grammar_get_rules (grammar); + jarvis_grammar_stacks & stacks_cur = jarvis_grammar_get_stacks(grammar); size_t pos = 0; for (const auto & cpt : cpts) { - const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy + const jarvis_grammar_stacks stacks_prev = jarvis_grammar_get_stacks(grammar); // copy - llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur); + jarvis_grammar_accept(rules, stacks_prev, cpt, stacks_cur); if (stacks_cur.empty()) { error_pos = pos; @@ -80,9 +80,9 @@ int main(int argc, char** argv) { grammar_str = buffer.str(); } - llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root"); + jarvis_grammar * grammar = jarvis_grammar_init_impl(nullptr, grammar_str.c_str(), "root"); if (grammar == nullptr) { - throw std::runtime_error("Failed to initialize llama_grammar"); + throw std::runtime_error("Failed to initialize jarvis_grammar"); } // Read the input file std::string input_str; @@ -97,7 +97,7 @@ int main(int argc, char** argv) { // Validate the input string against the grammar size_t error_pos; std::string error_msg; - bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg); + bool is_valid = jarvis_grammar_validate(grammar, input_str, error_pos, error_msg); if (is_valid) { fprintf(stdout, "Input string is valid according to the grammar.\n"); @@ -106,7 +106,7 @@ int main(int argc, char** argv) { } // Clean up - llama_grammar_free_impl(grammar); + jarvis_grammar_free_impl(grammar); return 0; } diff --git a/examples/gen-docs/CMakeLists.txt 
b/examples/gen-docs/CMakeLists.txt index c94cda7764341..45c2a215c43c1 100644 --- a/examples/gen-docs/CMakeLists.txt +++ b/examples/gen-docs/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-gen-docs) +set(TARGET jarvis-gen-docs) add_executable(${TARGET} gen-docs.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp index 77c59a836e50a..b02918844f690 100644 --- a/examples/gen-docs/gen-docs.cpp +++ b/examples/gen-docs/gen-docs.cpp @@ -47,7 +47,7 @@ static void write_table(std::ofstream & file, std::vector & opts) } } -static void export_md(std::string fname, llama_example ex) { +static void export_md(std::string fname, jarvis_example ex) { std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc); common_params params; @@ -57,7 +57,7 @@ static void export_md(std::string fname, llama_example ex) { std::vector sparam_options; std::vector specific_options; for (auto & opt : ctx_arg.options) { - // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example + // in case multiple JARVIS_EXAMPLE_* are set, we prioritize the JARVIS_EXAMPLE_* matching current example if (opt.is_sparam) { sparam_options.push_back(&opt); } else if (opt.in_example(ctx_arg.ex)) { @@ -76,8 +76,8 @@ static void export_md(std::string fname, llama_example ex) { } int main(int, char **) { - export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN); - export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER); + export_md("autogen-main.md", JARVIS_EXAMPLE_MAIN); + export_md("autogen-server.md", JARVIS_EXAMPLE_SERVER); return 0; } diff --git a/examples/gguf-hash/CMakeLists.txt b/examples/gguf-hash/CMakeLists.txt index 633f4553594bb..c51249495fccf 100644 --- a/examples/gguf-hash/CMakeLists.txt +++ b/examples/gguf-hash/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET llama-gguf-hash) +set(TARGET jarvis-gguf-hash) add_executable(${TARGET} gguf-hash.cpp) install(TARGETS ${TARGET} RUNTIME) diff --git a/examples/gguf-hash/README.md b/examples/gguf-hash/README.md index 9871651e38ba8..a9ceb24af3183 100644 --- a/examples/gguf-hash/README.md +++ b/examples/gguf-hash/README.md @@ -1,5 +1,5 @@ -# llama-gguf-hash +# jarvis-gguf-hash CLI to hash GGUF files to detect difference on a per model and per tensor level. 
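As a rough illustration of the per-tensor idea behind this tool (one hash line per tensor, which is what the manifests further below contain), here is a minimal, self-contained C++ sketch. It uses FNV-1a purely as a stand-in for the xxh64/sha1/sha256 hashes the real tool supports, and the `tensor_view` struct and hard-coded tensor data are hypothetical placeholders rather than part of the gguf/jarvis API.

```cpp
// Minimal sketch of per-tensor hashing (illustrative only).
// FNV-1a stands in for the real xxh64/sha1/sha256 hashes; `tensor_view`
// and the hard-coded data are hypothetical, not the actual GGUF API.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

struct tensor_view {
    std::string name;            // e.g. "test.gguf:tensor_0"
    std::vector<uint8_t> data;   // raw tensor bytes as stored in the file
};

static uint64_t fnv1a64(const uint8_t * p, size_t n) {
    uint64_t h = 14695981039346656037ull;   // FNV offset basis
    for (size_t i = 0; i < n; ++i) {
        h ^= p[i];
        h *= 1099511628211ull;               // FNV prime
    }
    return h;
}

int main() {
    // hypothetical tensors; the real tool reads them from the GGUF file
    std::vector<tensor_view> tensors = {
        {"test.gguf:tensor_0", {0x01, 0x02, 0x03}},
        {"test.gguf:tensor_1", {0x04, 0x05, 0x06}},
    };

    // one hash line per tensor -> a manifest that can later be re-checked
    for (const auto & t : tensors) {
        printf("fnv64 %016llx %s\n",
               (unsigned long long) fnv1a64(t.data.data(), t.data.size()),
               t.name.c_str());
    }
    return 0;
}
```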
@@ -38,8 +38,8 @@ For Maintainers: For Model Creators: - Optional consistent UUID generation based on model tensor content - This is served by UUIDv5 which is useful for databases keys - - llama.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5` - - Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Llama.cpp` + - jarvis.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5` + - Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Jarvis.cpp` For Model Users: - Assurance of tensor layer integrity even if metadata was updated @@ -57,14 +57,14 @@ For Model Users: ## Compile Example ```bash -cmake -B build -DCMAKE_BUILD_TYPE=Debug -DLLAMA_FATAL_WARNINGS=ON +cmake -B build -DCMAKE_BUILD_TYPE=Debug -DJARVIS_FATAL_WARNINGS=ON make -C build clean -make -C build llama-gguf-hash VERBOSE=1 -./build/bin/llama-gguf-hash test.gguf -./build/bin/llama-gguf-hash --xxh64 test.gguf -./build/bin/llama-gguf-hash --sha1 test.gguf -./build/bin/llama-gguf-hash --uuid test.gguf -./build/bin/llama-gguf-hash --sha256 test.gguf +make -C build jarvis-gguf-hash VERBOSE=1 +./build/bin/jarvis-gguf-hash test.gguf +./build/bin/jarvis-gguf-hash --xxh64 test.gguf +./build/bin/jarvis-gguf-hash --sha1 test.gguf +./build/bin/jarvis-gguf-hash --uuid test.gguf +./build/bin/jarvis-gguf-hash --sha256 test.gguf ``` ## Generation and Verification Example @@ -72,7 +72,7 @@ make -C build llama-gguf-hash VERBOSE=1 To generate we may use this command ```bash -./llama-gguf-hash --all test.gguf > test.gguf.manifest +./jarvis-gguf-hash --all test.gguf > test.gguf.manifest ``` Which would generate a manifest that looks like below, which contains multiple hash type and per tensor layer hashes as well @@ -117,7 +117,7 @@ sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test We can then use the normal check command which will by default check for the highest security strength hash and verify against that: ```bash -$ ./llama-gguf-hash --check test.gguf.manifest test.gguf +$ ./jarvis-gguf-hash --check test.gguf.manifest test.gguf manifest test.gguf.manifest sha256 sha1 xxh64 sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok @@ -137,7 +137,7 @@ Verification results for test.gguf.manifest - Success Or we may explicitly ask for a faster hash like: ```bash -$ ./llama-gguf-hash --check test.gguf.manifest --xxh64 test.gguf +$ ./jarvis-gguf-hash --check test.gguf.manifest --xxh64 test.gguf manifest test.gguf.manifest sha256 sha1 xxh64 xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok @@ -157,7 +157,7 @@ Verification results for test.gguf.manifest - Success Or maybe we want to just check that all the hash is valid: ```bash -$./llama-gguf-hash --check test.gguf.manifest --all test.gguf.manifest +$./jarvis-gguf-hash --check test.gguf.manifest --all test.gguf.manifest manifest test.gguf.manifest sha256 sha1 xxh64 xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0 - Ok diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp index e96c75117f533..e7e3cd576c3da 100644 --- a/examples/gguf-hash/gguf-hash.cpp +++ b/examples/gguf-hash/gguf-hash.cpp @@ -24,9 +24,9 @@ extern "C" { #endif -// uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp') -#define UUID_NAMESPACE_LLAMA_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5" -#define 
UUID_NAMESPACE_LLAMA_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5 +// uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Jarvis.cpp') +#define UUID_NAMESPACE_JARVIS_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5" +#define UUID_NAMESPACE_JARVIS_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5 #define HASH_TYPE_SHA256_STR "sha256" @@ -320,7 +320,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) { // sha1 for uuid init SHA1_CTX sha1_for_uuid_ctx; if (hash_params.uuid) { - unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX}; + unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_JARVIS_CPP_HEX}; SHA1Init(&sha1_for_uuid_ctx); SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace)); } diff --git a/examples/gguf-split/CMakeLists.txt b/examples/gguf-split/CMakeLists.txt index f63887da7dfca..e1ed69f8df477 100644 --- a/examples/gguf-split/CMakeLists.txt +++ b/examples/gguf-split/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-gguf-split) +set(TARGET jarvis-gguf-split) add_executable(${TARGET} gguf-split.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 7e62657e118a4..e44fc83f1bed9 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -1,4 +1,4 @@ -#include "llama.h" +#include "jarvis.h" #include "common.h" #include @@ -99,8 +99,8 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p split_print_usage(argv[0]); exit(0); } else if (arg == "--version") { - fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + fprintf(stderr, "version: %d (%s)\n", JARVIS_BUILD_NUMBER, JARVIS_COMMIT); + fprintf(stderr, "built with %s for %s\n", JARVIS_COMPILER, JARVIS_BUILD_TARGET); exit(0); } else if (arg == "--dry-run") { arg_found = true; @@ -308,7 +308,7 @@ struct split_strategy { for (auto & ctx_out : ctx_outs) { // construct file path char split_path[PATH_MAX] = {0}; - llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split); + jarvis_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split); // open the output file printf("Writing file %s ... 
", split_path); @@ -430,7 +430,7 @@ static void gguf_merge(const split_params & split_params) { }; if (i_split > 0) { - llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); + jarvis_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); } fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path); @@ -470,7 +470,7 @@ static void gguf_merge(const split_params & split_params) { } // Verify the file naming and extract split_prefix - if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) { + if (!jarvis_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) { fprintf(stderr, "\n%s: unexpected input file name: %s" " i_split=%d" " n_split=%d\n", __func__, @@ -508,7 +508,7 @@ static void gguf_merge(const split_params & split_params) { // Write tensors data for (int i_split = 0; i_split < n_split; i_split++) { - llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); + jarvis_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); std::ifstream f_input(split_path, std::ios::binary); if (!f_input.is_open()) { fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path); diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh index d5a92d6051063..246e9a3573ec6 100755 --- a/examples/gguf-split/tests.sh +++ b/examples/gguf-split/tests.sh @@ -18,8 +18,8 @@ fi set -x -SPLIT=$1/llama-gguf-split -MAIN=$1/llama-cli +SPLIT=$1/jarvis-gguf-split +MAIN=$1/jarvis-cli WORK_PATH=$TMP_DIR/gguf-split ROOT_DIR=$(realpath $(dirname $0)/../../) diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt index a9569b411956b..3cb82c8919c3b 100644 --- a/examples/gguf/CMakeLists.txt +++ b/examples/gguf/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET llama-gguf) +set(TARGET jarvis-gguf) add_executable(${TARGET} gguf.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/gritlm/CMakeLists.txt b/examples/gritlm/CMakeLists.txt index 86dfddca346fe..0039c26030fcf 100644 --- a/examples/gritlm/CMakeLists.txt +++ b/examples/gritlm/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-gritlm) +set(TARGET jarvis-gritlm) add_executable(${TARGET} gritlm.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gritlm/README.md b/examples/gritlm/README.md index 786ba57363def..88fde2e28aafc 100644 --- a/examples/gritlm/README.md +++ b/examples/gritlm/README.md @@ -26,7 +26,7 @@ $ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --ou Run the example using the downloaded model: ```console -$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf +$ ./jarvis-gritlm -m models/gritlm-7b_q4_1.gguf Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103 diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 6e42fa0734ecb..58df109196ff9 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -1,39 +1,39 @@ #include "arg.h" #include "common.h" -#include "llama.h" 
+#include "jarvis.h" #include #include // #define GRIT_DEBUG -static std::vector> encode(llama_context * ctx, const std::vector & sentences, const std::string & instruction) { +static std::vector> encode(jarvis_context * ctx, const std::vector & sentences, const std::string & instruction) { std::vector> result; - const llama_model * model = llama_get_model(ctx); + const jarvis_model * model = jarvis_get_model(ctx); - llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); + jarvis_batch batch = jarvis_batch_init(jarvis_n_batch(ctx), 0, 1); for (uint64_t i = 0; i < sentences.size(); i++) { common_batch_clear(batch); const std::string input_string = instruction + sentences[i]; - std::vector inputs = common_tokenize(model, input_string, true, false); + std::vector inputs = common_tokenize(model, input_string, true, false); const int32_t n_toks = inputs.size(); // GritLM seems to have EOS = "" // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18 - // inputs.push_back(llama_token_eos(model)); + // inputs.push_back(jarvis_token_eos(model)); // we want to ignore instruction tokens for mean pooling const int32_t n_inst = common_tokenize(model, instruction, true, false).size(); #ifdef GRIT_DEBUG // debug tokens - should be matching as referenced in the GritLM sample - std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) { - std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str()); + std::for_each(inputs.begin(), inputs.end(), [&ctx](jarvis_token t) { + std::printf("[%u:%s]", t, jarvis_token_to_piece(ctx, t).c_str()); }); std::printf("\n"); #endif @@ -44,22 +44,22 @@ static std::vector> encode(llama_context * ctx, const std::ve } // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); - llama_set_embeddings(ctx, true); - llama_set_causal_attn(ctx, false); + jarvis_kv_cache_clear(ctx); + jarvis_set_embeddings(ctx, true); + jarvis_set_causal_attn(ctx, false); // run model - llama_decode(ctx, batch); + jarvis_decode(ctx, batch); // get embedding dimensions - uint64_t n_embd = llama_n_embd(model); + uint64_t n_embd = jarvis_n_embd(model); // allocate embedding output std::vector emb_unorm(n_embd, 0.0f); // sum up all token embeddings for (int32_t k = n_inst; k < n_toks; k++) { - float * emb = llama_get_embeddings_ith(ctx, k); + float * emb = jarvis_get_embeddings_ith(ctx, k); for (uint64_t j = 0; j < n_embd; j++) { emb_unorm[j] += emb[j]; } @@ -88,24 +88,24 @@ static std::vector> encode(llama_context * ctx, const std::ve #endif } - llama_batch_free(batch); + jarvis_batch_free(batch); return result; } -static std::string generate(llama_context * ctx, llama_sampler * smpl, const std::string & prompt, bool stream) { +static std::string generate(jarvis_context * ctx, jarvis_sampler * smpl, const std::string & prompt, bool stream) { std::string result; - const llama_model * model = llama_get_model(ctx); - llama_token eos_token = llama_token_eos(model); + const jarvis_model * model = jarvis_get_model(ctx); + jarvis_token eos_token = jarvis_token_eos(model); - llama_kv_cache_clear(ctx); - llama_set_embeddings(ctx, false); - llama_set_causal_attn(ctx, true); + jarvis_kv_cache_clear(ctx); + jarvis_set_embeddings(ctx, false); + jarvis_set_causal_attn(ctx, true); - llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); + jarvis_batch bat = jarvis_batch_init(jarvis_n_batch(ctx), 0, 1); - std::vector inputs = common_tokenize(model, prompt, false, true); + std::vector inputs = 
common_tokenize(model, prompt, false, true); int32_t i_current_token = 0; while (true) { @@ -119,9 +119,9 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std } inputs.clear(); - llama_decode(ctx, bat); + jarvis_decode(ctx, bat); - llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1); + jarvis_token token = jarvis_sampler_sample(smpl, ctx, bat.n_tokens - 1); if (token == eos_token) { break; @@ -142,7 +142,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std std::printf("\n"); } - llama_batch_free(bat); + jarvis_batch_free(bat); return result; } @@ -154,29 +154,29 @@ static std::string gritlm_instruction(const std::string & instruction) { int main(int argc, char * argv[]) { common_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_COMMON)) { return 1; } common_init(); - llama_model_params mparams = common_model_params_to_llama(params); - llama_context_params cparams = common_context_params_to_llama(params); + jarvis_model_params mparams = common_model_params_to_jarvis(params); + jarvis_context_params cparams = common_context_params_to_jarvis(params); - llama_backend_init(); + jarvis_backend_init(); - llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); + jarvis_model * model = jarvis_load_model_from_file(params.model.c_str(), mparams); // create generation context - llama_context * ctx = llama_new_context_with_model(model, cparams); + jarvis_context * ctx = jarvis_new_context_with_model(model, cparams); - auto sparams = llama_sampler_chain_default_params(); + auto sparams = jarvis_sampler_chain_default_params(); sparams.no_perf = false; - llama_sampler * smpl = llama_sampler_chain_init(sparams); + jarvis_sampler * smpl = jarvis_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl, llama_sampler_init_greedy()); + jarvis_sampler_chain_add(smpl, jarvis_sampler_init_greedy()); // ### Embedding/Representation ### // samples taken from: https://github.com/ContextualAI/gritlm#basic @@ -197,7 +197,7 @@ int main(int argc, char * argv[]) { const std::vector> d_rep = encode(ctx, documents, gritlm_instruction("")); const std::vector> q_rep = encode(ctx, queries, gritlm_instruction(instruction)); - const int n_embd = llama_n_embd(model); + const int n_embd = jarvis_n_embd(model); const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); @@ -217,10 +217,10 @@ int main(int argc, char * argv[]) { std::string response = generate(ctx, smpl, prompt, true); } - llama_sampler_free(smpl); - llama_free(ctx); - llama_free_model(model); - llama_backend_free(); + jarvis_sampler_free(smpl); + jarvis_free(ctx); + jarvis_free_model(model); + jarvis_backend_free(); return 0; } diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt index d4c8265bdb9d2..c03c64826c129 100644 --- a/examples/imatrix/CMakeLists.txt +++ b/examples/imatrix/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-imatrix) +set(TARGET jarvis-imatrix) add_executable(${TARGET} imatrix.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/imatrix/README.md 
b/examples/imatrix/README.md index bb5faec94c20a..2781dce75e951 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -1,12 +1,12 @@ -# llama.cpp/examples/imatrix +# jarvis.cpp/examples/imatrix Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantized models. -More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861 +More information is available here: https://github.com/ggerganov/jarvis.cpp/pull/4861 ## Usage ``` -./llama-imatrix \ +./jarvis-imatrix \ -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \ [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \ [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...] @@ -28,8 +28,8 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument GGML_CUDA=1 make -j # generate importance matrix (imatrix.dat) -./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 +./jarvis-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 # use the imatrix to perform a Q4_K_M quantization -./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m +./jarvis-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m ``` diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 70ff47768c02b..437651a750227 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -1,7 +1,7 @@ #include "arg.h" #include "common.h" #include "log.h" -#include "llama.h" +#include "jarvis.h" #include #include @@ -100,7 +100,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * const float * data = is_host ? (const float *) src1->data : m_src1_data.data(); // this has been adapted to the new format of storing merged experts in a single 3d tensor - // ref: https://github.com/ggerganov/llama.cpp/pull/6387 + // ref: https://github.com/ggerganov/jarvis.cpp/pull/6387 if (t->op == GGML_OP_MUL_MAT_ID) { // ids -> [n_experts_used, n_tokens] // src1 -> [cols, n_expert_used, n_tokens] @@ -428,15 +428,15 @@ static void process_logits( } } -static bool compute_imatrix(llama_context * ctx, const common_params & params) { - const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); - GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); - const int n_ctx = llama_n_ctx(ctx); +static bool compute_imatrix(jarvis_context * ctx, const common_params & params) { + const bool add_bos = jarvis_add_bos_token(jarvis_get_model(ctx)); + GGML_ASSERT(!jarvis_add_eos_token(jarvis_get_model(ctx))); + const int n_ctx = jarvis_n_ctx(ctx); auto tim1 = std::chrono::high_resolution_clock::now(); LOG_INF("%s: tokenizing the input ..\n", __func__); - std::vector tokens = common_tokenize(ctx, params.prompt, true); + std::vector tokens = common_tokenize(ctx, params.prompt, true); auto tim2 = std::chrono::high_resolution_clock::now(); LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); @@ -467,7 +467,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const int n_chunk_max = tokens.size() / n_ctx; const int n_chunk = params.n_chunks < 0 ? 
n_chunk_max : std::min(params.n_chunks, n_chunk_max); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int n_vocab = jarvis_n_vocab(jarvis_get_model(ctx)); const int n_batch = params.n_batch; int count = 0; @@ -494,9 +494,9 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(ctx); + jarvis_kv_cache_clear(ctx); - llama_batch batch = llama_batch_init(n_batch, 0, 1); + jarvis_batch batch = jarvis_batch_init(n_batch, 0, 1); for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; @@ -507,7 +507,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { // add BOS token for the first batch of each chunk if (add_bos && j == 0) { - tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); + tokens[batch_start] = jarvis_token_bos(jarvis_get_model(ctx)); } common_batch_clear(batch); @@ -515,9 +515,9 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true); } - if (llama_decode(ctx, batch)) { + if (jarvis_decode(ctx, batch)) { LOG_ERR("%s : failed to eval\n", __func__); - llama_batch_free(batch); + jarvis_batch_free(batch); return false; } @@ -525,12 +525,12 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { tokens[batch_start] = token_org; if (params.compute_ppl && num_batches > 1) { - const auto * batch_logits = llama_get_logits(ctx); + const auto * batch_logits = jarvis_get_logits(ctx); logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); } } - llama_batch_free(batch); + jarvis_batch_free(batch); const auto t_end = std::chrono::high_resolution_clock::now(); @@ -547,7 +547,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { if (params.compute_ppl) { const int first = n_ctx/2; - const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); + const auto * all_logits = num_batches > 1 ? 
logits.data() : jarvis_get_logits(ctx); process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); count += n_ctx - first - 1; @@ -583,7 +583,7 @@ int main(int argc, char ** argv) { params.logits_all = true; params.escape = false; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_IMATRIX, print_usage)) { return 1; } @@ -606,8 +606,8 @@ int main(int argc, char ** argv) { g_collector.save_imatrix(); } - llama_backend_init(); - llama_numa_init(params.numa); + jarvis_backend_init(); + jarvis_numa_init(params.numa); // pass the callback to the backend scheduler // it will be executed for each node during the graph computation @@ -616,16 +616,16 @@ int main(int argc, char ** argv) { params.warmup = false; // init - common_init_result llama_init = common_init_from_params(params); + common_init_result jarvis_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + jarvis_model * model = jarvis_init.model; + jarvis_context * ctx = jarvis_init.context; if (model == nullptr || ctx == nullptr) { LOG_ERR("%s : failed to init\n", __func__); return 1; } - const int n_ctx_train = llama_n_ctx_train(model); + const int n_ctx_train = jarvis_n_ctx_train(model); if (params.n_ctx > n_ctx_train) { LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, params.n_ctx); @@ -644,12 +644,12 @@ int main(int argc, char ** argv) { g_collector.save_imatrix(); LOG("\n"); - llama_perf_context_print(ctx); + jarvis_perf_context_print(ctx); - llama_free(ctx); - llama_free_model(model); + jarvis_free(ctx); + jarvis_free_model(model); - llama_backend_free(); + jarvis_backend_free(); return 0; } diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt index 9b1aa3b63c920..f9ad699135e60 100644 --- a/examples/infill/CMakeLists.txt +++ b/examples/infill/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-infill) +set(TARGET jarvis-infill) add_executable(${TARGET} infill.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/infill/README.md b/examples/infill/README.md index 810a0c5e76697..ba2ab2bae41a9 100644 --- a/examples/infill/README.md +++ b/examples/infill/README.md @@ -1,25 +1,25 @@ -# llama.cpp/example/infill +# jarvis.cpp/example/infill -This example shows how to use the infill mode with Code Llama models supporting infill mode. +This example shows how to use the infill mode with Code Jarvis models supporting infill mode. Currently the 7B and 13B models support infill mode. Infill supports most of the options available in the main example. 
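The prompt that the infill example feeds to the model is assembled from special fill-in-the-middle markers plus the tokenized prefix and suffix, with the two blocks swapped when `--spm-infill` is given and a BOS token prepended when the model expects one. The sketch below mirrors that assembly with made-up token IDs; the real program obtains the FIM and BOS token IDs from the model vocabulary and tokenizes `--in-prefix` / `--in-suffix` with the model's tokenizer.

```cpp
// Rough sketch of how a fill-in-the-middle (FIM) prompt is assembled.
// Token IDs here are made-up placeholders; the real example queries the
// FIM prefix/suffix/middle and BOS tokens from the model vocabulary.
#include <cstdio>
#include <vector>

using token = int;

// hypothetical special-token IDs, for illustration only
const token TOK_BOS     = 1;
const token TOK_FIM_PRE = 32000;
const token TOK_FIM_SUF = 32001;
const token TOK_FIM_MID = 32002;

static std::vector<token> build_fim_prompt(const std::vector<token> & prefix,
                                           const std::vector<token> & suffix,
                                           bool spm_order,   // --spm-infill
                                           bool add_bos) {
    std::vector<token> pfx = {TOK_FIM_PRE};
    pfx.insert(pfx.end(), prefix.begin(), prefix.end());

    std::vector<token> sfx = {TOK_FIM_SUF};
    sfx.insert(sfx.end(), suffix.begin(), suffix.end());

    // PSM order: prefix block then suffix block; SPM order swaps the two
    std::vector<token> out   = spm_order ? sfx : pfx;
    std::vector<token> other = spm_order ? pfx : sfx;

    if (add_bos) {
        out.insert(out.begin(), TOK_BOS);
    }
    out.insert(out.end(), other.begin(), other.end());
    // generation continues from the middle marker
    // (in the real code it is appended only when the model defines it)
    out.push_back(TOK_FIM_MID);
    return out;
}

int main() {
    const std::vector<token> prefix = {100, 101, 102};  // tokens of --in-prefix
    const std::vector<token> suffix = {200, 201};       // tokens of --in-suffix

    for (token t : build_fim_prompt(prefix, suffix, /*spm_order=*/false, /*add_bos=*/true)) {
        printf("%d ", t);
    }
    printf("\n");
    return 0;
}
```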
-For further information have a look at the main README.md in llama.cpp/example/main/README.md +For further information have a look at the main README.md in jarvis.cpp/example/main/README.md ## Common Options -In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models: +In this section, we cover the most commonly used options for running the `infill` program with the JARVIS models: -- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). +- `-m FNAME, --model FNAME`: Specify the path to the JARVIS model file (e.g., `models/7B/ggml-model.bin`). - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. -- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. +- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but JARVIS models were built with a context of 2048, which will provide better results for longer input/inference. - `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. ## Input Prompts -The `infill` program provides several ways to interact with the LLaMA models using input prompts: +The `infill` program provides several ways to interact with the JARVIS models using input prompts: - `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option. - `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option. @@ -27,7 +27,7 @@ The `infill` program provides several ways to interact with the LLaMA models usi ## Interaction -The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. The interactive mode can be triggered using `--interactive`, and `--interactive-first` +The `infill` program offers a seamless way to interact with JARVIS models, allowing users to receive real-time infill suggestions. 
The interactive mode can be triggered using `--interactive`, and `--interactive-first` ### Interaction Options @@ -37,11 +37,11 @@ The `infill` program offers a seamless way to interact with LLaMA models, allowi ### Example -Download a model that supports infill, for example CodeLlama: +Download a model that supports infill, for example CodeJarvis: ```console -scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models +scripts/hf.sh --repo TheBloke/CodeJarvis-13B-GGUF --file codejarvis-13b.Q5_K_S.gguf --outdir models ``` ```bash -./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n " +./jarvis-infill -t 10 -ngl 0 -m models/codejarvis-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n " ``` diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index f18362c91c7bf..8c27eef10ef6a 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -3,7 +3,7 @@ #include "console.h" #include "sampling.h" #include "log.h" -#include "llama.h" +#include "jarvis.h" #include #include @@ -33,20 +33,20 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static llama_context ** g_ctx; -static llama_model ** g_model; +static jarvis_context ** g_ctx; +static jarvis_model ** g_model; static common_sampler ** g_smpl; static common_params * g_params; -static std::vector * g_input_tokens; +static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; -static std::vector * g_output_tokens; +static std::vector * g_output_tokens; static bool is_interacting = false; static void write_logfile( - const llama_context * ctx, const common_params & params, const llama_model * model, - const std::vector & input_tokens, const std::string & output, - const std::vector & output_tokens + const jarvis_context * ctx, const common_params & params, const jarvis_model * model, + const std::vector & input_tokens, const std::string & output, + const std::vector & output_tokens ) { if (params.logdir.empty()) { return; @@ -71,7 +71,7 @@ static void write_logfile( fprintf(logfile, "binary: infill\n"); char model_desc[128]; - llama_model_desc(model, model_desc, sizeof(model_desc)); + jarvis_model_desc(model, model_desc, sizeof(model_desc)); yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc); fprintf(logfile, "\n"); @@ -83,7 +83,7 @@ static void write_logfile( yaml_dump_string_multiline(logfile, "output", output.c_str()); yaml_dump_vector_int(logfile, "output_tokens", output_tokens); - llama_perf_dump_yaml(logfile, ctx); + jarvis_perf_dump_yaml(logfile, ctx); fclose(logfile); } @@ -112,7 +112,7 @@ int main(int argc, char ** argv) { common_params params; g_params = ¶ms; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) { + if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_INFILL)) { return 1; } @@ -160,12 +160,12 @@ int main(int argc, char ** argv) { LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - LOG_INF("%s: llama backend init\n", __func__); - llama_backend_init(); - llama_numa_init(params.numa); + LOG_INF("%s: jarvis backend init\n", __func__); + jarvis_backend_init(); + jarvis_numa_init(params.numa); - llama_model * model = nullptr; - llama_context * ctx = nullptr; + jarvis_model * model = 
nullptr; + jarvis_context * ctx = nullptr; common_sampler * smpl = nullptr; g_model = &model; @@ -174,18 +174,18 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); - common_init_result llama_init = common_init_from_params(params); + common_init_result jarvis_init = common_init_from_params(params); - model = llama_init.model; - ctx = llama_init.context; + model = jarvis_init.model; + ctx = jarvis_init.context; if (model == NULL) { LOG_ERR("%s: unable to load model\n", __func__); return 1; } - const int n_ctx_train = llama_n_ctx_train(model); - const int n_ctx = llama_n_ctx(ctx); + const int n_ctx_train = jarvis_n_ctx_train(model); + const int n_ctx = jarvis_n_ctx(ctx); LOG_DBG("n_ctx: %d\n", n_ctx); if (n_ctx > n_ctx_train) { @@ -197,28 +197,28 @@ int main(int argc, char ** argv) { LOG_INF("\n"); LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } - const bool add_bos = llama_add_bos_token(model); - GGML_ASSERT(!llama_add_eos_token(model)); + const bool add_bos = jarvis_add_bos_token(model); + GGML_ASSERT(!jarvis_add_eos_token(model)); - std::vector embd_inp; - std::vector embd_end; - std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); - std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); + std::vector embd_inp; + std::vector embd_end; + std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); + std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); - GGML_ASSERT(llama_token_fim_pre(model) >= 0); - GGML_ASSERT(llama_token_fim_suf(model) >= 0); + GGML_ASSERT(jarvis_token_fim_pre(model) >= 0); + GGML_ASSERT(jarvis_token_fim_suf(model) >= 0); - inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model)); - inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model)); + inp_pfx.insert(inp_pfx.begin(), jarvis_token_fim_pre(model)); + inp_sfx.insert(inp_sfx.begin(), jarvis_token_fim_suf(model)); embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_end = params.spm_infill ? 
inp_pfx : inp_sfx; if (add_bos) { - embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + embd_inp.insert(embd_inp.begin(), jarvis_token_bos(model)); } embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - const llama_token middle_token = llama_token_fim_mid(model); + const jarvis_token middle_token = jarvis_token_fim_mid(model); if (middle_token >= 0) { embd_inp.push_back(middle_token); } @@ -230,7 +230,7 @@ int main(int argc, char ** argv) { // Should not run without any tokens if (embd_inp.empty()) { - embd_inp.push_back(llama_token_bos(model)); + embd_inp.push_back(jarvis_token_bos(model)); LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); } @@ -311,10 +311,10 @@ int main(int argc, char ** argv) { if (params.interactive) { const char *control_message; if (params.multiline_input) { - control_message = " - To return control to LLaMA, end your input with '\\'.\n" + control_message = " - To return control to JARVIS, end your input with '\\'.\n" " - To return control without starting a new line, end your input with '/'.\n"; } else { - control_message = " - Press Return to return control to LLaMA.\n" + control_message = " - Press Return to return control to JARVIS.\n" " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } @@ -340,7 +340,7 @@ int main(int argc, char ** argv) { // the first thing we will do is to output the prompt, so set color accordingly console::set_display(console::prompt); - std::vector embd; + std::vector embd; while (n_remain != 0 || params.interactive) { // predict @@ -375,8 +375,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + jarvis_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + jarvis_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); n_past -= n_discard; @@ -396,7 +396,7 @@ int main(int argc, char ** argv) { LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); - if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) { + if (jarvis_decode(ctx, jarvis_batch_get_one(&embd[i], n_eval))) { LOG_ERR("%s : failed to eval\n", __func__); return 1; } @@ -411,7 +411,7 @@ int main(int argc, char ** argv) { embd.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - const llama_token id = common_sampler_sample(smpl, ctx, -1); + const jarvis_token id = common_sampler_sample(smpl, ctx, -1); common_sampler_accept(smpl, id, true); @@ -465,10 +465,10 @@ int main(int argc, char ** argv) { // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { // deal with eot token in infill mode - if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){ + if ((common_sampler_last(smpl) == jarvis_token_eot(model) || is_interacting) && params.interactive){ if (is_interacting && !params.interactive_first) { // print an eot token - LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str()); + LOG("%s", common_token_to_piece(ctx, jarvis_token_eot(model)).c_str()); } LOG("\n"); console::set_display(console::user_input); @@ -505,16 +505,16 @@ int main(int argc, 
char ** argv) { } // tokenize new prefix and suffix - std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); - std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); + std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); + std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); - inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model)); - inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model)); + inp_pfx.insert(inp_pfx.begin(), jarvis_token_fim_pre(model)); + inp_sfx.insert(inp_sfx.begin(), jarvis_token_fim_suf(model)); embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx; if (add_bos) { - embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + embd_inp.insert(embd_inp.begin(), jarvis_token_bos(model)); } embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); @@ -529,7 +529,7 @@ int main(int argc, char ** argv) { is_interacting = false; } // deal with end of generation tokens in interactive mode - else if (llama_token_is_eog(model, common_sampler_last(smpl))) { + else if (jarvis_token_is_eog(model, common_sampler_last(smpl))) { LOG_DBG("found EOS token\n"); if (params.interactive) { @@ -545,7 +545,7 @@ int main(int argc, char ** argv) { if (params.input_prefix_bos) { LOG_DBG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_token_bos(model)); + embd_inp.push_back(jarvis_token_bos(model)); } std::string buffer; @@ -585,7 +585,7 @@ int main(int argc, char ** argv) { embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); for (size_t i = original_size; i < embd_inp.size(); ++i) { - const llama_token token = embd_inp[i]; + const jarvis_token token = embd_inp[i]; output_tokens.push_back(token); output_ss << common_token_to_piece(ctx, token); } @@ -608,7 +608,7 @@ int main(int argc, char ** argv) { } // end of generation - if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) { + if (!embd.empty() && jarvis_token_is_eog(model, embd.back()) && !params.interactive) { break; } @@ -620,18 +620,18 @@ int main(int argc, char ** argv) { } } if (!params.interactive && n_remain <= 0) { - LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str()); + LOG("%s", common_token_to_piece(ctx, jarvis_token_eot(model)).c_str()); } LOG("\n"); common_perf_print(ctx, smpl); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); - llama_free(ctx); - llama_free_model(model); + jarvis_free(ctx); + jarvis_free_model(model); common_sampler_free(smpl); - llama_backend_free(); + jarvis_backend_free(); return 0; } diff --git a/examples/jarvis-bench/CMakeLists.txt b/examples/jarvis-bench/CMakeLists.txt new file mode 100644 index 0000000000000..e081060a8a1be --- /dev/null +++ b/examples/jarvis-bench/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET jarvis-bench) +add_executable(${TARGET} jarvis-bench.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/llama-bench/README.md b/examples/jarvis-bench/README.md similarity index 56% rename from examples/llama-bench/README.md rename to examples/jarvis-bench/README.md index 6bbe4bb75fbf8..ca550fe7eff86 100644 --- a/examples/llama-bench/README.md +++ b/examples/jarvis-bench/README.md @@ -1,6 +1,6 @@ -# llama.cpp/examples/llama-bench +# jarvis.cpp/examples/jarvis-bench -Performance testing tool for llama.cpp. 
+Performance testing tool for jarvis.cpp. ## Table of contents @@ -20,7 +20,7 @@ Performance testing tool for llama.cpp. ## Syntax ``` -usage: ./llama-bench [options] +usage: ./jarvis-bench [options] options: -h, --help @@ -56,7 +56,7 @@ options: Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. ``` -llama-bench can perform three types of tests: +jarvis-bench can perform three types of tests: - Prompt processing (pp): processing a prompt in batches (`-p`) - Text generation (tg): generating a sequence of tokens (`-n`) @@ -77,108 +77,108 @@ Note: ### Text generation with different models ```sh -$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512 +$ ./jarvis-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512 ``` | model | size | params | backend | ngl | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 | +| jarvis 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 | +| jarvis 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 | +| jarvis 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 | ### Prompt processing with different batch sizes ```sh -$ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024 +$ ./jarvis-bench -n 0 -p 1024 -b 128,256,512,1024 ``` | model | size | params | backend | ngl | n_batch | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 | ### Different numbers of threads ```sh -$ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32 +$ ./jarvis-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32 ``` | model | size | params | backend | threads | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | ---------: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 
| -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | tg 16 | 12.22 ± 0.07 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 || +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | tg 16 | 12.22 ± 0.07 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 || ### Different numbers of layers offloaded to the GPU ```sh -$ ./llama-bench -ngl 10,20,30,31,32,33,34,35 +$ ./jarvis-bench -ngl 10,20,30,31,32,33,34,35 ``` | model | size | params | backend | ngl | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 373.36 ± 2.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | tg 128 | 54.78 ± 0.65 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 
373.36 ± 2.25 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | tg 128 | 54.78 ± 0.65 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 | ## Output formats -By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option. +By default, jarvis-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option. ### Markdown ```sh -$ ./llama-bench -o md +$ ./jarvis-bench -o md ``` | model | size | params | backend | ngl | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 | +| jarvis 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 | ### CSV ```sh -$ ./llama-bench -o csv +$ ./jarvis-bench -o csv ``` ```csv build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961" -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342" +"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","jarvis 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961" +"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","jarvis 7B mostly 
Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342" ``` ### JSON ```sh -$ ./llama-bench -o json +$ ./jarvis-bench -o json ``` ```json @@ -193,7 +193,7 @@ $ ./llama-bench -o json "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", "gpu_info": "NVIDIA GeForce RTX 3090 Ti", "model_filename": "models/7B/ggml-model-q4_0.gguf", - "model_type": "llama 7B mostly Q4_0", + "model_type": "jarvis 7B mostly Q4_0", "model_size": 3825065984, "model_n_params": 6738415616, "n_batch": 512, @@ -223,7 +223,7 @@ $ ./llama-bench -o json "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", "gpu_info": "NVIDIA GeForce RTX 3090 Ti", "model_filename": "models/7B/ggml-model-q4_0.gguf", - "model_type": "llama 7B mostly Q4_0", + "model_type": "jarvis 7B mostly Q4_0", "model_size": 3825065984, "model_n_params": 6738415616, "n_batch": 512, @@ -250,12 +250,12 @@ $ ./llama-bench -o json ### JSONL ```sh -$ ./llama-bench -o jsonl +$ ./jarvis-bench -o jsonl ``` ```json lines -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]} -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]} +{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"jarvis 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]} +{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"jarvis 7B mostly 
Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]} ``` @@ -264,7 +264,7 @@ $ ./llama-bench -o jsonl SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database. ```sh -$ ./llama-bench -o sql +$ ./jarvis-bench -o sql ``` ```sql @@ -297,6 +297,6 @@ CREATE TABLE IF NOT EXISTS test ( stddev_ts REAL ); -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634'); -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692'); +INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'jarvis 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634'); +INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'jarvis 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692'); ``` diff --git a/examples/llama-bench/llama-bench.cpp b/examples/jarvis-bench/jarvis-bench.cpp similarity index 93% rename from examples/llama-bench/llama-bench.cpp rename to examples/jarvis-bench/jarvis-bench.cpp index 4a8ea96764630..c1a3368a09f96 
100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/jarvis-bench/jarvis-bench.cpp @@ -19,7 +19,7 @@ #include #include "ggml.h" -#include "llama.h" +#include "jarvis.h" #include "common.h" #include "ggml-cuda.h" #include "ggml-sycl.h" @@ -207,11 +207,11 @@ static bool output_format_from_str(const std::string & s, output_formats & forma return true; } -static const char * split_mode_str(llama_split_mode mode) { +static const char * split_mode_str(jarvis_split_mode mode) { switch (mode) { - case LLAMA_SPLIT_MODE_NONE: return "none"; - case LLAMA_SPLIT_MODE_LAYER: return "layer"; - case LLAMA_SPLIT_MODE_ROW: return "row"; + case JARVIS_SPLIT_MODE_NONE: return "none"; + case JARVIS_SPLIT_MODE_LAYER: return "layer"; + case JARVIS_SPLIT_MODE_ROW: return "row"; default: GGML_ABORT("invalid split mode"); } } @@ -237,7 +237,7 @@ struct cmd_params { std::vector poll; std::vector n_gpu_layers; std::vector rpc_servers; - std::vector split_mode; + std::vector split_mode; std::vector main_gpu; std::vector no_kv_offload; std::vector flash_attn; @@ -269,11 +269,11 @@ static const cmd_params cmd_params_defaults = { /* poll */ {50}, /* n_gpu_layers */ {99}, /* rpc_servers */ {""}, - /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, + /* split_mode */ {JARVIS_SPLIT_MODE_LAYER}, /* main_gpu */ {0}, /* no_kv_offload */ {false}, /* flash_attn */ {false}, - /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, + /* tensor_split */ {std::vector(jarvis_max_devices(), 0.0f)}, /* use_mmap */ {true}, /* embeddings */ {false}, /* numa */ GGML_NUMA_STRATEGY_DISABLED, @@ -304,7 +304,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str()); printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); - if (llama_supports_rpc()) { + if (jarvis_supports_rpc()) { printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); } printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); @@ -497,7 +497,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); - } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) { + } else if (jarvis_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) { if (++i >= argc) { invalid_param = true; break; @@ -509,15 +509,15 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } auto p = string_split(argv[i], split_delim); - std::vector modes; + std::vector modes; for (const auto & m : p) { - llama_split_mode mode; + jarvis_split_mode mode; if (m == "none") { - mode = LLAMA_SPLIT_MODE_NONE; + mode = JARVIS_SPLIT_MODE_NONE; } else if (m == "layer") { - mode = LLAMA_SPLIT_MODE_LAYER; + mode = JARVIS_SPLIT_MODE_LAYER; } else if (m == "row") { - mode = LLAMA_SPLIT_MODE_ROW; + mode = JARVIS_SPLIT_MODE_ROW; } else { invalid_param = true; break; @@ -583,10 +583,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { const std::regex regex{R"([;/]+)"}; std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1}; std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= llama_max_devices()); + GGML_ASSERT(split_arg.size() <= jarvis_max_devices()); - std::vector 
tensor_split(llama_max_devices()); - for (size_t i = 0; i < llama_max_devices(); ++i) { + std::vector tensor_split(jarvis_max_devices()); + for (size_t i = 0; i < jarvis_max_devices(); ++i) { if (i < split_arg.size()) { tensor_split[i] = std::stof(split_arg[i]); } else { @@ -680,7 +680,7 @@ struct cmd_params_instance { int poll; int n_gpu_layers; std::string rpc_servers; - llama_split_mode split_mode; + jarvis_split_mode split_mode; int main_gpu; bool no_kv_offload; bool flash_attn; @@ -688,8 +688,8 @@ struct cmd_params_instance { bool use_mmap; bool embeddings; - llama_model_params to_llama_mparams() const { - llama_model_params mparams = llama_model_default_params(); + jarvis_model_params to_jarvis_mparams() const { + jarvis_model_params mparams = jarvis_model_default_params(); mparams.n_gpu_layers = n_gpu_layers; if (!rpc_servers.empty()) { @@ -713,8 +713,8 @@ struct cmd_params_instance { tensor_split == other.tensor_split; } - llama_context_params to_llama_cparams() const { - llama_context_params cparams = llama_context_default_params(); + jarvis_context_params to_jarvis_cparams() const { + jarvis_context_params cparams = jarvis_context_default_params(); cparams.n_ctx = n_prompt + n_gen; cparams.n_batch = n_batch; @@ -868,7 +868,7 @@ struct test { ggml_type type_k; ggml_type type_v; int n_gpu_layers; - llama_split_mode split_mode; + jarvis_split_mode split_mode; int main_gpu; bool no_kv_offload; bool flash_attn; @@ -880,13 +880,13 @@ struct test { std::string test_time; std::vector samples_ns; - test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) { + test(const cmd_params_instance & inst, const jarvis_model * lmodel, const jarvis_context * ctx) { model_filename = inst.model; char buf[128]; - llama_model_desc(lmodel, buf, sizeof(buf)); + jarvis_model_desc(lmodel, buf, sizeof(buf)); model_type = buf; - model_size = llama_model_size(lmodel); - model_n_params = llama_model_n_params(lmodel); + model_size = jarvis_model_size(lmodel); + model_n_params = jarvis_model_n_params(lmodel); n_batch = inst.n_batch; n_ubatch = inst.n_ubatch; n_threads = inst.n_threads; @@ -1008,7 +1008,7 @@ struct test { std::vector get_values() const { std::string tensor_split_str; int max_nonzero = 0; - for (size_t i = 0; i < llama_max_devices(); i++) { + for (size_t i = 0; i < jarvis_max_devices(); i++) { if (tensor_split[i] > 0) { max_nonzero = i; } @@ -1050,8 +1050,8 @@ struct test { } }; -const std::string test::build_commit = LLAMA_COMMIT; -const int test::build_number = LLAMA_BUILD_NUMBER; +const std::string test::build_commit = JARVIS_COMMIT; +const int test::build_number = JARVIS_BUILD_NUMBER; const bool test::cuda = !!ggml_cpu_has_cuda(); const bool test::vulkan = !!ggml_cpu_has_vulkan(); const bool test::kompute = !!ggml_cpu_has_kompute(); @@ -1428,45 +1428,45 @@ struct sql_printer : public printer { } }; -static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) { - llama_set_n_threads(ctx, n_threads, n_threads); +static void test_prompt(jarvis_context * ctx, int n_prompt, int n_batch, int n_threads) { + jarvis_set_n_threads(ctx, n_threads, n_threads); - const llama_model * model = llama_get_model(ctx); - const int32_t n_vocab = llama_n_vocab(model); + const jarvis_model * model = jarvis_get_model(ctx); + const int32_t n_vocab = jarvis_n_vocab(model); - std::vector tokens(n_batch); + std::vector tokens(n_batch); int n_processed = 0; while (n_processed < n_prompt) { int n_tokens = std::min(n_prompt - n_processed, n_batch); - tokens[0] = 
n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; + tokens[0] = n_processed == 0 && jarvis_add_bos_token(model) ? jarvis_token_bos(model) : std::rand() % n_vocab; for (int i = 1; i < n_tokens; i++) { tokens[i] = std::rand() % n_vocab; } - llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens)); + jarvis_decode(ctx, jarvis_batch_get_one(tokens.data(), n_tokens)); n_processed += n_tokens; } - llama_synchronize(ctx); + jarvis_synchronize(ctx); } -static void test_gen(llama_context * ctx, int n_gen, int n_threads) { - llama_set_n_threads(ctx, n_threads, n_threads); +static void test_gen(jarvis_context * ctx, int n_gen, int n_threads) { + jarvis_set_n_threads(ctx, n_threads, n_threads); - const llama_model * model = llama_get_model(ctx); - const int32_t n_vocab = llama_n_vocab(model); + const jarvis_model * model = jarvis_get_model(ctx); + const int32_t n_vocab = jarvis_n_vocab(model); - llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; + jarvis_token token = jarvis_add_bos_token(model) ? jarvis_token_bos(model) : std::rand() % n_vocab; for (int i = 0; i < n_gen; i++) { - llama_decode(ctx, llama_batch_get_one(&token, 1)); - llama_synchronize(ctx); + jarvis_decode(ctx, jarvis_batch_get_one(&token, 1)); + jarvis_synchronize(ctx); token = std::rand() % n_vocab; } } -static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) { +static void jarvis_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) { (void) level; (void) text; (void) user_data; @@ -1508,12 +1508,12 @@ int main(int argc, char ** argv) { cmd_params params = parse_cmd_params(argc, argv); - // initialize llama.cpp + // initialize jarvis.cpp if (!params.verbose) { - llama_log_set(llama_null_log_callback, NULL); + jarvis_log_set(jarvis_null_log_callback, NULL); } - llama_backend_init(); - llama_numa_init(params.numa); + jarvis_backend_init(); + jarvis_numa_init(params.numa); set_process_priority(params.prio); @@ -1533,7 +1533,7 @@ int main(int argc, char ** argv) { std::vector params_instances = get_cmd_params_instances(params); - llama_model * lmodel = nullptr; + jarvis_model * lmodel = nullptr; const cmd_params_instance * prev_inst = nullptr; int params_idx = 0; @@ -1541,15 +1541,15 @@ int main(int argc, char ** argv) { for (const auto & inst : params_instances) { params_idx ++; if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count); + fprintf(stderr, "jarvis-bench: benchmark %d/%ld: starting\n", params_idx, params_count); } // keep the same model between tests when possible if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { if (lmodel) { - llama_free_model(lmodel); + jarvis_free_model(lmodel); } - lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams()); + lmodel = jarvis_load_model_from_file(inst.model.c_str(), inst.to_jarvis_mparams()); if (lmodel == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str()); return 1; @@ -1557,16 +1557,16 @@ int main(int argc, char ** argv) { prev_inst = &inst; } - llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams()); + jarvis_context * ctx = jarvis_new_context_with_model(lmodel, inst.to_jarvis_cparams()); if (ctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str()); - llama_free_model(lmodel); + 
jarvis_free_model(lmodel); return 1; } test t(inst, lmodel, ctx); - llama_kv_cache_clear(ctx); + jarvis_kv_cache_clear(ctx); // cool off before the test if (params.delay) { @@ -1588,37 +1588,37 @@ int main(int argc, char ** argv) { exit(1); } - llama_attach_threadpool(ctx, threadpool, NULL); + jarvis_attach_threadpool(ctx, threadpool, NULL); // warmup run if (t.n_prompt > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count); + fprintf(stderr, "jarvis-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count); } //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } if (t.n_gen > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count); + fprintf(stderr, "jarvis-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count); } test_gen(ctx, 1, t.n_threads); } for (int i = 0; i < params.reps; i++) { - llama_kv_cache_clear(ctx); + jarvis_kv_cache_clear(ctx); uint64_t t_start = get_time_ns(); if (t.n_prompt > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps); + fprintf(stderr, "jarvis-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps); } test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } if (t.n_gen > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps); + fprintf(stderr, "jarvis-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps); } test_gen(ctx, t.n_gen, t.n_threads); } @@ -1637,14 +1637,14 @@ int main(int argc, char ** argv) { fflush(p_err->fout); } - llama_perf_context_print(ctx); + jarvis_perf_context_print(ctx); - llama_free(ctx); + jarvis_free(ctx); ggml_threadpool_free(threadpool); } - llama_free_model(lmodel); + jarvis_free_model(lmodel); if (p) { p->print_footer(); @@ -1654,7 +1654,7 @@ int main(int argc, char ** argv) { p_err->print_footer(); } - llama_backend_free(); + jarvis_backend_free(); return 0; } diff --git a/examples/llama.android/.gitignore b/examples/jarvis.android/.gitignore similarity index 100% rename from examples/llama.android/.gitignore rename to examples/jarvis.android/.gitignore diff --git a/examples/llama.android/README.md b/examples/jarvis.android/README.md similarity index 100% rename from examples/llama.android/README.md rename to examples/jarvis.android/README.md diff --git a/examples/llama.android/app/.gitignore b/examples/jarvis.android/app/.gitignore similarity index 100% rename from examples/llama.android/app/.gitignore rename to examples/jarvis.android/app/.gitignore diff --git a/examples/llama.android/app/build.gradle.kts b/examples/jarvis.android/app/build.gradle.kts similarity index 93% rename from examples/llama.android/app/build.gradle.kts rename to examples/jarvis.android/app/build.gradle.kts index 8d1b37195efd4..faf26959b44a1 100644 --- a/examples/llama.android/app/build.gradle.kts +++ b/examples/jarvis.android/app/build.gradle.kts @@ -4,11 +4,11 @@ plugins { } android { - namespace = "com.example.llama" + namespace = "com.example.jarvis" compileSdk = 34 defaultConfig { - applicationId = "com.example.llama" + applicationId = "com.example.jarvis" minSdk = 33 targetSdk = 34 versionCode = 1 @@ -54,7 +54,7 @@ 
dependencies { implementation("androidx.compose.ui:ui-graphics") implementation("androidx.compose.ui:ui-tooling-preview") implementation("androidx.compose.material3:material3") - implementation(project(":llama")) + implementation(project(":jarvis")) testImplementation("junit:junit:4.13.2") androidTestImplementation("androidx.test.ext:junit:1.1.5") androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") diff --git a/examples/llama.android/app/proguard-rules.pro b/examples/jarvis.android/app/proguard-rules.pro similarity index 100% rename from examples/llama.android/app/proguard-rules.pro rename to examples/jarvis.android/app/proguard-rules.pro diff --git a/examples/llama.android/app/src/main/AndroidManifest.xml b/examples/jarvis.android/app/src/main/AndroidManifest.xml similarity index 89% rename from examples/llama.android/app/src/main/AndroidManifest.xml rename to examples/jarvis.android/app/src/main/AndroidManifest.xml index 41a358a299154..fcd605d2484b5 100644 --- a/examples/llama.android/app/src/main/AndroidManifest.xml +++ b/examples/jarvis.android/app/src/main/AndroidManifest.xml @@ -12,13 +12,13 @@ android:label="@string/app_name" android:roundIcon="@mipmap/ic_launcher_round" android:supportsRtl="true" - android:theme="@style/Theme.LlamaAndroid" + android:theme="@style/Theme.JarvisAndroid" > + android:theme="@style/Theme.JarvisAndroid"> diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt b/examples/jarvis.android/app/src/main/java/com/example/jarvis/Downloadable.kt similarity index 99% rename from examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt rename to examples/jarvis.android/app/src/main/java/com/example/jarvis/Downloadable.kt index 78c231ae55d8c..1c8320e7a4f15 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt +++ b/examples/jarvis.android/app/src/main/java/com/example/jarvis/Downloadable.kt @@ -1,4 +1,4 @@ -package com.example.llama +package com.example.jarvis import android.app.DownloadManager import android.net.Uri diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt b/examples/jarvis.android/app/src/main/java/com/example/jarvis/MainActivity.kt similarity index 94% rename from examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt rename to examples/jarvis.android/app/src/main/java/com/example/jarvis/MainActivity.kt index 9da04f7d3c32e..00789cb3bad3e 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt +++ b/examples/jarvis.android/app/src/main/java/com/example/jarvis/MainActivity.kt @@ -1,4 +1,4 @@ -package com.example.llama +package com.example.jarvis import android.app.ActivityManager import android.app.DownloadManager @@ -30,7 +30,7 @@ import androidx.compose.runtime.Composable import androidx.compose.ui.Modifier import androidx.compose.ui.unit.dp import androidx.core.content.getSystemService -import com.example.llama.ui.theme.LlamaAndroidTheme +import com.example.jarvis.ui.theme.JarvisAndroidTheme import java.io.File class MainActivity( @@ -77,9 +77,9 @@ class MainActivity( File(extFilesDir, "phi-2-q4_0.gguf"), ), Downloadable( - "TinyLlama 1.1B (f16, 2.2 GiB)", - Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"), - File(extFilesDir, "tinyllama-1.1-f16.gguf"), + "TinyJarvis 1.1B (f16, 2.2 GiB)", + 
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyjarvis-1.1b/ggml-model-f16.gguf?download=true"), + File(extFilesDir, "tinyjarvis-1.1-f16.gguf"), ), Downloadable( "Phi 2 DPO (Q3_K_M, 1.48 GiB)", @@ -89,7 +89,7 @@ class MainActivity( ) setContent { - LlamaAndroidTheme { + JarvisAndroidTheme { // A surface container using the 'background' color from the theme Surface( modifier = Modifier.fillMaxSize(), diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt b/examples/jarvis.android/app/src/main/java/com/example/jarvis/MainViewModel.kt similarity index 85% rename from examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt rename to examples/jarvis.android/app/src/main/java/com/example/jarvis/MainViewModel.kt index 45ac29938f441..74dba04fa668a 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt +++ b/examples/jarvis.android/app/src/main/java/com/example/jarvis/MainViewModel.kt @@ -1,6 +1,6 @@ -package com.example.llama +package com.example.jarvis -import android.llama.cpp.LLamaAndroid +import android.jarvis.cpp.JarvisAndroid import android.util.Log import androidx.compose.runtime.getValue import androidx.compose.runtime.mutableStateOf @@ -10,7 +10,7 @@ import androidx.lifecycle.viewModelScope import kotlinx.coroutines.flow.catch import kotlinx.coroutines.launch -class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() { +class MainViewModel(private val jarvisAndroid: JarvisAndroid = JarvisAndroid.instance()): ViewModel() { companion object { @JvmStatic private val NanosPerSecond = 1_000_000_000.0 @@ -29,7 +29,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan viewModelScope.launch { try { - llamaAndroid.unload() + jarvisAndroid.unload() } catch (exc: IllegalStateException) { messages += exc.message!! } @@ -45,7 +45,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan messages += "" viewModelScope.launch { - llamaAndroid.send(text) + jarvisAndroid.send(text) .catch { Log.e(tag, "send() failed", it) messages += it.message!! @@ -58,7 +58,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan viewModelScope.launch { try { val start = System.nanoTime() - val warmupResult = llamaAndroid.bench(pp, tg, pl, nr) + val warmupResult = jarvisAndroid.bench(pp, tg, pl, nr) val end = System.nanoTime() messages += warmupResult @@ -71,7 +71,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan return@launch } - messages += llamaAndroid.bench(512, 128, 1, 3) + messages += jarvisAndroid.bench(512, 128, 1, 3) } catch (exc: IllegalStateException) { Log.e(tag, "bench() failed", exc) messages += exc.message!! 
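Note on the `MainViewModel.kt` hunks above: only the package and class names change (`LLamaAndroid` → `JarvisAndroid`); the `load()`, `send()`, `bench()` and `unload()` call sites keep their arguments. The sketch below is not part of the patch — it is an illustrative example, assuming the renamed `android.jarvis.cpp.JarvisAndroid` binding exposes the same suspend/Flow API as before; the function name and model path are placeholders.

```kotlin
// Illustrative sketch only (not part of the diff). Class/package names and call
// signatures mirror the renamed example code above; everything else is a placeholder.
import android.jarvis.cpp.JarvisAndroid
import kotlinx.coroutines.flow.catch
import kotlinx.coroutines.flow.collect

suspend fun smokeTest(modelPath: String) {
    val jarvis = JarvisAndroid.instance()

    jarvis.load(modelPath)                    // same signature as the old LLamaAndroid.load()

    // send() returns a Flow of generated text pieces; collect them as they arrive
    jarvis.send("Hello")
        .catch { e -> println("send() failed: ${e.message}") }
        .collect { piece -> print(piece) }

    // pp, tg, pl, nr — the same arguments the ViewModel passes above
    println(jarvis.bench(512, 128, 1, 3))

    jarvis.unload()
}
```

The only functional difference the rename introduces in these hunks is the identifier change itself; the coroutine and Flow usage is untouched.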
@@ -82,7 +82,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan fun load(pathToModel: String) { viewModelScope.launch { try { - llamaAndroid.load(pathToModel) + jarvisAndroid.load(pathToModel) messages += "Loaded $pathToModel" } catch (exc: IllegalStateException) { Log.e(tag, "load() failed", exc) diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt b/examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Color.kt similarity index 87% rename from examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt rename to examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Color.kt index 40c30e8d97077..84e34456c5b8b 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt +++ b/examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Color.kt @@ -1,4 +1,4 @@ -package com.example.llama.ui.theme +package com.example.jarvis.ui.theme import androidx.compose.ui.graphics.Color diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt b/examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Theme.kt similarity index 97% rename from examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt rename to examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Theme.kt index e742220a8d719..3298e08c63b08 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt +++ b/examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Theme.kt @@ -1,4 +1,4 @@ -package com.example.llama.ui.theme +package com.example.jarvis.ui.theme import android.app.Activity import android.os.Build @@ -38,7 +38,7 @@ private val LightColorScheme = lightColorScheme( ) @Composable -fun LlamaAndroidTheme( +fun JarvisAndroidTheme( darkTheme: Boolean = isSystemInDarkTheme(), // Dynamic color is available on Android 12+ dynamicColor: Boolean = true, diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt b/examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Type.kt similarity index 96% rename from examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt rename to examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Type.kt index 0b87946ca3ab1..bde5dfbb78802 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt +++ b/examples/jarvis.android/app/src/main/java/com/example/jarvis/ui/theme/Type.kt @@ -1,4 +1,4 @@ -package com.example.llama.ui.theme +package com.example.jarvis.ui.theme import androidx.compose.material3.Typography import androidx.compose.ui.text.TextStyle diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml b/examples/jarvis.android/app/src/main/res/drawable/ic_launcher_background.xml similarity index 100% rename from examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml rename to examples/jarvis.android/app/src/main/res/drawable/ic_launcher_background.xml diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml b/examples/jarvis.android/app/src/main/res/drawable/ic_launcher_foreground.xml similarity index 100% rename from examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml rename to examples/jarvis.android/app/src/main/res/drawable/ic_launcher_foreground.xml diff --git 
a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml b/examples/jarvis.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml rename to examples/jarvis.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml diff --git a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml b/examples/jarvis.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml rename to examples/jarvis.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp b/examples/jarvis.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp rename to examples/jarvis.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp b/examples/jarvis.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp rename to examples/jarvis.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp diff --git a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp b/examples/jarvis.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp rename to examples/jarvis.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp diff --git a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp b/examples/jarvis.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp rename to examples/jarvis.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp b/examples/jarvis.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp rename to examples/jarvis.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp b/examples/jarvis.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp rename to examples/jarvis.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp diff --git a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp b/examples/jarvis.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp rename to examples/jarvis.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp diff --git a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp b/examples/jarvis.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp rename to examples/jarvis.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp diff --git 
a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp b/examples/jarvis.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp rename to examples/jarvis.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp diff --git a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp b/examples/jarvis.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp similarity index 100% rename from examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp rename to examples/jarvis.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp diff --git a/examples/llama.android/app/src/main/res/values/colors.xml b/examples/jarvis.android/app/src/main/res/values/colors.xml similarity index 100% rename from examples/llama.android/app/src/main/res/values/colors.xml rename to examples/jarvis.android/app/src/main/res/values/colors.xml diff --git a/examples/jarvis.android/app/src/main/res/values/strings.xml b/examples/jarvis.android/app/src/main/res/values/strings.xml new file mode 100644 index 0000000000000..be0735465a5da --- /dev/null +++ b/examples/jarvis.android/app/src/main/res/values/strings.xml @@ -0,0 +1,3 @@ + + JarvisAndroid + diff --git a/examples/jarvis.android/app/src/main/res/values/themes.xml b/examples/jarvis.android/app/src/main/res/values/themes.xml new file mode 100644 index 0000000000000..6c7456dea61b0 --- /dev/null +++ b/examples/jarvis.android/app/src/main/res/values/themes.xml @@ -0,0 +1,5 @@ + + + +