diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile index 522ee8147d63a..aa2aa0312010e 100644 --- a/.devops/cpu.Dockerfile +++ b/.devops/cpu.Dockerfile @@ -14,9 +14,9 @@ WORKDIR /app COPY . . RUN if [ "$TARGETARCH" = "amd64" ]; then \ - cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \ + cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \ elif [ "$TARGETARCH" = "arm64" ]; then \ - cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \ + cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \ else \ echo "Unsupported architecture"; \ exit 1; \ diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile index 0bc4e7ee13f66..a196111e61d62 100644 --- a/.devops/cuda.Dockerfile +++ b/.devops/cuda.Dockerfile @@ -21,7 +21,7 @@ COPY . . RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release -j$(nproc) RUN mkdir -p /app/lib && \ diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index af783f5e998eb..e2b381766f196 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ fi && \ echo "Building with dynamic libs" && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ cmake --build build --config Release -j$(nproc) RUN mkdir -p /app/lib && \ diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile index 1e87737abfb71..e8297c6948c5c 100644 --- a/.devops/musa.Dockerfile +++ b/.devops/musa.Dockerfile @@ -35,7 +35,7 @@ COPY . . RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \ fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release -j$(nproc) RUN mkdir -p /app/lib && \ diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index 48e7e6aaa5b77..66687a25ba068 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -40,7 +40,7 @@ WORKDIR /app COPY . . RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ - cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \ + cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release \ && cmake --build build --config Release -j$(nproc) RUN mkdir -p /app/lib \ diff --git a/.github/actions/windows-setup-curl/action.yml b/.github/actions/windows-setup-curl/action.yml new file mode 100644 index 0000000000000..5d76da3d79ac5 --- /dev/null +++ b/.github/actions/windows-setup-curl/action.yml @@ -0,0 +1,25 @@ +name: 'Windows - Setup CURL' +description: 'Composite action, to be reused in other workflow' +inputs: + curl_version: + description: 'CURL version' + required: false + default: '8.6.0_6' +outputs: + curl_path: + description: "Path to the downloaded libcurl" + value: ${{ steps.get_libcurl.outputs.curl_path }} + +runs: + using: "composite" + steps: + - name: libCURL + id: get_libcurl + shell: powershell + env: + CURL_VERSION: ${{ inputs.curl_version }} + run: | + curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip" + mkdir $env:RUNNER_TEMP/libcurl + tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl + echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT diff --git a/.github/workflows/bench.yml.disabled b/.github/workflows/bench.yml.disabled index 0370c8943fa0e..75d2714792891 100644 --- a/.github/workflows/bench.yml.disabled +++ b/.github/workflows/bench.yml.disabled @@ -104,7 +104,6 @@ jobs: cmake -B build \ -DGGML_NATIVE=OFF \ -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CURL=ON \ -DLLAMA_CUBLAS=ON \ -DCUDAToolkit_ROOT=/usr/local/cuda \ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \ diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml index d4176fef9cee2..e8639913ea3a6 100644 --- a/.github/workflows/build-linux-cross.yml +++ b/.github/workflows/build-linux-cross.yml @@ -19,7 +19,8 @@ jobs: sudo apt-get install -y --no-install-recommends \ build-essential \ gcc-14-riscv64-linux-gnu \ - g++-14-riscv64-linux-gnu + g++-14-riscv64-linux-gnu \ + libcurl4-openssl-dev:riscv64 - name: Build run: | @@ -59,7 +60,8 @@ jobs: glslc \ gcc-14-riscv64-linux-gnu \ g++-14-riscv64-linux-gnu \ - libvulkan-dev:riscv64 + libvulkan-dev:riscv64 \ + libcurl4-openssl-dev:riscv64 - name: Build run: | @@ -99,7 +101,8 @@ jobs: build-essential \ glslc \ crossbuild-essential-arm64 \ - libvulkan-dev:arm64 + libvulkan-dev:arm64 \ + libcurl4-openssl-dev:arm64 - name: Build run: | diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index dc2a4c0566840..33f6a4fb483ed 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -54,6 +54,7 @@ jobs: continue-on-error: true run: | brew update + brew install curl - name: Build id: cmake_build @@ -62,7 +63,6 @@ jobs: cmake -B build \ -DCMAKE_BUILD_RPATH="@loader_path" \ -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_CURL=ON \ -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ -DGGML_RPC=ON @@ -92,7 +92,6 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | cp LICENSE ./build/bin/ - cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* - name: Upload artifacts @@ -123,6 +122,7 @@ jobs: continue-on-error: true run: | brew update + brew install curl - name: Build id: cmake_build @@ -133,7 +133,6 @@ jobs: cmake -B build \ -DCMAKE_BUILD_RPATH="@loader_path" \ -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_CURL=ON \ -DGGML_METAL=OFF \ -DGGML_RPC=ON cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) @@ -162,7 +161,6 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | cp LICENSE ./build/bin/ - cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* - name: Upload artifacts @@ -207,7 +205,6 @@ jobs: run: | cmake -B build \ -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_CURL=ON \ -DGGML_RPC=ON cmake --build build --config Release -j $(nproc) @@ -246,7 +243,6 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | cp LICENSE ./build/bin/ - cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/* - name: Upload artifacts @@ -281,7 +277,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential + sudo apt-get install build-essential libcurl4-openssl-dev - name: Build id: cmake_build @@ -322,7 +318,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential + sudo apt-get install build-essential libcurl4-openssl-dev - name: Build id: cmake_build @@ -360,7 +356,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential + sudo apt-get install build-essential libcurl4-openssl-dev - name: Build id: cmake_build @@ -397,7 +393,7 @@ jobs: wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk + sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev - name: Build id: cmake_build @@ -431,7 +427,6 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | cp LICENSE ./build/bin/ - cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/* - name: Upload artifacts @@ -454,7 +449,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev + sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev - name: ccache uses: hendrikmuhs/ccache-action@v1.2.16 @@ -530,7 +525,7 @@ jobs: shell: bash run: | sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp + sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev - name: install oneAPI MKL library shell: bash @@ -578,7 +573,7 @@ jobs: shell: bash run: | sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp + sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev - name: install oneAPI MKL library shell: bash @@ -636,6 +631,7 @@ jobs: cmake -B build -G Xcode \ -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_BUILD_COMMON=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ @@ -671,6 +667,7 @@ jobs: cmake -B build -G Xcode \ -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_BUILD_COMMON=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ @@ -700,6 +697,7 @@ jobs: cmake -B build -G Xcode \ -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_BUILD_COMMON=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ @@ -739,6 +737,7 @@ jobs: cmake -B build -G Xcode \ -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_CURL=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ @@ -899,10 +898,17 @@ jobs: -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" cmake --build build-arm64-release --target install --config release + - name: libCURL + id: get_libcurl + uses: ./.github/actions/windows-setup-curl + - name: Build id: cmake_build + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - cmake -S . -B build ${{ matrix.defines }} + cmake -S . -B build ${{ matrix.defines }} ` + -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} - name: Add libopenblas.dll @@ -962,9 +968,10 @@ jobs: - name: Pack artifacts id: pack_artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt - Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt + Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\* - name: Upload artifacts @@ -990,7 +997,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | apt update - apt install -y cmake build-essential ninja-build libgomp1 git + apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev - name: ccache uses: hendrikmuhs/ccache-action@v1.2.16 @@ -1092,16 +1099,23 @@ jobs: run: | choco install ninja + - name: libCURL + id: get_libcurl + uses: ./.github/actions/windows-setup-curl + - name: Build id: cmake_build shell: cmd + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" cmake -S . -B build -G "Ninja Multi-Config" ^ -DLLAMA_BUILD_SERVER=ON ^ -DGGML_NATIVE=OFF ^ -DGGML_CUDA=ON ^ - -DGGML_RPC=ON + -DGGML_RPC=ON ^ + -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 cmake --build build --config Release -j %NINJA_JOBS% -t ggml cmake --build build --config Release @@ -1122,7 +1136,10 @@ jobs: - name: Pack artifacts id: pack_artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | + cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\* - name: Upload artifacts @@ -1177,6 +1194,8 @@ jobs: run: | scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL + # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args + - name: Build id: cmake_build run: examples/sycl/win-build-sycl.bat @@ -1262,8 +1281,14 @@ jobs: key: ${{ github.job }} evict-old-files: 1d + - name: libCURL + id: get_libcurl + uses: ./.github/actions/windows-setup-curl + - name: Build id: cmake_build + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" @@ -1274,9 +1299,11 @@ jobs: -DCMAKE_BUILD_TYPE=Release ` -DGGML_HIP=ON ` -DGGML_HIP_ROCWMMA_FATTN=ON ` - -DGGML_RPC=ON + -DGGML_RPC=ON ` + -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" cmake --build build -j ${env:NUMBER_OF_PROCESSORS} + # TODO: reuse windows-latest-cmake-hip instead of duplicating this job windows-latest-cmake-hip-release: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} runs-on: windows-latest @@ -1318,8 +1345,14 @@ jobs: run: | & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version + - name: libCURL + id: get_libcurl + uses: ./.github/actions/windows-setup-curl + - name: Build id: cmake_build + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" @@ -1331,7 +1364,8 @@ jobs: -DAMDGPU_TARGETS=${{ matrix.gpu_target }} ` -DGGML_HIP_ROCWMMA_FATTN=ON ` -DGGML_HIP=ON ` - -DGGML_RPC=ON + -DGGML_RPC=ON ` + -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" cmake --build build -j ${env:NUMBER_OF_PROCESSORS} md "build\bin\rocblas\library\" cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" @@ -1353,7 +1387,10 @@ jobs: - name: Pack artifacts id: pack_artifacts + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | + cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll 7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\* - name: Upload artifacts @@ -1378,6 +1415,7 @@ jobs: cmake -B build -G Xcode \ -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_CURL=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 092c928bdf552..6c9b5132276fe 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -129,7 +129,6 @@ jobs: cmake -B build \ -DGGML_NATIVE=OFF \ -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CURL=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ -DGGML_OPENMP=OFF ; @@ -142,7 +141,6 @@ jobs: cmake -B build \ -DGGML_NATIVE=OFF \ -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CURL=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server @@ -154,7 +152,6 @@ jobs: cmake -B build \ -DGGML_NATIVE=OFF \ -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CURL=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ; cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server @@ -195,17 +192,14 @@ jobs: - name: libCURL id: get_libcurl - env: - CURL_VERSION: 8.6.0_6 - run: | - curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip" - mkdir $env:RUNNER_TEMP/libcurl - tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl + uses: ./.github/actions/windows-setup-curl - name: Build id: cmake_build + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" + cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server - name: Python setup @@ -221,8 +215,10 @@ jobs: - name: Copy Libcurl id: prepare_libcurl + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll + cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll - name: Tests id: server_integration_tests diff --git a/CMakeLists.txt b/CMakeLists.txt index 23cfbce5ae566..de51c0a17b2f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,7 +81,7 @@ option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) # 3rd party libs -option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) +option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF) # Required for relocatable CMake package @@ -168,6 +168,11 @@ add_subdirectory(src) # utils, programs, examples and tests # +if (NOT LLAMA_BUILD_COMMON) + message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL") + set(LLAMA_CURL OFF) +endif() + if (LLAMA_BUILD_COMMON) add_subdirectory(common) endif() @@ -242,3 +247,20 @@ configure_file(cmake/llama.pc.in install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) + +# +# copy the license files +# + +# Check if running in GitHub Actions +if(DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true") + message(STATUS "Running inside GitHub Actions - copying license files") + + # Copy all files from licenses/ to build/bin/ + file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*") + foreach(LICENSE_FILE ${LICENSE_FILES}) + get_filename_component(FILENAME ${LICENSE_FILE} NAME) + configure_file(${LICENSE_FILE} "${CMAKE_BINARY_DIR}/bin/${FILENAME}" COPYONLY) + endforeach() +endif() + diff --git a/build-xcframework.sh b/build-xcframework.sh index 2ce3939c43d6c..1b9091d288cc8 100755 --- a/build-xcframework.sh +++ b/build-xcframework.sh @@ -399,6 +399,7 @@ cmake -B build-ios-sim -G Xcode \ -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \ -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ + -DLLAMA_CURL=OFF \ -S . cmake --build build-ios-sim --config Release -- -quiet @@ -411,6 +412,7 @@ cmake -B build-ios-device -G Xcode \ -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \ -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ + -DLLAMA_CURL=OFF \ -S . cmake --build build-ios-device --config Release -- -quiet @@ -421,6 +423,7 @@ cmake -B build-macos -G Xcode \ -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ + -DLLAMA_CURL=OFF \ -S . cmake --build build-macos --config Release -- -quiet @@ -434,6 +437,7 @@ cmake -B build-visionos -G Xcode \ -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \ -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ + -DLLAMA_CURL=OFF \ -S . cmake --build build-visionos --config Release -- -quiet @@ -447,6 +451,7 @@ cmake -B build-visionos-sim -G Xcode \ -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \ -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ + -DLLAMA_CURL=OFF \ -S . cmake --build build-visionos-sim --config Release -- -quiet @@ -462,6 +467,7 @@ cmake -B build-tvos-sim -G Xcode \ -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \ -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ + -DLLAMA_CURL=OFF \ -S . cmake --build build-tvos-sim --config Release -- -quiet @@ -476,6 +482,7 @@ cmake -B build-tvos-device -G Xcode \ -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \ -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ + -DLLAMA_CURL=OFF \ -S . cmake --build build-tvos-device --config Release -- -quiet diff --git a/ci/run.sh b/ci/run.sh index ebd662b38bf88..f463d7a8b2009 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -39,7 +39,7 @@ sd=`dirname $0` cd $sd/../ SRC=`pwd` -CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON" +CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF" if [ ! -z ${GG_BUILD_METAL} ]; then CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON" diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 829eb5b7238b9..43533fc86abe2 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -85,7 +85,10 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info) # Use curl to download model url if (LLAMA_CURL) - find_package(CURL REQUIRED) + find_package(CURL) + if (NOT CURL_FOUND) + message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF") + endif() target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL) include_directories(${CURL_INCLUDE_DIRS}) find_library(CURL_LIBRARY curl REQUIRED) diff --git a/common/arg.cpp b/common/arg.cpp index cdf8970254446..642fefb57548f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -163,6 +163,8 @@ struct common_hf_file_res { # if !defined(PATH_MAX) # define PATH_MAX MAX_PATH # endif +#elif defined(_AIX) +#include #else #include #endif diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index cfe94deaf76ef..9549900206b48 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -714,6 +714,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224": # ref: https://huggingface.co/inclusionAI/Ling-lite res = "bailingmoe" + if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406": + # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct + res = "llama4" if res is None: logger.warning("\n") @@ -1608,6 +1611,7 @@ def prepare_tensors(self): @Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") class LlamaModel(Model): model_arch = gguf.MODEL_ARCH.LLAMA + undo_permute = True def set_vocab(self): try: @@ -1672,10 +1676,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + if self.undo_permute: + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) # process the experts separately if name.find("block_sparse_moe.experts") != -1: @@ -1752,6 +1757,61 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("Llama4ForConditionalGeneration") +class Llama4Model(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA4 + has_vision: bool = False + undo_permute = False + + # TODO @ngxson : avoid duplicate this code everywhere by at least support "text_config" + # same with llama, but we need to merge the text_config into the root level of hparams + def __init__(self, *args, **kwargs): + hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0]) + if "text_config" in hparams: + hparams = {**hparams, **hparams["text_config"]} + kwargs["hparams"] = hparams + super().__init__(*args, **kwargs) + if "vision_config" in hparams: + logger.info("Has vision encoder, but it will be ignored") + self.has_vision = True + # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this + self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"] + self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"] + + def set_vocab(self): + self._set_vocab_gpt2() + self.gguf_writer.add_add_bos_token(True) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"]) + self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + name = name.replace("language_model.", "") + name = name.replace("feed_forward.", "mlp.") # a bit hacky for now + name = name.replace(".router.weight", ".gate.weight") # a bit hacky for now + + # split the gate_up into gate and up + if "gate_up_proj" in name: + name_up = name.replace("gate_up_proj", "up_proj.weight") + name_gate = name.replace("gate_up_proj", "gate_proj.weight") + dim_half = data_torch.shape[-1] // 2 + gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2) + return [ + (self.map_tensor_name(name_gate), gate_proj_weight), + (self.map_tensor_name(name_up), up_proj_weight) + ] + + if name.endswith("down_proj"): + name += ".weight" + data_torch = data_torch.transpose(-1, -2) + + if "multi_modal_projector" in name or "vision_model" in name: + return [] + return super().modify_tensors(data_torch, name, bid) + + @Model.register("Mistral3ForConditionalGeneration") class Mistral3Model(LlamaModel): model_arch = gguf.MODEL_ARCH.LLAMA diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 1b86f4c90acf6..ce6104da4a0e9 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -113,6 +113,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", }, {"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", }, {"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", }, + {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", }, ] diff --git a/examples/llama.android/llama/build.gradle.kts b/examples/llama.android/llama/build.gradle.kts index 28dbc1904888b..5bb6478022039 100644 --- a/examples/llama.android/llama/build.gradle.kts +++ b/examples/llama.android/llama/build.gradle.kts @@ -18,6 +18,7 @@ android { } externalNativeBuild { cmake { + arguments += "-DLLAMA_CURL=OFF" arguments += "-DLLAMA_BUILD_COMMON=ON" arguments += "-DGGML_LLAMAFILE=OFF" arguments += "-DCMAKE_BUILD_TYPE=Release" diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 1145c816c83d4..44428cc959553 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -380,6 +380,7 @@ struct clip_ctx { if (backend_cpu != backend) { ggml_backend_free(backend_cpu); } + clip_image_size_free(load_image_size); } }; @@ -1618,6 +1619,12 @@ struct clip_image_f32 * clip_image_f32_init() { return new clip_image_f32(); } +void clip_image_size_free(struct clip_image_size * load_image_size) { + if (load_image_size == nullptr) { + return; + } + delete load_image_size; +} void clip_image_u8_free(struct clip_image_u8 * img) { delete img; } void clip_image_f32_free(struct clip_image_f32 * img) { delete img; } void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { @@ -2270,6 +2277,9 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { } void clip_free(clip_ctx * ctx) { + if (ctx == nullptr) { + return; + } delete ctx; } @@ -2840,10 +2850,19 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) { bool clip_is_glm(const struct clip_ctx * ctx) { return ctx->has_glm_projector; } + bool clip_is_qwen2vl(const struct clip_ctx * ctx) { return ctx->has_qwen2vl_merger; } +bool clip_is_llava(const struct clip_ctx * ctx) { + return ctx->has_llava_projector; +} + +bool clip_is_gemma3(const struct clip_ctx * ctx) { + return ctx->proj_type == PROJECTOR_TYPE_GEMMA3; +} + // Determine the number of encoder layers to iterate over int get_deepest_feature_layer(const struct clip_ctx * ctx) { // Get the index of the second to last layer; this is the diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 783bdca3e09f2..87aa61574b1eb 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -77,6 +77,7 @@ CLIP_API struct clip_image_size * clip_image_size_init(); CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_f32 * clip_image_f32_init(); +CLIP_API void clip_image_size_free (struct clip_image_size * img_size); CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); @@ -106,6 +107,8 @@ CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx); CLIP_API bool clip_is_glm(const struct clip_ctx * ctx); CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx); +CLIP_API bool clip_is_llava(const struct clip_ctx * ctx); +CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx); CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 8c413f7d66e6d..175f2804b5da0 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -851,7 +851,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__); - LOG("\ntask\tacc_norm\n"); + LOG("\ntask\tacc_norm\t95%% confidence interval\n"); double acc = 0.0f; @@ -985,8 +985,22 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { acc += 1.0; } - // Print the accumulated accuracy mean x 100 - LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0); + double freq = acc / double(i + 1); + + const double za = 1.95996398454; + + // // Wald normal approx + // double conf =za*sqrt(freq*(1-freq)/double(i + 1)); + // LOG("%zu\t%.8lf +/- %.8lf\n", i + 1, freq*100.0, conf*100.0); + + // Wilson score interval, more accurate + double z = za * za / double(i + 1); + double cnf = z * sqrt(double(i + 1) * (4.0 * freq * (1 - freq) + z)) / (za + za); + double a = (freq + z * 0.5 - cnf) / (1.0 + z); + double b = (freq + z * 0.5 + cnf) / (1.0 + z); + + // Print the accumulated accuracy mean x 100 and confidence interval + LOG("%zu\t%3.8lf%%\t[%3.4lf%%, %3.4lf%%]\n", i + 1, freq * 100.0, a * 100.0, b * 100.0); } i0 = i1 - 1; diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt index cd6b0520e08bb..7cff188ca69f0 100644 --- a/examples/run/CMakeLists.txt +++ b/examples/run/CMakeLists.txt @@ -1,5 +1,16 @@ set(TARGET llama-run) add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp) + +# TODO: avoid copying this code block from common/CMakeLists.txt +set(LLAMA_RUN_EXTRA_LIBS "") +if (LLAMA_CURL) + find_package(CURL REQUIRED) + target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL) + include_directories(${CURL_INCLUDE_DIRS}) + find_library(CURL_LIBRARY curl REQUIRED) + set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARY}) +endif () + install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/server/public/index.html.gz b/examples/server/public/index.html.gz index 941815c22efb0..674e227571e2d 100644 Binary files a/examples/server/public/index.html.gz and b/examples/server/public/index.html.gz differ diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 760c3646433ad..1bf1ee876b40f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1705,6 +1705,8 @@ struct server_queue { }; struct server_response { + bool running = true; + // for keeping track of all tasks waiting for the result std::unordered_set waiting_task_ids; @@ -1759,6 +1761,10 @@ struct server_response { while (true) { std::unique_lock lock(mutex_results); condition_results.wait(lock, [&]{ + if (!running) { + SRV_DBG("%s : queue result stop\n", __func__); + std::terminate(); // we cannot return here since the caller is HTTP code + } return !queue_results.empty(); }); @@ -1789,6 +1795,10 @@ struct server_response { } std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout)); + if (!running) { + SRV_DBG("%s : queue result stop\n", __func__); + std::terminate(); // we cannot return here since the caller is HTTP code + } if (cr_res == std::cv_status::timeout) { return nullptr; } @@ -1818,6 +1828,12 @@ struct server_response { } } } + + // terminate the waiting loop + void terminate() { + running = false; + condition_results.notify_all(); + } }; struct server_context { @@ -4491,9 +4507,10 @@ int main(int argc, char ** argv) { svr->new_task_queue = [¶ms] { return new httplib::ThreadPool(params.n_threads_http); }; // clean up function, to be called before exit - auto clean_up = [&svr]() { + auto clean_up = [&svr, &ctx_server]() { SRV_INF("%s: cleaning up before exit...\n", __func__); svr->stop(); + ctx_server.queue_results.terminate(); llama_backend_free(); }; @@ -4534,7 +4551,7 @@ int main(int argc, char ** argv) { if (!ctx_server.load_model(params)) { clean_up(); - // t.join(); // FIXME: see below + t.join(); LOG_ERR("%s: exiting due to model loading error\n", __func__); return 1; } @@ -4582,7 +4599,7 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.start_loop(); clean_up(); - // t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this + t.join(); return 0; } diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index 12816bfa8705b..652dea0382ce1 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -17,7 +17,7 @@ To mitigate it, you can increase values in `n_predict`, `kv_size`. ```shell cd ../../.. -cmake -B build -DLLAMA_CURL=ON +cmake -B build cmake --build build --target llama-server ``` diff --git a/examples/server/tests/unit/test_embedding.py b/examples/server/tests/unit/test_embedding.py index 8b0eb42b0926f..0feb452ccfcd4 100644 --- a/examples/server/tests/unit/test_embedding.py +++ b/examples/server/tests/unit/test_embedding.py @@ -49,6 +49,26 @@ def test_embedding_multiple(): assert len(d['embedding']) > 1 +def test_embedding_multiple_with_fa(): + server = ServerPreset.bert_bge_small_with_fa() + server.pooling = 'last' + server.start() + # one of these should trigger the FA branch (i.e. context size % 256 == 0) + res = server.make_request("POST", "/v1/embeddings", data={ + "input": [ + "a "*253, + "b "*254, + "c "*255, + "d "*256, + ], + }) + assert res.status_code == 200 + assert len(res.body['data']) == 4 + for d in res.body['data']: + assert 'embedding' in d + assert len(d['embedding']) > 1 + + @pytest.mark.parametrize( "input,is_multi_prompt", [ diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index 30aa8660950a1..4dc2062a8e5b9 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -323,6 +323,21 @@ def bert_bge_small() -> ServerProcess: server.server_embeddings = True return server + @staticmethod + def bert_bge_small_with_fa() -> ServerProcess: + server = ServerProcess() + server.model_hf_repo = "ggml-org/models" + server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf" + server.model_alias = "bert-bge-small" + server.n_ctx = 1024 + server.n_batch = 300 + server.n_ubatch = 300 + server.n_slots = 2 + server.fa = True + server.seed = 42 + server.server_embeddings = True + return server + @staticmethod def tinyllama_infill() -> ServerProcess: server = ServerProcess() diff --git a/examples/server/webui/src/components/ChatScreen.tsx b/examples/server/webui/src/components/ChatScreen.tsx index d12b06e125e5a..29ab5ea64f76f 100644 --- a/examples/server/webui/src/components/ChatScreen.tsx +++ b/examples/server/webui/src/components/ChatScreen.tsx @@ -1,4 +1,4 @@ -import { useEffect, useMemo, useRef, useState } from 'react'; +import { useEffect, useMemo, useState } from 'react'; import { CallbackGeneratedChunk, useAppContext } from '../utils/app.context'; import ChatMessage from './ChatMessage'; import { CanvasType, Message, PendingMessage } from '../utils/types'; @@ -6,6 +6,7 @@ import { classNames, cleanCurrentUrl, throttle } from '../utils/misc'; import CanvasPyInterpreter from './CanvasPyInterpreter'; import StorageUtils from '../utils/storage'; import { useVSCodeContext } from '../utils/llama-vscode'; +import { useChatTextarea, ChatTextareaApi } from './useChatTextarea.ts'; /** * A message display is a message node with additional information for rendering. @@ -99,7 +100,8 @@ export default function ChatScreen() { canvasData, replaceMessageAndGenerate, } = useAppContext(); - const textarea = useOptimizedTextarea(prefilledMsg.content()); + + const textarea: ChatTextareaApi = useChatTextarea(prefilledMsg.content()); const { extraContext, clearExtraContext } = useVSCodeContext(textarea); // TODO: improve this when we have "upload file" feature @@ -248,14 +250,16 @@ export default function ChatScreen() { {/* chat input */} -
+
+ {isGenerating(currConvId ?? '') ? (