
Commit a063017

Merge branch 'master' into layla-build
2 parents 76fe14c + a90484c


49 files changed (+1830, −514 lines)

.devops/full-rocm.Dockerfile

Lines changed: 3 additions & 3 deletions
@@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH=\
+ARG ROCM_DOCKER_ARCH="\
     gfx803 \
     gfx900 \
     gfx906 \
@@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\
     gfx1030 \
     gfx1100 \
     gfx1101 \
-    gfx1102
+    gfx1102"

 COPY requirements.txt requirements.txt
 COPY requirements requirements
@@ -34,7 +34,7 @@ WORKDIR /app
 COPY . .

 # Set nvcc architecture
-ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
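
Usage sketch: the quoted ROCM_DOCKER_ARCH list above is only the default for the fat build and can be narrowed at build time with --build-arg; the image tag and the single gfx1100 target below are illustrative, not taken from this commit.

    # build the full ROCm image for one architecture instead of the fat default (illustrative values)
    docker build -t local/llama.cpp:full-rocm \
        -f .devops/full-rocm.Dockerfile \
        --build-arg ROCM_DOCKER_ARCH="gfx1100" \
        .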

.devops/llama-cli-rocm.Dockerfile

Lines changed: 3 additions & 3 deletions
@@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH=\
+ARG ROCM_DOCKER_ARCH="\
     gfx803 \
     gfx900 \
     gfx906 \
@@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\
     gfx1030 \
     gfx1100 \
     gfx1101 \
-    gfx1102
+    gfx1102"

 COPY requirements.txt requirements.txt
 COPY requirements requirements
@@ -34,7 +34,7 @@ WORKDIR /app
 COPY . .

 # Set nvcc architecture
-ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang

.devops/llama-server-rocm.Dockerfile

Lines changed: 3 additions & 3 deletions
@@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH=\
+ARG ROCM_DOCKER_ARCH="\
     gfx803 \
     gfx900 \
     gfx906 \
@@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\
     gfx1030 \
     gfx1100 \
     gfx1101 \
-    gfx1102
+    gfx1102"

 COPY requirements.txt requirements.txt
 COPY requirements requirements
@@ -34,7 +34,7 @@ WORKDIR /app
 COPY . .

 # Set nvcc architecture
-ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang

.github/workflows/build.yml

Lines changed: 2 additions & 1 deletion
@@ -956,6 +956,7 @@ jobs:
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
           echo "cp oneAPI running time dll files to ./build/bin done"
           7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
@@ -1031,7 +1032,7 @@ jobs:
       run: |
        $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
        $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-        cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
+        cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
        cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
        md "build\bin\rocblas\library\"
        cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"

.github/workflows/python-type-check.yml

Lines changed: 3 additions & 1 deletion
@@ -4,11 +4,13 @@ on:
   push:
     paths:
       - '.github/workflows/python-type-check.yml'
+      - 'pyrightconfig.json'
       - '**.py'
       - '**/requirements*.txt'
   pull_request:
     paths:
       - '.github/workflows/python-type-check.yml'
+      - 'pyrightconfig.json'
       - '**.py'
       - '**/requirements*.txt'

@@ -33,6 +35,6 @@ jobs:
       - name: Type-check with Pyright
         uses: jakebailey/pyright-action@v2
         with:
-          version: 1.1.370
+          version: 1.1.382
           level: warning
           warnings: true
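
To reproduce the check locally, a sketch assuming the pip-packaged pyright and that the repo-root pyrightconfig.json is picked up automatically:

    # install the version pinned in the workflow and run it from the repository root
    pip install pyright==1.1.382
    pyright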

CMakeLists.txt

Lines changed: 8 additions & 3 deletions
@@ -62,6 +62,9 @@ option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
 option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
 option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)

+# utils
+option(LLAMA_BUILD_COMMON "llama: build common utils library" ON)
+
 # extra artifacts
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -191,15 +194,17 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
         DESTINATION lib/pkgconfig)

 #
-# programs, examples and tests
+# utils, programs, examples and tests
 #

-add_subdirectory(common)
+if (LLAMA_BUILD_COMMON)
+    add_subdirectory(common)
+endif()

 if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
     include(CTest)
     add_subdirectory(tests)
-endif ()
+endif()

 if (LLAMA_BUILD_EXAMPLES)
     add_subdirectory(examples)
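
The new LLAMA_BUILD_COMMON switch lets a consumer build just the core library. A minimal configure sketch using the option names above, also disabling examples and tests since they link against common:

    # library-only build: skip the common utils library, examples and tests
    cmake -B build -S . \
        -DLLAMA_BUILD_COMMON=OFF \
        -DLLAMA_BUILD_EXAMPLES=OFF \
        -DLLAMA_BUILD_TESTS=OFF \
        -DCMAKE_BUILD_TYPE=Release
    cmake --build build -j $(nproc)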

CONTRIBUTING.md

Lines changed: 5 additions & 0 deletions
@@ -27,3 +27,8 @@

 ![matmul](media/matmul.png)

+# Resources
+
+The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:
+
+https://github.com/ggerganov/llama.cpp/projects

README.md

Lines changed: 4 additions & 2 deletions
@@ -17,7 +17,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ## Hot topics

-- Huggingface GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
+- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
+- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)

 ----

@@ -173,6 +174,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 **Tools:**

 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
+- [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
 - [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
@@ -441,7 +443,7 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
 - Contributors can open PRs
 - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
-- Any help with managing issues and PRs is very appreciated!
+- Any help with managing issues, PRs and projects is very appreciated!
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
 - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)

ci/run.sh

Lines changed: 76 additions & 9 deletions
@@ -712,6 +712,81 @@ function gg_run_embd_bge_small {
     set +e
 }

+function gg_sum_embd_bge_small {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'BGE Small (BERT):\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+}
+
+# rerank_tiny
+
+function gg_run_rerank_tiny {
+    cd ${SRC}
+
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
+
+    gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json
+
+    path_models="../models-mnt/rerank-tiny"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s><s>hi\nwhat is panda?</s><s>it's a bear\nwhat is panda?</s><s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+
+    # sample output
+    # rerank score 0: 0.029
+    # rerank score 1: 0.029
+    # rerank score 2: 0.135
+
+    # check that the score is in the range [$3, $4]
+    function check_score {
+        qnt="$1"
+        score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
+            printf ' - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
+            return 20
+        fi
+
+        printf ' - %s @ %s OK\n' "$qnt" "$score"
+        return 0
+    }
+
+    check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
+    check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
+    check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.15" | tee -a $OUT/${ci}-rk-f16.log
+
+    set +e
+}
+
+function gg_sum_rerank_tiny {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Rerank Tiny (Jina):\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)"
+}
+
 function gg_check_build_requirements {
     if ! command -v cmake &> /dev/null; then
         gg_printf 'cmake not found, please install'
@@ -726,15 +801,6 @@ function gg_check_build_requirements {
     fi
 }

-function gg_sum_embd_bge_small {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'BGE Small (BERT):\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-}
-
 ## main

 export LLAMA_LOG_PREFIX=1
@@ -762,6 +828,7 @@ test $ret -eq 0 && gg_run ctest_release

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
     test $ret -eq 0 && gg_run embd_bge_small
+    test $ret -eq 0 && gg_run rerank_tiny

     if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
         test $ret -eq 0 && gg_run test_scripts_debug
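
To exercise the new rerank_tiny step outside the hosted runner, a sketch of a local run: the script is assumed to take an output directory and a models mount directory (arbitrary names below), and GG_BUILD_LOW_PERF must stay unset for this block to execute.

    # run the CI script locally (sketch); the two directories are arbitrary scratch paths
    mkdir -p tmp
    bash ./ci/run.sh ./tmp/results ./tmp/mnt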

common/arg.cpp

Lines changed: 15 additions & 3 deletions
@@ -284,6 +284,10 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
         params.kv_overrides.back().key[0] = 0;
     }

+    if (params.reranking && params.embedding) {
+        throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
+    }
+
     return true;
 }

@@ -391,7 +395,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.verbose_prompt = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ));
     add_opt(llama_arg(
         {"--no-display-prompt"},
         format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
@@ -1093,13 +1097,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         }
     ).set_sparam());
     add_opt(llama_arg(
-        {"--pooling"}, "{none,mean,cls,last}",
+        {"--pooling"}, "{none,mean,cls,last,rank}",
         "pooling type for embeddings, use model default if unspecified",
         [](gpt_params & params, const std::string & value) {
             /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
             else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
-            else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+            else if (value == "cls")  { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
             else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
+            else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
@@ -1749,6 +1754,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.embedding = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
+    add_opt(llama_arg(
+        {"--reranking", "--rerank"},
+        format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
+        [](gpt_params & params) {
+            params.reranking = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
     add_opt(llama_arg(
         {"--api-key"}, "KEY",
         "API key to use for authentication (default: none)",
