
Commit 64c16c4

Merge branch 'master' into vulkan
2 parents 60bbd4e + ba1cb19 commit 64c16c4


85 files changed, +12155 −813 lines

.devops/tools.sh

Lines changed: 5 additions & 5 deletions
@@ -8,23 +8,23 @@ arg1="$1"
 shift
 
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert_hf_to_gguf.py "$@"
+    exec python3 ./convert_hf_to_gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    ./llama-quantize "$@"
+    exec ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    ./llama-cli "$@"
+    exec ./llama-cli "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do
         if [ -f "${i/f16/q4_0}" ]; then
             echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
         else
             echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+            exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
         fi
     done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./llama-server "$@"
+    exec ./llama-server "$@"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "

.github/workflows/build.yml

Lines changed: 25 additions & 1 deletion
@@ -662,6 +662,8 @@ jobs:
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'msvc-arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'llvm-arm64-opencl-adreno'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
 
     steps:
       - name: Clone
@@ -703,6 +705,28 @@ jobs:
         run: |
           choco install ninja
 
+      - name: Install OpenCL Headers and Libs
+        id: install_opencl
+        if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
+        run: |
+          git clone https://github.com/KhronosGroup/OpenCL-Headers
+          cd OpenCL-Headers
+          mkdir build && cd build
+          cmake .. `
+            -DBUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build . --target install
+          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+          cd OpenCL-ICD-Loader
+          mkdir build-arm64-release && cd build-arm64-release
+          cmake .. `
+            -A arm64 `
+            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build . --target install --config release
+
       - name: Build
         id: cmake_build
         run: |
@@ -732,7 +756,7 @@ jobs:
       - name: Test
        id: cmake_test
        # not all machines have native AVX-512
-        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
        run: |
          cd build
          ctest -L main -C Release --verbose --timeout 900

CODEOWNERS

Lines changed: 3 additions & 1 deletion
@@ -1,3 +1,5 @@
 # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
 
-ci/ @ggerganov
+/ci/ @ggerganov
+/.devops/ @ngxson
+/examples/server/ @ngxson

Makefile

Lines changed: 9 additions & 0 deletions
@@ -22,6 +22,7 @@ BUILD_TARGETS = \
 	llama-infill \
 	llama-llava-cli \
 	llama-minicpmv-cli\
+	llama-qwen2vl-cli\
 	llama-lookahead \
 	llama-lookup \
 	llama-lookup-create \
@@ -1404,6 +1405,14 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
 
+llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
+	examples/llava/llava.cpp \
+	examples/llava/llava.h \
+	examples/llava/clip.cpp \
+	examples/llava/clip.h \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
+
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)

README.md

Lines changed: 15 additions & 0 deletions
@@ -110,6 +110,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
+- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
 
 </details>
 
@@ -433,6 +434,20 @@ To learn more about model quantization, [read this documentation](examples/quant
 
 </details>
 
+## [`llama-run`](examples/run)
+
+#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
+
+- <details>
+    <summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
+
+    ```bash
+    llama-run granite-code
+    ```
+
+  </details>
+
+[^3]: [https://github.com/containers/ramalama](RamaLama)
 
 ## [`llama-simple`](examples/simple)
 

common/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
 # Use curl to download model url
 if (LLAMA_CURL)
     find_package(CURL REQUIRED)
-    add_definitions(-DLLAMA_USE_CURL)
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
     include_directories(${CURL_INCLUDE_DIRS})
     find_library(CURL_LIBRARY curl REQUIRED)
    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
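
Unlike `add_definitions`, which applies to every target declared in the directory, `target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)` attaches the define to the `common` target and propagates it to anything that links against it. A minimal, self-contained sketch of how such a definition is typically consumed on the C++ side (illustrative only, not the actual common.cpp code):

```cpp
// Toy consumer of a compile definition such as LLAMA_USE_CURL.
// Build with -DLLAMA_USE_CURL (or via the CMake target above) to compile the
// first branch; without it the fallback branch is compiled instead.
#include <cstdio>

int main() {
#ifdef LLAMA_USE_CURL
    std::printf("curl support compiled in: remote model URLs can be downloaded\n");
#else
    std::printf("curl support not compiled in: only local model paths are usable\n");
#endif
    return 0;
}
```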

common/arg.cpp

Lines changed: 52 additions & 13 deletions
@@ -145,6 +145,35 @@ static void common_params_handle_model_default(common_params & params) {
     }
 }
 
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
+}
+
+static std::string get_all_kv_cache_types() {
+    std::ostringstream msg;
+    for (const auto & type : kv_cache_types) {
+        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
 //
 // CLI argument parsing functions
 //
@@ -1174,18 +1203,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
-        string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
+        string_format(
+            "KV cache data type for K\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_k)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_k = value;
+            params.cache_type_k = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(common_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
-        string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
+        string_format(
+            "KV cache data type for V\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_v)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_v = value;
+            params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(common_arg(
@@ -2083,35 +2122,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_max = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
     add_opt(common_arg(
         {"--draft-min", "--draft-n-min"}, "N",
         string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
         [](common_params & params, int value) {
             params.speculative.n_min = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
         [](common_params & params, const std::string & value) {
             params.speculative.p_split = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
     add_opt(common_arg(
         {"--draft-p-min"}, "P",
         string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
         [](common_params & params, const std::string & value) {
             params.speculative.p_min = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
         [](common_params & params, int value) {
             params.speculative.n_ctx = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
     add_opt(common_arg(
         {"-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2131,14 +2170,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
     add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
             params.speculative.model = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
 
     return ctx_arg;
 }
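
The first hunk above replaces the hand-written string comparisons (removed from common/common.cpp further down) with a single table of `ggml_type` values that drives both the `-ctk`/`-ctv` validation and the "allowed values" help text. A minimal, self-contained sketch of the same table-driven pattern, with `ggml_type` and `ggml_type_name` replaced by stand-ins so it compiles on its own:

```cpp
// Table-driven name -> enum lookup, mirroring kv_cache_type_from_str and
// get_all_kv_cache_types above. cache_type / cache_type_name are stand-ins
// for the real ggml_type / ggml_type_name.
#include <cstdio>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

enum cache_type { CT_F16, CT_Q8_0, CT_Q4_0 };

static const char * cache_type_name(cache_type t) {
    switch (t) {
        case CT_F16:  return "f16";
        case CT_Q8_0: return "q8_0";
        case CT_Q4_0: return "q4_0";
    }
    return "?";
}

static const std::vector<cache_type> cache_types = { CT_F16, CT_Q8_0, CT_Q4_0 };

// name match -> enum value, unknown name -> exception (as in the hunk above)
static cache_type cache_type_from_str(const std::string & s) {
    for (const auto & type : cache_types) {
        if (cache_type_name(type) == s) {
            return type;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s);
}

// join all names for the help text, separating entries with ", "
static std::string all_cache_types() {
    std::ostringstream msg;
    for (const auto & type : cache_types) {
        msg << cache_type_name(type) << (&type == &cache_types.back() ? "" : ", ");
    }
    return msg.str();
}

int main() {
    std::printf("allowed values: %s\n", all_cache_types().c_str());  // f16, q8_0, q4_0
    std::printf("q8_0 -> %d\n", (int) cache_type_from_str("q8_0"));
    try {
        cache_type_from_str("q3_k");                                  // not in the table
    } catch (const std::exception & e) {
        std::printf("error: %s\n", e.what());
    }
    return 0;
}
```

With this change an unknown value such as `-ctk q3_k` is rejected at argument-parsing time with "Unsupported cache type" instead of being carried along as a raw string.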

common/common.cpp

Lines changed: 2 additions & 40 deletions
@@ -1015,38 +1015,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     return mparams;
 }
 
-static ggml_type kv_cache_type_from_str(const std::string & s) {
-    if (s == "f32") {
-        return GGML_TYPE_F32;
-    }
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "bf16") {
-        return GGML_TYPE_BF16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "iq4_nl") {
-        return GGML_TYPE_IQ4_NL;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
-    }
-
-    throw std::runtime_error("Unsupported cache type: " + s);
-}
-
 struct llama_context_params common_context_params_to_llama(const common_params & params) {
     auto cparams = llama_context_default_params();
 
@@ -1081,8 +1049,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
         cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
     }
 
-    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
-    cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
+    cparams.type_k = params.cache_type_k;
+    cparams.type_v = params.cache_type_v;
 
     return cparams;
 }
@@ -1108,12 +1076,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2
 
-
-static bool starts_with(const std::string & str, const std::string & prefix) {
-    // While we wait for C++20's std::string::starts_with...
-    return str.rfind(prefix, 0) == 0;
-}
-
 static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
     int remaining_attempts = max_attempts;
 
common/common.h

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,9 @@ using llama_tokens = std::vector<llama_token>;
3737

3838
// build info
3939
extern int LLAMA_BUILD_NUMBER;
40-
extern char const * LLAMA_COMMIT;
41-
extern char const * LLAMA_COMPILER;
42-
extern char const * LLAMA_BUILD_TARGET;
40+
extern const char * LLAMA_COMMIT;
41+
extern const char * LLAMA_COMPILER;
42+
extern const char * LLAMA_BUILD_TARGET;
4343

4444
struct common_control_vector_load_info;
4545

@@ -286,8 +286,8 @@ struct common_params {
286286
bool warmup = true; // warmup run
287287
bool check_tensors = false; // validate tensor data
288288

289-
std::string cache_type_k = "f16"; // KV cache data type for the K
290-
std::string cache_type_v = "f16"; // KV cache data type for the V
289+
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
290+
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
291291

292292
// multimodal models (see examples/llava)
293293
std::string mmproj = ""; // path to multimodal projector // NOLINT
@@ -437,6 +437,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
437437
return parts;
438438
}
439439

440+
static bool string_starts_with(const std::string & str,
441+
const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
442+
return str.rfind(prefix, 0) == 0;
443+
}
444+
440445
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
441446
void string_process_escapes(std::string & input);
442447

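`string_starts_with` is the `starts_with` helper that was previously file-local in common/common.cpp (removed above), now exposed in the header. A minimal, self-contained sketch of its behaviour (the URL is just an illustrative value):

```cpp
// string_starts_with as added to common.h: rfind(prefix, 0) can only match at
// position 0, so this is a pre-C++20 stand-in for std::string::starts_with.
#include <cstdio>
#include <string>

static bool string_starts_with(const std::string & str, const std::string & prefix) {
    return str.rfind(prefix, 0) == 0;
}

int main() {
    std::printf("%d\n", string_starts_with("https://example.com/model.gguf", "https://")); // 1
    std::printf("%d\n", string_starts_with("models/local.gguf", "https://"));              // 0
    return 0;
}
```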