
Commit d7de64b

Merge branch 'master' into add_sched_dot_dump
2 parents 6ee7599 + 152610e


78 files changed: +5271 / -904 lines

.devops/nix/package.nix
Lines changed: 2 additions & 1 deletion

@@ -31,6 +31,7 @@
   # Increases the runtime closure size by ~700M
   useMpi ? false,
   useRocm ? config.rocmSupport,
+  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
   enableCurl ? true,
   useVulkan ? false,
   llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@@ -188,7 +189,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
     ]
     ++ optionals useRocm [
       (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
-      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
+      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
     ]
     ++ optionals useMetalKit [
       (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")

.github/workflows/build.yml
Lines changed: 7 additions & 1 deletion

@@ -317,7 +317,7 @@ jobs:
          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
          sudo apt-get update -y
-         sudo apt-get install -y build-essential vulkan-sdk
+         sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk
 
       - name: Build
         id: cmake_build
@@ -327,6 +327,12 @@ jobs:
           cmake -DGGML_VULKAN=ON ..
           cmake --build . --config Release -j $(nproc)
 
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
   ubuntu-22-cmake-hip:
     runs-on: ubuntu-22.04
     container: rocm/dev-ubuntu-22.04:6.0.2

Makefile
Lines changed: 9 additions & 0 deletions

@@ -22,6 +22,7 @@ BUILD_TARGETS = \
 	llama-infill \
 	llama-llava-cli \
 	llama-minicpmv-cli\
+	llama-qwen2vl-cli\
 	llama-lookahead \
 	llama-lookup \
 	llama-lookup-create \
@@ -1404,6 +1405,14 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
 
+llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
+	examples/llava/llava.cpp \
+	examples/llava/llava.h \
+	examples/llava/clip.cpp \
+	examples/llava/clip.h \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
+
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)

README.md
Lines changed: 4 additions & 2 deletions

@@ -98,6 +98,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
 - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
 - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
+- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
 
 #### Multimodal
 
@@ -110,6 +111,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
+- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
 
 </details>
 
@@ -219,7 +221,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
 | [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
-| [hipBLAS](docs/build.md#hipblas) | AMD GPU |
+| [HIP](docs/build.md#hip) | AMD GPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 
@@ -412,7 +414,7 @@ To learn more about model quantization, [read this documentation](examples/quant
 [^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
 [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
 
-## [`llama-bench`](example/bench)
+## [`llama-bench`](examples/llama-bench)
 
 #### Benchmark the performance of the inference for various parameters.

common/arg.cpp
Lines changed: 6 additions & 7 deletions

@@ -855,13 +855,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.ignore_eos = true;
         }
     ).set_sparam());
-    add_opt(common_arg(
-        {"--penalize-nl"},
-        string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"),
-        [](common_params & params) {
-            params.sampling.penalize_nl = true;
-        }
-    ).set_sparam());
     add_opt(common_arg(
         {"--temp"}, "N",
         string_format("temperature (default: %.1f)", (double)params.sampling.temp),
@@ -916,6 +909,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--repeat-last-n"}, "N",
         string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
         [](common_params & params, int value) {
+            if (value < -1) {
+                throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
+            }
             params.sampling.penalty_last_n = value;
             params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
         }
@@ -970,6 +966,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--dry-penalty-last-n"}, "N",
         string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
         [](common_params & params, int value) {
+            if (value < -1) {
+                throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
+            }
             params.sampling.dry_penalty_last_n = value;
         }
     ).set_sparam());
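
Both new checks enforce the same contract: -1 is a sentinel for "use the context size", 0 disables the penalty, and anything below -1 now fails at argument-parsing time instead of propagating. A minimal standalone sketch of that contract follows; the function name is illustrative, not part of the project's API:

#include <stdexcept>
#include <string>

// Illustrative restatement of the new range check: accept value >= -1,
// where -1 later resolves to the context size and 0 disables the penalty.
static int validate_last_n(const std::string & flag, int value) {
    if (value < -1) {
        throw std::runtime_error("error: invalid " + flag + " = " + std::to_string(value));
    }
    return value;
}

int main() {
    return validate_last_n("repeat-last-n", 64); // -2 would throw; -1, 0, 64 pass
}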

common/common.cpp
Lines changed: 22 additions & 1 deletion

@@ -940,6 +940,25 @@ struct common_init_result common_init_from_params(common_params & params) {
         params.sampling.ignore_eos = false;
     }
 
+    if (params.sampling.ignore_eos) {
+        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
+            if (llama_token_is_eog(model, i)) {
+                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+                params.sampling.logit_bias.push_back({i, -INFINITY});
+            }
+        }
+    }
+
+    if (params.sampling.penalty_last_n == -1) {
+        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.penalty_last_n = llama_n_ctx(lctx);
+    }
+
+    if (params.sampling.dry_penalty_last_n == -1) {
+        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
+    }
+
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
@@ -1761,7 +1780,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
             break;
         case 0: // max absolute
             for (int i = 0; i < n; i++) {
-                if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
+                if (sum < std::abs(inp[i])) {
+                    sum = std::abs(inp[i]);
+                }
             }
             sum /= 32760.0; // make an int16 range
             break;
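
The ignore_eos block above now implements end-of-generation suppression purely through logit biases: each EOG token receives a bias of -INFINITY. A simplified illustration of why that is sufficient (just the arithmetic, not the library's sampler):

#include <cmath>
#include <cstdio>

int main() {
    // Adding a -infinity bias sends the logit to -infinity, and
    // exp(-inf) == 0, so the token's softmax probability is exactly
    // zero and it can never be sampled.
    float logit = 3.2f;
    logit += -INFINITY;
    std::printf("exp(logit) = %f\n", std::exp(logit)); // prints 0.000000
    return 0;
}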

common/common.h
Lines changed: 11 additions & 7 deletions

@@ -95,6 +95,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
     COMMON_SAMPLER_TYPE_XTC         = 8,
     COMMON_SAMPLER_TYPE_INFILL      = 9,
+    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -130,7 +131,6 @@ struct common_params_sampling {
     int32_t mirostat     = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float   mirostat_tau = 5.00f; // target entropy
     float   mirostat_eta = 0.10f; // learning rate
-    bool    penalize_nl  = false; // consider newlines as a repeatable token
     bool    ignore_eos   = false;
     bool    no_perf      = false; // disable performance metrics
     bool    timing_per_token = false;
@@ -139,6 +139,7 @@ struct common_params_sampling {
 
 
     std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
@@ -193,11 +194,13 @@ struct common_params {
     float defrag_thold = 0.1f; // KV cache defragmentation threshold
 
     // offload params
-    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
-    int32_t n_gpu_layers = -1;               // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu     = 0;                // the GPU that is used for scratch and small tensors
-    float   tensor_split[128] = {0};         // how split tensors should be distributed across GPUs
-    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
+    int32_t n_gpu_layers = -1;       // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu     = 0;        // the GPU that is used for scratch and small tensors
+    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
@@ -593,7 +596,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
 // Embedding utils
 //
 
-void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+// TODO: repace embd_norm with an enum
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
 
 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);

common/sampling.cpp
Lines changed: 11 additions & 16 deletions

@@ -161,32 +161,20 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             params.logit_bias.size(),
             params.logit_bias.data()));
 
-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_penalties(
-                llama_n_vocab  (model),
-                llama_token_eos(model),
-                llama_token_nl (model),
-                params.penalty_last_n,
-                params.penalty_repeat,
-                params.penalty_freq,
-                params.penalty_present,
-                params.penalize_nl,
-                params.ignore_eos));
-
     if (params.mirostat == 0) {
         for (const auto & cnstr : params.samplers) {
             switch (cnstr) {
-                    case COMMON_SAMPLER_TYPE_DRY:
+                case COMMON_SAMPLER_TYPE_DRY:
                     {
-                        std::vector<const char*> c_breakers;
+                        std::vector<const char *> c_breakers;
                         c_breakers.reserve(params.dry_sequence_breakers.size());
-                        for (const auto& str : params.dry_sequence_breakers) {
+                        for (const auto & str : params.dry_sequence_breakers) {
                             c_breakers.push_back(str.c_str());
                         }
 
                         llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                     }
-                        break;
+                    break;
                 case COMMON_SAMPLER_TYPE_TOP_K:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
                     break;
@@ -208,6 +196,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                 case COMMON_SAMPLER_TYPE_INFILL:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
                     break;
+                case COMMON_SAMPLER_TYPE_PENALTIES:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                    break;
                 default:
                     GGML_ASSERT(false && "unknown sampler type");
             }
@@ -415,6 +406,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
         case COMMON_SAMPLER_TYPE_XTC:         return 'x';
         case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
+        case COMMON_SAMPLER_TYPE_PENALTIES:   return 'e';
         default : return '?';
     }
 }
@@ -429,6 +421,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
         case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
         case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
+        case COMMON_SAMPLER_TYPE_PENALTIES:   return "penalties";
         default : return "";
     }
 }
@@ -443,6 +436,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
         { "xtc",         COMMON_SAMPLER_TYPE_XTC },
         { "infill",      COMMON_SAMPLER_TYPE_INFILL },
+        { "penalties",   COMMON_SAMPLER_TYPE_PENALTIES },
     };
 
     // since samplers names are written multiple ways
@@ -489,6 +483,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES),   COMMON_SAMPLER_TYPE_PENALTIES },
     };
 
     std::vector<common_sampler_type> samplers;
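
Net effect in this file: the penalties sampler is no longer an unconditional step added ahead of the chain; it is an ordinary chain member selected (and reorderable) via params.samplers, using the simplified four-argument llama_sampler_init_penalties. A minimal sketch of building such a chain directly against the llama.cpp C API; only the llama_sampler_init_penalties signature is taken from this diff, and the parameter values are illustrative placeholders:

#include "llama.h"

// Minimal sketch: penalties as a regular chain member, followed by top-k.
llama_sampler * make_chain() {
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_penalties(
        /*penalty_last_n=*/64,   // -1 would resolve to the context size (see common.cpp above)
        /*penalty_repeat=*/1.1f,
        /*penalty_freq=*/0.0f,
        /*penalty_present=*/0.0f));
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    return chain; // caller frees with llama_sampler_free(chain)
}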
