
Commit 12aded6

Merge branch 'ggerganov:master' into master
2 parents ecad966 + 5cd85b5 commit 12aded6

27 files changed: +1056 -465 lines

README.md

Lines changed: 1 addition & 1 deletion
@@ -448,7 +448,7 @@ To learn more about model quantization, [read this documentation](examples/quant
 
 </details>
 
-[^3]: [https://github.com/containers/ramalama](RamaLama)
+[^3]: [RamaLama](https://github.com/containers/ramalama)
 
 ## [`llama-simple`](examples/simple)
 

common/arg.cpp

Lines changed: 13 additions & 1 deletion
@@ -626,7 +626,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--chunks"}, "N",
         string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -2206,5 +2206,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
 
+    // model-specific
+    add_opt(common_arg(
+        {"--tts-oute-default"},
+        string_format("use default OuteTTS models (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
+            params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
+            params.vocoder.hf_repo = "ggml-org/WavTokenizer";
+            params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS}));
+
     return ctx_arg;
 }

convert_hf_to_gguf.py

Lines changed: 78 additions & 2 deletions
@@ -2378,6 +2378,15 @@ class Phi3MiniModel(Model):
     model_arch = gguf.MODEL_ARCH.PHI3
 
     def set_vocab(self):
+        # Phi-4 model uses GPT2Tokenizer
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                tokenizer_class = tokenizer_config_json['tokenizer_class']
+                if tokenizer_class == 'GPT2Tokenizer':
+                    return self._set_vocab_gpt2()
+
         from sentencepiece import SentencePieceProcessor
 
         tokenizer_path = self.dir_model / 'tokenizer.model'
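The new set_vocab branch above distinguishes Phi-4 checkpoints (which ship a GPT-2 style BPE tokenizer) from older Phi-3 ones (which ship a SentencePiece tokenizer.model) by reading the tokenizer_class field of tokenizer_config.json. Below is a minimal standalone sketch of that detection; model_dir is a hypothetical path standing in for the converter's self.dir_model.

import json
from pathlib import Path

def uses_gpt2_tokenizer(model_dir: Path) -> bool:
    # True when tokenizer_config.json declares a GPT2Tokenizer (the Phi-4 case)
    cfg_path = model_dir / "tokenizer_config.json"
    if not cfg_path.is_file():
        return False  # no config file: fall through to the SentencePiece path
    with open(cfg_path, "r", encoding="utf-8") as f:
        cfg = json.load(f)
    return cfg.get("tokenizer_class") == "GPT2Tokenizer"

# hypothetical usage:
# if uses_gpt2_tokenizer(Path("models/phi-4")):
#     ...  # take the BPE/GPT-2 vocab route instead of loading tokenizer.model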
@@ -2494,7 +2503,11 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_dimension_count(rope_dims)
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
+        sliding_window = self.hparams.get("sliding_window")
+        # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
+        if sliding_window is None:
+            sliding_window = 0
+        self.gguf_writer.add_sliding_window(sliding_window)
 
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         n_embd = self.find_hparam(["hidden_size", "n_embd"])
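The sliding_window change is behavioral as well as cosmetic: the strict find_hparam lookup errors out when the key is absent, whereas Phi-4 configs simply omit it, so the converter now falls back to 0 and later reads that zero as the Phi-4 marker. A short sketch of the fallback, assuming hparams is the parsed config.json dictionary:

# hparams: parsed config.json; Phi-4 configs have no "sliding_window" key at all
sliding_window = hparams.get("sliding_window")
if sliding_window is None:
    sliding_window = 0  # 0 is later interpreted as "no sliding window", i.e. Phi-4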
@@ -2793,7 +2806,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@Model.register("BertModel", "CamembertModel", "RobertaModel")
+@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -2859,13 +2872,73 @@ def phantom(tok):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
 
+        if name.startswith("bert."):
+            name = name[5:]
+
+        if name.endswith(".gamma"):
+            name = name[:-6] + ".weight"
+
+        if name.endswith(".beta"):
+            name = name[:-5] + ".bias"
+
         # we are only using BERT for embeddings so we don't need the pooling layer
         if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
             return [] # we don't need these
 
+        if name.startswith("cls.predictions"):
+            return []
+
+        if name.startswith("cls.seq_relationship"):
+            return []
+
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("RobertaModel")
+class RobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def set_vocab(self):
+        """Support BPE tokenizers for roberta models"""
+        bpe_tok_path = self.dir_model / "tokenizer.json"
+        if bpe_tok_path.exists():
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_add_bos_token(True)
+            self.gguf_writer.add_add_eos_token(True)
+
+            # we need this to validate the size of the token_type embeddings
+            # though currently we are passing all zeros to the token_type embeddings
+            # "Sequence A" or "Sequence B"
+            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+
+        else:
+            return super().set_vocab()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]
+
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:,:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
     model_arch = gguf.MODEL_ARCH.NOMIC_BERT
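The new RobertaModel subclass exists mainly because RoBERTa stores its learned position embeddings with an offset: positions effectively begin at pad_token_id + 1, so the converter trims that many leading rows from embeddings.position_embeddings.weight and shrinks max_position_embeddings by the same amount. A toy illustration of the trim, using the typical RoBERTa values (514 positions, pad_token_id = 1, hidden size 768) purely as example numbers:

import torch

pad_token_id = 1                    # typical RoBERTa config value, used here as an example
position_offset = 1 + pad_token_id  # rows 0..pad_token_id never act as real positions

pos_embd = torch.randn(514, 768)    # fake position-embedding matrix

trimmed = pos_embd[position_offset:, :]   # what modify_tensors() passes on
print(trimmed.shape)                      # torch.Size([512, 768]); max_position_embeddings drops by the same 2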
@@ -3185,6 +3258,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if new_name.endswith("time_mix_w2.weight"):
             data_torch = data_torch.permute(0, 2, 1)
 
+        if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
+            data_torch = data_torch.squeeze()
+
         rescale_every_n_layers = self.hparams["rescale_every"]
         if rescale_every_n_layers > 0:
             if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
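The extra squeeze in the RWKV hunk strips singleton dimensions from the time_mix_decay and lerp tensors so their shapes match what the GGUF graph expects. A toy example of the operation, with an illustrative (not model-accurate) shape:

import torch

w = torch.zeros(1, 1, 2048)   # e.g. a parameter saved with leading singleton dims
print(w.squeeze().shape)      # torch.Size([2048]): data unchanged, size-1 dims removed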

examples/gbnf-validator/gbnf-validator.cpp

Lines changed: 4 additions & 7 deletions
@@ -11,19 +11,15 @@
 static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
     const auto cpts = unicode_cpts_from_utf8(input_str);
 
-    const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
-    llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
+    auto & stacks_cur = llama_grammar_get_stacks(grammar);
 
     size_t pos = 0;
     for (const auto & cpt : cpts) {
-        const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy
-
-        llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);
+        llama_grammar_accept(grammar, cpt);
 
         if (stacks_cur.empty()) {
             error_pos = pos;
             error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
-            stacks_cur = stacks_prev;
             return false;
         }
         ++pos;
@@ -82,7 +78,8 @@ int main(int argc, char** argv) {
 
     llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
     if (grammar == nullptr) {
-        throw std::runtime_error("Failed to initialize llama_grammar");
+        fprintf(stdout, "Failed to initialize llama_grammar\n");
+        return 1;
     }
     // Read the input file
     std::string input_str;

examples/llava/clip.cpp

Lines changed: 43 additions & 43 deletions
@@ -8,25 +8,25 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_SYCL
-#include "ggml-sycl.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#ifdef GGML_USE_CANN
-#include "ggml-cann.h"
-#endif
-
-#ifdef GGML_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
+//#ifdef GGML_USE_CUDA
+//#include "ggml-cuda.h"
+//#endif
+//
+//#ifdef GGML_USE_SYCL
+//#include "ggml-sycl.h"
+//#endif
+//
+//#ifdef GGML_USE_METAL
+//#include "ggml-metal.h"
+//#endif
+//
+//#ifdef GGML_USE_CANN
+//#include "ggml-cann.h"
+//#endif
+//
+//#ifdef GGML_USE_VULKAN
+//#include "ggml-vulkan.h"
+//#endif
 
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
@@ -1222,30 +1222,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }
 
-#ifdef GGML_USE_CUDA
-    new_clip->backend = ggml_backend_cuda_init(0);
-    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_METAL
-    new_clip->backend = ggml_backend_metal_init();
-    LOG_INF("%s: CLIP using Metal backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_CANN
-    new_clip->backend = ggml_backend_cann_init(0);
-    LOG_INF("%s: CLIP using CANN backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_VULKAN
-    new_clip->backend = ggml_backend_vk_init(0);
-    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_SYCL
-    new_clip->backend = ggml_backend_sycl_init(0);
-    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-#endif
+//#ifdef GGML_USE_CUDA
+//    new_clip->backend = ggml_backend_cuda_init(0);
+//    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+//#endif
+//
+//#ifdef GGML_USE_METAL
+//    new_clip->backend = ggml_backend_metal_init();
+//    LOG_INF("%s: CLIP using Metal backend\n", __func__);
+//#endif
+//
+//#ifdef GGML_USE_CANN
+//    new_clip->backend = ggml_backend_cann_init(0);
+//    LOG_INF("%s: CLIP using CANN backend\n", __func__);
+//#endif
+//
+//#ifdef GGML_USE_VULKAN
+//    new_clip->backend = ggml_backend_vk_init(0);
+//    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+//#endif
+//
+//#ifdef GGML_USE_SYCL
+//    new_clip->backend = ggml_backend_sycl_init(0);
+//    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
+//#endif
 
     if (!new_clip->backend) {
         new_clip->backend = ggml_backend_cpu_init();

examples/run/README.md

Lines changed: 6 additions & 4 deletions
@@ -4,7 +4,7 @@ The purpose of this example is to demonstrate a minimal usage of llama.cpp for r
 
 ```bash
 llama-run granite-code
-...
+```
 
 ```bash
 llama-run -h
@@ -19,6 +19,8 @@
       Context size (default: 2048)
   -n, --ngl <value>
       Number of GPU layers (default: 0)
+  -v, --verbose, --log-verbose
+      Set verbosity level to infinity (i.e. log all messages, useful for debugging)
   -h, --help
       Show help message
 
@@ -42,6 +44,6 @@ Examples:
   llama-run https://example.com/some-file1.gguf
   llama-run some-file2.gguf
  llama-run file://some-file3.gguf
-  llama-run --ngl 99 some-file4.gguf
-  llama-run --ngl 99 some-file5.gguf Hello World
-...
+  llama-run --ngl 999 some-file4.gguf
+  llama-run --ngl 999 some-file5.gguf Hello World
+```
