Commit 0fcce31

Merge branch 'ggml-org:master' into master
2 parents: 4235644 + a53f7f7


56 files changed: +3445, -684 lines

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -1379,7 +1379,7 @@ jobs:
         id: pack_artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
-          zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
 
       - name: Upload artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

CMakeLists.txt

Lines changed: 9 additions & 1 deletion
@@ -29,6 +29,8 @@ else()
     set(LLAMA_STANDALONE OFF)
 endif()
 
+option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)
+
 if (EMSCRIPTEN)
     set(BUILD_SHARED_LIBS_DEFAULT OFF)
@@ -145,7 +147,13 @@ endif()
 # 3rd-party
 #
 
-if (NOT TARGET ggml)
+if (LLAMA_USE_SYSTEM_GGML)
+    message(STATUS "Using system-provided libggml, skipping ggml build")
+    find_package(ggml REQUIRED)
+    add_library(ggml ALIAS ggml::ggml)
+endif()
+
+if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
     add_subdirectory(ggml)
     # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()
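With this option, building against a system-installed ggml can presumably be configured along the lines of `cmake -B build -DLLAMA_USE_SYSTEM_GGML=ON`, provided `find_package(ggml)` can locate the installed ggml CMake package.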

cmake/common.cmake

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,5 @@
+include("ggml/cmake/common.cmake")
+
 function(llama_add_compile_flags)
     if (LLAMA_FATAL_WARNINGS)
         if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")

convert_hf_to_gguf.py

Lines changed: 197 additions & 32 deletions
@@ -908,6 +908,40 @@ def _set_vocab_llama_hf(self):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
+    def _set_vocab_rwkv_world(self):
+        assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
+        vocab_size = self.hparams.get("vocab_size", 65536)
+
+        tokens: list[bytes] = ['<s>'.encode("utf-8")]
+        toktypes: list[int] = [gguf.TokenType.CONTROL]
+
+        with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
+            lines = f.readlines()
+            for line in lines:
+                parts = line.split(' ')
+                assert len(parts) >= 3
+                token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
+                token = token.encode("utf-8") if isinstance(token, str) else token
+                assert isinstance(token, bytes)
+                assert len(token) == token_len
+                token_text: str = repr(token)[2:-1]  # "b'\xff'" -> "\xff"
+                tokens.append(token_text.encode("utf-8"))
+                toktypes.append(gguf.TokenType.NORMAL)
+        remainder = vocab_size - len(tokens)
+        assert remainder >= 0
+        for i in range(len(tokens), vocab_size):
+            tokens.append(f"[PAD{i}]".encode("utf-8"))
+            toktypes.append(gguf.TokenType.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("rwkv")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.chat_template = "rwkv-world"
+        # hack: Add '\n\n' as the EOT token to make it chat normally
+        special_vocab._set_special_token("eot", 261)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
         tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
         logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
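For reference, each line of rwkv_vocab_v20230424.txt holds a token id, a Python-literal token (str or bytes), and the token's byte length; joining `parts[1:-1]` keeps literals that themselves contain spaces intact. A minimal sketch of the per-line parse, using a hypothetical sample line:

    import ast

    line = "33 ' ' 1"  # hypothetical sample: id, Python-literal token, byte length
    parts = line.split(' ')
    token = ast.literal_eval(' '.join(parts[1:-1]))  # -> ' '
    token = token.encode("utf-8") if isinstance(token, str) else token
    assert len(token) == int(parts[-1])  # byte-length check, as in the helper above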
@@ -3412,38 +3446,7 @@ class Rwkv6Model(Model):
     model_arch = gguf.MODEL_ARCH.RWKV6
 
     def set_vocab(self):
-        assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
-        vocab_size = self.hparams.get("vocab_size", 65536)
-
-        tokens: list[bytes] = ['<s>'.encode("utf-8")]
-        toktypes: list[int] = [gguf.TokenType.CONTROL]
-
-        with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
-            lines = f.readlines()
-            for line in lines:
-                parts = line.split(' ')
-                assert len(parts) >= 3
-                token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
-                token = token.encode("utf-8") if isinstance(token, str) else token
-                assert isinstance(token, bytes)
-                assert len(token) == token_len
-                token_text: str = repr(token)[2:-1]  # "b'\xff'" -> "\xff"
-                tokens.append(token_text.encode("utf-8"))
-                toktypes.append(gguf.TokenType.NORMAL)
-        remainder = vocab_size - len(tokens)
-        assert remainder >= 0
-        for i in range(len(tokens), vocab_size):
-            tokens.append(f"[PAD{i}]".encode("utf-8"))
-            toktypes.append(gguf.TokenType.UNUSED)
-
-        self.gguf_writer.add_tokenizer_model("rwkv")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
-        special_vocab.chat_template = "rwkv-world"
-        # hack: Add '\n\n' as the EOT token to make it chat normally
-        special_vocab._set_special_token("eot", 261)
-        special_vocab.add_to_gguf(self.gguf_writer)
+        self._set_vocab_rwkv_world()
 
     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]
@@ -3565,6 +3568,168 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield (new_name, data)
 
 
+@Model.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM")
+class Rwkv7Model(Model):
+    model_arch = gguf.MODEL_ARCH.RWKV7
+
+    def set_vocab(self):
+        self._set_vocab_rwkv_world()
+
+    def calc_lora_rank(self, hidden_size, exponent, multiplier):
+        return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        try:
+            head_size = self.hparams["head_size"]
+            layer_norm_eps = self.hparams["layer_norm_epsilon"]
+        except KeyError:
+            head_size = self.hparams["head_dim"]
+            layer_norm_eps = self.hparams["norm_eps"]
+        hidden_size = self.hparams["hidden_size"]
+        intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else (hidden_size * 4)
+
+        # ICLR: In-Context-Learning-Rate
+        try:
+            lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
+            lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
+            lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
+            lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
+        except KeyError:
+            lora_rank_decay = self.hparams["decay_low_rank_dim"] if self.hparams["decay_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
+            lora_rank_iclr = self.hparams["a_low_rank_dim"] if self.hparams["a_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
+            lora_rank_value_residual_mix = self.hparams["v_low_rank_dim"] if self.hparams["v_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
+            lora_rank_gate = self.hparams["gate_low_rank_dim"] if self.hparams["gate_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
+        self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr)
+        self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix)
+        self.gguf_writer.add_gate_lora_rank(lora_rank_gate)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    lerp_weights: dict[int, dict[str, Tensor]] = {}
+    lora_needs_transpose: bool = True
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # unify tensor names here to make life easier
+        name = name.replace("blocks", "layers").replace("ffn", "feed_forward")
+        name = name.replace("self_attn", "attention").replace("attn", "attention")
+        name = name.replace("time_mixer.", "")
+        # lora layer names in fla-hub's impl
+        if "_lora.lora" in name:
+            self.lora_needs_transpose = False
+            name = name.replace("_lora.lora.0.weight", "1.weight")
+            name = name.replace("_lora.lora.2.weight", "2.weight")
+            name = name.replace("_lora.lora.2.bias", "0.weight")
+
+        name = name.replace("feed_forward_norm", "ln2")
+        name = name.replace("g_norm", "ln_x")
+
+        if "attention.v" in name and "value" not in self.map_tensor_name(name) and bid == 0:
+            # some models have dummy v0/v1/v2 on first layer while others don't
+            # ignore them all since they are not used
+            return
+
+        wkv_has_gate = self.hparams.get("wkv_has_gate", True)
+        lerp_list = ["r", "w", "k", "v", "a", "g"] if wkv_has_gate else ["r", "w", "k", "v", "a"]
+
+        if bid is not None and "attention.x_" in name:
+            if "attention.x_x" in name:
+                # already concatenated
+                new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
+                data = data_torch.reshape(len(lerp_list), 1, 1, -1)
+                yield (new_name, data)
+            else:
+                try:
+                    self.lerp_weights[bid][name] = data_torch
+                except KeyError:
+                    self.lerp_weights[bid] = {name: data_torch}
+                if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in lerp_list):
+                    new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
+                    data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"] for i in lerp_list], dim=0)
+                    yield (new_name, data)
+            return
+        else:
+            data_torch = data_torch.squeeze()
+            new_name = self.map_tensor_name(name)
+
+            if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
+                new_name += ".weight"
+
+            if self.lora_needs_transpose and any(
+                new_name.endswith(t) for t in [
+                    "time_mix_w1.weight", "time_mix_w2.weight",
+                    "time_mix_a1.weight", "time_mix_a2.weight",
+                    "time_mix_v1.weight", "time_mix_v2.weight",
+                    "time_mix_g1.weight", "time_mix_g2.weight",
+                ]
+            ):
+                data_torch = data_torch.transpose(0, 1)
+
+            if 'r_k' in new_name:
+                data_torch = data_torch.flatten()
+
+            if bid == 0 and "time_mix_a" in new_name:
+                # dummy v0/v1/v2 on first layer
+                # easiest way to make llama happy
+                yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch)
+
+            yield (new_name, data_torch)
+
+
+@Model.register("RwkvHybridForCausalLM")
+class ARwkv7Model(Rwkv7Model):
+    model_arch = gguf.MODEL_ARCH.ARWKV7
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        hidden_size = self.hparams["hidden_size"]
+        head_size = self.hparams["head_size"]
+        rms_norm_eps = self.hparams["rms_norm_eps"]
+        intermediate_size = self.hparams["intermediate_size"]
+        wkv_has_gate = self.hparams["wkv_has_gate"]
+        assert self.hparams["wkv_version"] == 7
+
+        # ICLR: In-Context-Learning-Rate
+        lora_rank_decay = 64
+        lora_rank_iclr = 64
+        lora_rank_value_residual_mix = 32
+        lora_rank_gate = 128 if wkv_has_gate else 0
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
+        self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr)
+        self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix)
+        self.gguf_writer.add_gate_lora_rank(lora_rank_gate)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_token_shift_count(1)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+
 @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
     model_arch = gguf.MODEL_ARCH.MAMBA
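The calc_lora_rank fallback above rounds hidden_size ** exponent * multiplier to the nearest multiple of 32, with a floor of 32. A worked sketch for a hypothetical hidden_size of 2048:

    def calc_lora_rank(hidden_size, exponent, multiplier):
        return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32

    print(calc_lora_rank(2048, 0.5, 1.8))  # 2048**0.5 * 1.8 ≈ 81.5 -> 96   (decay / iclr)
    print(calc_lora_rank(2048, 0.5, 1.3))  # ≈ 58.8  -> 64   (value residual mix)
    print(calc_lora_rank(2048, 0.8, 0.6))  # ≈ 267.4 -> 256  (gate)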
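In modify_tensors, the per-layer x_r/x_w/x_k/x_v/x_a (and x_g when gated) mixing vectors are buffered until the layer is complete, then emitted as a single fused time_mix_lerp_fused tensor. A standalone sketch of that stacking, assuming a hypothetical embedding width of 8:

    import torch

    lerp_list = ["r", "w", "k", "v", "a", "g"]  # gated case
    lerp = {i: torch.randn(1, 1, 8) for i in lerp_list}  # hypothetical per-layer vectors
    fused = torch.stack([lerp[i] for i in lerp_list], dim=0)
    print(fused.shape)  # torch.Size([6, 1, 1, 8]), matching reshape(len(lerp_list), 1, 1, -1)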

examples/main/README.md

Lines changed: 36 additions & 5 deletions
@@ -27,12 +27,24 @@ Once downloaded, place your model in the models folder in llama.cpp.
 ##### Input prompt (One-and-done)
 
 ```bash
-./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time"
+./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time"
 ```
 ##### Conversation mode (Allow for continuous interaction with the model)
 
 ```bash
-./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma
+./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma
+```
+
+##### Conversation mode using built-in jinja chat template
+
+```bash
+./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja
+```
+
+##### One-and-done query using jinja with custom system prompt and a starting prompt
+
+```bash
+./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello"
 ```
 
 ##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
@@ -44,12 +56,24 @@ Once downloaded, place your model in the models folder in llama.cpp.
 
 ##### Input prompt (One-and-done)
 ```powershell
-./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time"
+./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time"
 ```
 ##### Conversation mode (Allow for continuous interaction with the model)
 
 ```powershell
-./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma
+./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma
+```
+
+##### Conversation mode using built-in jinja chat template
+
+```powershell
+./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja
+```
+
+##### One-and-done query using jinja with custom system prompt and a starting prompt
+
+```powershell
+./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello"
 ```
 
 #### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
@@ -77,6 +101,8 @@ The `llama-cli` program provides several ways to interact with the LLaMA models
 
 - `--prompt PROMPT`: Provide a prompt directly as a command-line option.
 - `--file FNAME`: Provide a file containing a prompt or multiple prompts.
+- `--system-prompt PROMPT`: Provide a system prompt (will otherwise use the default one in the chat template (if provided)).
+- `--system-prompt-file FNAME`: Provide a file containing a system prompt.
 - `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
 
 ## Interaction
@@ -89,7 +115,10 @@ In interactive mode, users can participate in text generation by injecting their
 
 - `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
 - `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
-- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: false)
+- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default or provided chat template) (default: true if chat template found)
+- `-no-cnv`: Disable conversation mode (default: false)
+- `-st, --single-turn`: Only process a single conversation turn (user input) and then exit.
+- `--jinja`: Enable jinja chat template parser, will use the model's built-in template or a user-provided one (default: false)
 - `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
 
 By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
@@ -125,6 +154,8 @@ When --in-prefix or --in-suffix options are enabled the chat template ( --chat-t
 
 Example usage: `--chat-template gemma`
 
+`--chat-template-file FNAME`: Load a custom jinja chat template from an external file, useful if the model contains outdated or incompatible template, some examples can be found in models/templates. Up-to-date chat templates can be downloaded from Hugging Face using scripts/get_chat_template.py
+
 ## Context Management
 
 During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.

ggml/cmake/common.cmake

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+function(ggml_get_flags CCID CCVER)
+    set(C_FLAGS "")
+    set(CXX_FLAGS "")
+
+    if (CCID MATCHES "Clang")
+        set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
+        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
+
+        if (
+            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
+            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
+        )
+            list(APPEND C_FLAGS -Wdouble-promotion)
+        endif()
+    elseif (CCID STREQUAL "GNU")
+        set(C_FLAGS -Wdouble-promotion)
+        set(CXX_FLAGS -Wno-array-bounds)
+
+        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
+            list(APPEND CXX_FLAGS -Wextra-semi)
+        endif()
+    endif()
+
+    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
+    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
+endfunction()
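A caller would presumably invoke this as `ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})` and then append the exported `${GF_C_FLAGS}` / `${GF_CXX_FLAGS}` to its compile options; the function itself only computes the warning-flag lists and publishes them to the parent scope.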
