Commit 1910793

Merge branch 'main' into v0.2-wip
2 parents: ac47d55 + c7901f1

File tree

11 files changed (+252, -145 lines)


README.md

Lines changed: 10 additions & 3 deletions

@@ -21,7 +21,7 @@ Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest
 > Starting with version 0.1.79 the model format has changed from `ggmlv3` to `gguf`. Old model files can be converted using the `convert-llama-ggmlv3-to-gguf.py` script in [`llama.cpp`](https://github.com/ggerganov/llama.cpp)
 
 
-## Installation from PyPI (recommended)
+## Installation from PyPI
 
 Install from PyPI (requires a c compiler):
 
@@ -45,7 +45,7 @@ bash Miniforge3-MacOSX-arm64.sh
 ```
 Otherwise, while installing it will build the llama.ccp x86 version which will be 10x slower on Apple Silicon (M1) Mac.
 
-### Installation with OpenBLAS / cuBLAS / CLBlast / Metal
+### Installation with Hardware Acceleration
 
 `llama.cpp` supports multiple BLAS backends for faster processing.
 Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend.
@@ -74,6 +74,12 @@ To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable befor
 CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python
 ```
 
+To install with hipBLAS / ROCm support for AMD cards, set the `LLAMA_HIPBLAS=on` environment variable before installing:
+
+```bash
+CMAKE_ARGS="-DLLAMA_HIPBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+```
+
 #### Windows remarks
 
 To set the variables `CMAKE_ARGS` and `FORCE_CMAKE` in PowerShell, follow the next steps (Example using, OpenBLAS):
@@ -181,7 +187,8 @@ Below is a short example demonstrating how to use the low-level API to tokenize
 >>> import ctypes
 >>> params = llama_cpp.llama_context_default_params()
 # use bytes for char * params
->>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params)
+>>> model = llama_cpp.llama_load_model_from_file(b"./models/7b/ggml-model.bin", params)
+>>> ctx = llama_cpp.llama_new_context_with_model(model, params)
 >>> max_tokens = params.n_ctx
 # use ctypes arrays for array params
 >>> tokens = (llama_cpp.llama_token * int(max_tokens))()
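The README snippet now initializes in two steps: load the model, then create a context from it. A minimal sketch of that lifecycle, assuming the model path is adjusted for your setup and that `llama_free_model` is exposed by the bindings alongside the `llama_free` call used elsewhere in this commit:

```python
import llama_cpp

# Two-step low-level initialization shown in the README diff above.
# The model path is illustrative; point it at a real ggml/gguf file.
params = llama_cpp.llama_context_default_params()
model = llama_cpp.llama_load_model_from_file(b"./models/7b/ggml-model.bin", params)
ctx = llama_cpp.llama_new_context_with_model(model, params)

# ... tokenize, evaluate, and sample with ctx here ...

llama_cpp.llama_free(ctx)          # free the context before the model
llama_cpp.llama_free_model(model)  # assumed binding mirroring llama.cpp's llama_free_model
```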

examples/low_level_api/low_level_api_chat_cpp.py

Lines changed: 22 additions & 8 deletions

@@ -24,6 +24,10 @@ class LLaMAInteract:
     def __init__(self, params: GptParams) -> None:
         # input args
         self.params = params
+        if self.params.path_session is None:
+            self.params.path_session = ""
+        if self.params.antiprompt is None:
+            self.params.antiprompt = ""
 
         if (self.params.perplexity):
             raise NotImplementedError("""************
@@ -66,7 +70,9 @@ def __init__(self, params: GptParams) -> None:
         self.lparams.use_mlock = self.params.use_mlock
         self.lparams.use_mmap = self.params.use_mmap
 
-        self.ctx = llama_cpp.llama_init_from_file(self.params.model.encode("utf8"), self.lparams)
+        self.model = llama_cpp.llama_load_model_from_file(
+            self.params.model.encode("utf8"), self.lparams)
+        self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.lparams)
         if (not self.ctx):
             raise RuntimeError(f"error: failed to load model '{self.params.model}'")
 
@@ -181,12 +187,12 @@ def __init__(self, params: GptParams) -> None:
 number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr)
 
             for i in range(len(self.embd_inp)):
-                print(f"{self.embd_inp[i]} -> '{llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i])}'", file=sys.stderr)
+                print(f"{self.embd_inp[i]} -> '{self.token_to_str(self.embd_inp[i])}'", file=sys.stderr)
 
             if (self.params.n_keep > 0):
                 print("static prompt based on n_keep: '")
                 for i in range(self.params.n_keep):
-                    print(llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i]), file=sys.stderr)
+                    print(self.token_to_str(self.embd_inp[i]), file=sys.stderr)
                 print("'", file=sys.stderr)
             print(file=sys.stderr)
 
@@ -339,7 +345,7 @@ def generate(self):
            candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False))
 
            # Apply penalties
-           nl_logit = logits[llama_cpp.llama_token_nl()]
+           nl_logit = logits[llama_cpp.llama_token_nl(self.ctx)]
            last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)
 
            _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:])
@@ -380,7 +386,7 @@ def generate(self):
            self.last_n_tokens.append(id)
 
            # replace end of text token with newline token when in interactive mode
-           if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct):
+           if (id == llama_cpp.llama_token_eos(self.ctx) and self.params.interactive and not self.params.instruct):
                id = self.llama_token_newline[0]
            self.embd.append(id)
            if (self.use_antiprompt()):
@@ -437,7 +443,7 @@ def generate(self):
                    break
 
            # end of text token
-           if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos():
+           if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(self.ctx):
                if (not self.params.instruct):
                    for i in self.llama_token_eot:
                        yield i
@@ -464,10 +470,18 @@ def exit(self):
         llama_cpp.llama_free(self.ctx)
         self.set_color(util.CONSOLE_COLOR_DEFAULT)
 
+    def token_to_str(self, token_id: int) -> bytes:
+        size = 32
+        buffer = (ctypes.c_char * size)()
+        n = llama_cpp.llama_token_to_piece_with_model(
+            self.model, llama_cpp.llama_token(token_id), buffer, size)
+        assert n <= size
+        return bytes(buffer[:n])
+
     # return past text
     def past(self):
         for id in self.last_n_tokens[-self.n_past:]:
-            yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf8", errors="ignore")
+            yield self.token_to_str(id).decode("utf8", errors="ignore")
 
     # write input
     def input(self, prompt: str):
@@ -481,7 +495,7 @@ def input(self, prompt: str):
     def output(self):
         self.remaining_tokens = self.params.n_predict
         for id in self.generate():
-            cur_char = llama_cpp.llama_token_to_str(self.ctx, id)
+            cur_char = self.token_to_str(id)
 
             # Add remainder of missing bytes
             if None in self.multibyte_fix:
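The new `token_to_str()` helper replaces the removed `llama_token_to_str(ctx, id)` calls by going through `llama_token_to_piece_with_model`. A standalone sketch of the same pattern outside the class, assuming a `model` handle obtained from `llama_load_model_from_file`:

```python
import ctypes
import llama_cpp

def token_to_piece(model, token_id: int) -> bytes:
    # Ask the model for the raw byte piece of a single token id,
    # mirroring LLaMAInteract.token_to_str() in the diff above.
    size = 32
    buffer = (ctypes.c_char * size)()
    n = llama_cpp.llama_token_to_piece_with_model(
        model, llama_cpp.llama_token(token_id), buffer, size)
    assert n <= size
    return bytes(buffer[:n])

# Decoding with errors="ignore" matches how past() and output() consume it:
# text = token_to_piece(model, some_id).decode("utf-8", errors="ignore")
```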

examples/low_level_api/low_level_api_llama_cpp.py

Lines changed: 17 additions & 9 deletions

@@ -1,15 +1,17 @@
-import llama_cpp
-
+import ctypes
+import os
 import multiprocessing
 
 import llama_cpp
 
 N_THREADS = multiprocessing.cpu_count()
+MODEL_PATH = os.environ.get('MODEL', "../models/7B/ggml-model.bin")
 
 prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n"
 
 lparams = llama_cpp.llama_context_default_params()
-ctx = llama_cpp.llama_init_from_file(b"../models/7B/ggml-model.bin", lparams)
+model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode('utf-8'), lparams)
+ctx = llama_cpp.llama_new_context_with_model(model, lparams)
 
 # determine the required inference memory per token:
 tmp = [0, 1, 2, 3]
@@ -58,7 +60,8 @@
         llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
         for token_id in range(n_vocab)
     ])
-    candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False))
+    candidates_p = llama_cpp.ctypes.pointer(
+        llama_cpp.llama_token_data_array(_arr, len(_arr), False))
 
     _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data)
     llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p,
@@ -68,9 +71,9 @@
        _arr,
        last_n_repeat, frequency_penalty, presence_penalty)
 
-    llama_cpp.llama_sample_top_k(ctx, candidates_p, 40)
-    llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8)
-    llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2)
+    llama_cpp.llama_sample_top_k(ctx, candidates_p, k=40, min_keep=1)
+    llama_cpp.llama_sample_top_p(ctx, candidates_p, p=0.8, min_keep=1)
+    llama_cpp.llama_sample_temperature(ctx, candidates_p, temp=0.2)
     id = llama_cpp.llama_sample_token(ctx, candidates_p)
 
     last_n_tokens_data = last_n_tokens_data[1:] + [id]
@@ -86,13 +89,18 @@
            break
    if not input_noecho:
        for id in embd:
+           size = 32
+           buffer = (ctypes.c_char * size)()
+           n = llama_cpp.llama_token_to_piece_with_model(
+               model, llama_cpp.llama_token(id), buffer, size)
+           assert n <= size
            print(
-               llama_cpp.llama_token_to_str(ctx, id).decode("utf-8", errors="ignore"),
+               buffer[:n].decode('utf-8'),
                end="",
                flush=True,
            )
 
-   if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos():
+   if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos(ctx):
        break
 
 print()
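One caveat with `buffer[:n].decode('utf-8')` in this example: a multi-byte character can be split across two tokens, in which case a plain `decode` raises `UnicodeDecodeError`. A small sketch (standard library only, not part of this commit) that buffers incomplete sequences with an incremental decoder:

```python
import codecs

# Incremental UTF-8 decoder: complete characters are returned immediately,
# trailing partial byte sequences are held back until the next piece arrives.
decoder = codecs.getincrementaldecoder("utf-8")(errors="ignore")

def emit(piece: bytes) -> str:
    # e.g. piece = buffer[:n] from llama_token_to_piece_with_model
    return decoder.decode(piece)
```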

llama_cpp/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -1,2 +1,4 @@
 from .llama_cpp import *
 from .llama import *
+
+from .version import __version__
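With `__version__` re-exported at the package root, the installed version can be checked directly, for example:

```python
import llama_cpp

# __version__ is re-exported from llama_cpp.version by the change above.
print(llama_cpp.__version__)
```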

llama_cpp/llama.py

Lines changed: 64 additions & 26 deletions

@@ -452,10 +452,10 @@ def detokenize(self, tokens: List[int]) -> bytes:
         """
         assert self.model is not None
         output = b""
-        size = 8
+        size = 32
         buffer = (ctypes.c_char * size)()
         for token in tokens:
-            n = llama_cpp.llama_token_to_str_with_model(
+            n = llama_cpp.llama_token_to_piece_with_model(
                 self.model, llama_cpp.llama_token(token), buffer, size
             )
             assert n <= size
@@ -1007,13 +1007,15 @@ def _create_completion(
                        break
 
                token_end_position = 0
-               for token in remaining_tokens:
-                   token_end_position += len(self.detokenize([token]))
-                   # Check if stop sequence is in the token
-                   if token_end_position >= (remaining_length - first_stop_position):
-                       break
-                   logprobs_or_none: Optional[CompletionLogprobs] = None
-                   if logprobs is not None:
+
+               if logprobs is not None:
+                   # not sure how to handle this branch when dealing
+                   # with CJK output, so keep it unchanged
+                   for token in remaining_tokens:
+                       token_end_position += len(self.detokenize([token]))
+                       # Check if stop sequence is in the token
+                       if token_end_position > (remaining_length - first_stop_position):
+                           break
                        token_str = self.detokenize([token]).decode(
                            "utf-8", errors="ignore"
                        )
@@ -1046,23 +1048,59 @@ def _create_completion(
                            "token_logprobs": [current_logprobs[int(token)]],
                            "top_logprobs": [top_logprob],
                        }
-                   returned_tokens += 1
-                   yield {
-                       "id": completion_id,
-                       "object": "text_completion",
-                       "created": created,
-                       "model": model_name,
-                       "choices": [
-                           {
-                               "text": self.detokenize([token]).decode(
-                                   "utf-8", errors="ignore"
-                               ),
-                               "index": 0,
-                               "logprobs": logprobs_or_none,
-                               "finish_reason": None,
-                           }
-                       ],
-                   }
+                       returned_tokens += 1
+                       yield {
+                           "id": completion_id,
+                           "object": "text_completion",
+                           "created": created,
+                           "model": model_name,
+                           "choices": [
+                               {
+                                   "text": self.detokenize([token]).decode(
+                                       "utf-8", errors="ignore"
+                                   ),
+                                   "index": 0,
+                                   "logprobs": logprobs_or_none,
+                                   "finish_reason": None,
+                               }
+                           ],
+                       }
+               else:
+                   while len(remaining_tokens) > 0:
+                       decode_success = False
+                       for i in range(1, len(remaining_tokens) + 1):
+                           try:
+                               bs = self.detokenize(remaining_tokens[:i])
+                               ts = bs.decode('utf-8')
+                               decode_success = True
+                               break
+                           except UnicodeError:
+                               pass
+                       else:
+                           break
+                       if not decode_success:
+                           # all remaining tokens cannot be decoded to a UTF-8 character
+                           break
+                       token_end_position += len(bs)
+                       if token_end_position > (remaining_length - first_stop_position):
+                           break
+                       remaining_tokens = remaining_tokens[i:]
+                       returned_tokens += i
+
+                       yield {
+                           "id": completion_id,
+                           "object": "text_completion",
+                           "created": created,
+                           "model": model_name,
+                           "choices": [
+                               {
+                                   "text": ts,
+                                   "index": 0,
+                                   "logprobs": None,
+                                   "finish_reason": None,
+                               }
+                           ],
+                       }
 
                if len(completion_tokens) >= max_tokens:
                    text = self.detokenize(completion_tokens)