Skip to content

Commit c54b67c

Browse files
authored
Merge branch 'ggerganov:master' into avx_opt
2 parents a847973 + 54ef9cf commit c54b67c

File tree

24 files changed

+1558
-698
lines changed

24 files changed

+1558
-698
lines changed

.github/workflows/build.yml

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,13 @@ jobs:
5555
sysctl -a
5656
mkdir build
5757
cd build
58-
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
58+
cmake .. \
59+
-DLLAMA_FATAL_WARNINGS=ON \
60+
-DLLAMA_CURL=ON \
61+
-DGGML_METAL_USE_BF16=ON \
62+
-DGGML_METAL_EMBED_LIBRARY=ON \
63+
-DGGML_RPC=ON \
64+
-DBUILD_SHARED_LIBS=OFF
5965
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
6066
6167
- name: Test
@@ -113,7 +119,12 @@ jobs:
113119
sysctl -a
114120
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
115121
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
116-
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
122+
cmake -B build \
123+
-DLLAMA_FATAL_WARNINGS=ON \
124+
-DLLAMA_CURL=ON \
125+
-DGGML_METAL=OFF \
126+
-DGGML_RPC=ON \
127+
-DBUILD_SHARED_LIBS=OFF
117128
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
118129
119130
- name: Test
@@ -569,6 +580,7 @@ jobs:
569580
mkdir build
570581
cd build
571582
cmake -G Xcode .. \
583+
-DGGML_METAL_USE_BF16=ON \
572584
-DGGML_METAL_EMBED_LIBRARY=ON \
573585
-DLLAMA_BUILD_EXAMPLES=OFF \
574586
-DLLAMA_BUILD_TESTS=OFF \
@@ -599,6 +611,7 @@ jobs:
599611
mkdir build
600612
cd build
601613
cmake -G Xcode .. \
614+
-DGGML_METAL_USE_BF16=ON \
602615
-DGGML_METAL_EMBED_LIBRARY=ON \
603616
-DLLAMA_BUILD_EXAMPLES=OFF \
604617
-DLLAMA_BUILD_TESTS=OFF \

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -878,6 +878,10 @@ ifdef GGML_METAL
878878
MK_CPPFLAGS += -DGGML_USE_METAL
879879
MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
880880
OBJ_GGML += ggml/src/ggml-metal.o
881+
882+
ifdef GGML_METAL_USE_BF16
883+
MK_CPPFLAGS += -DGGML_METAL_USE_BF16
884+
endif # GGML_METAL_USE_BF16
881885
ifdef GGML_METAL_NDEBUG
882886
MK_CPPFLAGS += -DGGML_METAL_NDEBUG
883887
endif

Package.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,13 +61,15 @@ let package = Package(
6161
name: "llama",
6262
path: ".",
6363
exclude: [
64+
"build",
6465
"cmake",
6566
"examples",
6667
"scripts",
6768
"models",
6869
"tests",
6970
"CMakeLists.txt",
70-
"Makefile"
71+
"Makefile",
72+
"ggml/src/ggml-metal-embed.metal"
7173
],
7274
sources: sources,
7375
resources: resources,

ci/run.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ SRC=`pwd`
3939
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
4040

4141
if [ ! -z ${GG_BUILD_METAL} ]; then
42-
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
42+
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
4343
fi
4444

4545
if [ ! -z ${GG_BUILD_CUDA} ]; then

common/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ struct common_params {
178178
float yarn_beta_fast = 32.0f; // YaRN low correction dim
179179
float yarn_beta_slow = 1.0f; // YaRN high correction dim
180180
int32_t yarn_orig_ctx = 0; // YaRN original context length
181-
float defrag_thold = -1.0f; // KV cache defragmentation threshold
181+
float defrag_thold = 0.1f; // KV cache defragmentation threshold
182182

183183
struct cpu_params cpuparams;
184184
struct cpu_params cpuparams_batch;

examples/chat-persistent.sh

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
2323
NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
2424
NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
2525

26-
SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
27-
SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
26+
SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
27+
'|'\
28+
'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
2829
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
2930

3031
CTX_SIZE=2048
@@ -129,15 +130,12 @@ while read -e line; do
129130

130131
printf ' '
131132

132-
# HACK get num tokens from debug message
133-
# TODO get both messages in one go
134-
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
135-
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
133+
if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
136134
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
137135
exit 1
138136
fi
139137

140-
n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
138+
n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")
141139

142140
if ((n_tokens > CTX_ROTATE_POINT)); then
143141
tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"

examples/llama-bench/llama-bench.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,9 @@ static ggml_type ggml_type_from_name(const std::string & s) {
256256
if (s == "f16") {
257257
return GGML_TYPE_F16;
258258
}
259+
if (s == "bf16") {
260+
return GGML_TYPE_BF16;
261+
}
259262
if (s == "q8_0") {
260263
return GGML_TYPE_Q8_0;
261264
}

examples/server/README.md

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ The project is under active development, and we are [looking for feedback and co
3939
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
4040
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
4141
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
42-
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
42+
| `-c, --ctx-size N` | size of the prompt context (default: 4096, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
4343
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
4444
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
4545
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
@@ -64,7 +64,7 @@ The project is under active development, and we are [looking for feedback and co
6464
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
6565
| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
6666
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
67-
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
67+
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
6868
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
6969
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
7070
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
@@ -99,25 +99,27 @@ The project is under active development, and we are [looking for feedback and co
9999

100100
| Argument | Explanation |
101101
| -------- | ----------- |
102-
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;typ_p;top_p;min_p;temperature) |
102+
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: dry;top_k;typ_p;top_p;min_p;xtc;temperature) |
103103
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
104-
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
104+
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: dkypmxt) |
105105
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
106106
| `--penalize-nl` | penalize newline tokens (default: false) |
107107
| `--temp N` | temperature (default: 0.8) |
108108
| `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
109109
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
110110
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
111+
| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
112+
| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
111113
| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
112114
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
113115
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
114116
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
115117
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
116-
| `--dry-multiplier N` | DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
117-
| `--dry-base N` | DRY sampling base value (default: 1.75) |
118-
| `--dry-allowed-length N` | allowed length for DRY sampling (default: 2) |
119-
| `--dry-penalty-last-n N` | DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
120-
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers (`['\n', ':', '"', '*']`) in the process; use `"none"` to not use any sequence breakers
118+
| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
119+
| `--dry-base N` | set DRY sampling base value (default: 1.75) |
120+
| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) |
121+
| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
122+
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers<br/> |
121123
| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
122124
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
123125
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |

0 commit comments

Comments (0)