
Commit 76fe14c

Merge branch 'master' into layla-build
2 parents: b5b05a8 + 7691654 · commit 76fe14c

37 files changed: +2509 −1622 lines

.dockerignore

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 *.o
 *.a
 .cache/
-.git/
+# Do not ignore .git directory, otherwise the reported build number will always be 0
 .github/
 .gitignore
 .vs/

.github/workflows/docker.yml

Lines changed: 38 additions & 20 deletions
@@ -15,11 +15,17 @@ on:
     branches:
       - master
     paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
+  workflow_dispatch: # allows manual triggering, useful for debugging
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
   cancel-in-progress: true
 
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  packages: write
+
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
@@ -46,6 +52,8 @@ jobs:
     steps:
       - name: Check out the repo
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # preserve git history, so we can determine the build number
 
       - name: Set up QEMU
         uses: docker/setup-qemu-action@v2
@@ -60,6 +68,34 @@ jobs:
           username: ${{ github.repository_owner }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
+          REPO_NAME="${{ github.event.repository.name }}"
+
+          # determine tag name postfix (build number, commit hash)
+          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
+            TAG_POSTFIX="b${BUILD_NUMBER}"
+          else
+            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
+            TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}"
+          fi
+
+          # list all tags possible
+          TAGS=""
+          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }},"
+          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}"
+
+          echo "output_tags=$TAGS" >> $GITHUB_OUTPUT
+          echo "output_tags=$TAGS" # print out for debugging
+        env:
+          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
+
       # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
       - name: Free Disk Space (Ubuntu)
         uses: jlumbroso/free-disk-space@main
@@ -77,31 +113,13 @@ jobs:
           docker-images: true
           swap-storage: true
 
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Downcase github.repository_owner
-        run: |
-          echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
-        env:
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
-
       - name: Build and push Docker image (tagged + versioned)
         if: github.event_name == 'push'
         uses: docker/build-push-action@v6
         with:
           context: .
           push: true
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.output_tags }}
           file: ${{ matrix.config.dockerfile }}
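Note: the rewritten tagging logic publishes two tags per image: a stable ghcr.io/<owner>/<repo>:<tag> and a versioned one whose postfix is b<build-number> on master (the build number is `git rev-list --count HEAD`, which is why the checkout now uses fetch-depth: 0 and .git is no longer docker-ignored) or <branch>-<short-hash> on other branches. A minimal C++ sketch of the same naming scheme, for illustration only (the function name and the example values below are hypothetical):

#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>

// Illustrative re-statement of the tag scheme built by the "Determine tag name" step above.
static std::string make_tags(std::string owner, const std::string & repo, const std::string & base_tag,
                             std::string branch, long build_number, const std::string & short_hash) {
    // mirrors ${GITHUB_REPOSITORY_OWNER@L} (lower-casing the owner)
    std::transform(owner.begin(), owner.end(), owner.begin(),
                   [](unsigned char c) { return (char) std::tolower(c); });
    // mirrors `tr '/' '-'` (make the branch name safe for a tag)
    std::replace(branch.begin(), branch.end(), '/', '-');

    const std::string postfix = (branch == "master")
        ? "b" + std::to_string(build_number)       // master: build-number postfix
        : branch + "-" + short_hash;               // other branches: branch + short hash

    const std::string image = "ghcr.io/" + owner + "/" + repo + ":" + base_tag;
    return image + "," + image + "-" + postfix;    // stable tag, then versioned tag
}

int main() {
    // hypothetical example values; prints:
    // ghcr.io/example-owner/llama.cpp:full,ghcr.io/example-owner/llama.cpp:full-b1234
    std::cout << make_tags("Example-Owner", "llama.cpp", "full", "master", 1234, "76fe14c") << "\n";
}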

README.md

Lines changed: 1 addition & 0 deletions
@@ -112,6 +112,7 @@ Typically finetunes of the base models below are supported as well.
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
 - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
+- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
 - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
 - Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)

common/arg.cpp

Lines changed: 28 additions & 28 deletions
@@ -691,7 +691,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(llama_arg(
         {"--chunks"}, "N",
         format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1102,7 +1102,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
     add_opt(llama_arg(
         {"--attention"}, "{causal,non,causal}",
         "attention type for embeddings, use model default if unspecified",
@@ -1121,77 +1121,77 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
     add_opt(llama_arg(
         {"--rope-scale"}, "N",
         "RoPE context scaling factor, expands context by a factor of N",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_scale = 1.0f / std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_SCALE"));
     add_opt(llama_arg(
         {"--rope-freq-base"}, "N",
         "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_base = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
     add_opt(llama_arg(
         {"--rope-freq-scale"}, "N",
         "RoPE frequency scaling factor, expands context by a factor of 1/N",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_scale = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
     add_opt(llama_arg(
         {"--yarn-orig-ctx"}, "N",
         format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
         [](gpt_params & params, int value) {
             params.yarn_orig_ctx = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
     add_opt(llama_arg(
         {"--yarn-ext-factor"}, "N",
         format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
         [](gpt_params & params, const std::string & value) {
             params.yarn_ext_factor = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
     add_opt(llama_arg(
         {"--yarn-attn-factor"}, "N",
         format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
         [](gpt_params & params, const std::string & value) {
             params.yarn_attn_factor = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
     add_opt(llama_arg(
         {"--yarn-beta-slow"}, "N",
         format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
         [](gpt_params & params, const std::string & value) {
             params.yarn_beta_slow = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
     add_opt(llama_arg(
         {"--yarn-beta-fast"}, "N",
         format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
         [](gpt_params & params, const std::string & value) {
             params.yarn_beta_fast = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
     add_opt(llama_arg(
         {"-gan", "--grp-attn-n"}, "N",
         format("group-attention factor (default: %d)", params.grp_attn_n),
         [](gpt_params & params, int value) {
             params.grp_attn_n = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_GRP_ATTN_N"));
     add_opt(llama_arg(
         {"-gaw", "--grp-attn-w"}, "N",
         format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
         [](gpt_params & params, int value) {
             params.grp_attn_w = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_GRP_ATTN_W"));
     add_opt(llama_arg(
         {"-dkvc", "--dump-kv-cache"},
         "verbose print of the KV cache",
@@ -1205,23 +1205,23 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.no_kv_offload = true;
         }
-    ));
+    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(llama_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
         [](gpt_params & params, const std::string & value) {
             // TODO: get the type right here
             params.cache_type_k = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(llama_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
         format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
         [](gpt_params & params, const std::string & value) {
             // TODO: get the type right here
             params.cache_type_v = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(llama_arg(
         {"--perplexity", "--all-logits"},
         format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
@@ -1355,22 +1355,22 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.rpc_servers = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_RPC"));
 #endif
     add_opt(llama_arg(
         {"--mlock"},
         "force system to keep model in RAM rather than swapping or compressing",
         [](gpt_params & params) {
             params.use_mlock = true;
         }
-    ));
+    ).set_env("LLAMA_ARG_MLOCK"));
     add_opt(llama_arg(
         {"--no-mmap"},
         "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
         [](gpt_params & params) {
             params.use_mmap = false;
         }
-    ));
+    ).set_env("LLAMA_ARG_NO_MMAP"));
     add_opt(llama_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"
@@ -1385,7 +1385,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ));
+    ).set_env("LLAMA_ARG_NUMA"));
     add_opt(llama_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -1433,7 +1433,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_SPLIT_MODE"));
     add_opt(llama_arg(
         {"-ts", "--tensor-split"}, "N0,N1,N2,...",
         "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
@@ -1460,7 +1460,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_TENSOR_SPLIT"));
     add_opt(llama_arg(
         {"-mg", "--main-gpu"}, "INDEX",
         format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
@@ -1470,7 +1470,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_MAIN_GPU"));
     add_opt(llama_arg(
         {"--check-tensors"},
         format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
@@ -1533,7 +1533,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.model_alias = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
     add_opt(llama_arg(
         {"-m", "--model"}, "FNAME",
         ex == LLAMA_EXAMPLE_EXPORT_LORA
@@ -1741,7 +1741,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.public_path = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
     add_opt(llama_arg(
         {"--embedding", "--embeddings"},
         format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -1779,22 +1779,22 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.ssl_file_key = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
     add_opt(llama_arg(
         {"--ssl-cert-file"}, "FNAME",
         "path to file a PEM-encoded SSL certificate",
         [](gpt_params & params, const std::string & value) {
             params.ssl_file_cert = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
     add_opt(llama_arg(
         {"-to", "--timeout"}, "N",
         format("server read/write timeout in seconds (default: %d)", params.timeout_read),
         [](gpt_params & params, int value) {
             params.timeout_read = value;
             params.timeout_write = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
     add_opt(llama_arg(
         {"--threads-http"}, "N",
         format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
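Note: the net effect of the added .set_env(...) calls is that each of these options can now also be supplied through the corresponding LLAMA_ARG_* environment variable (for example LLAMA_ARG_NO_MMAP, LLAMA_ARG_MAIN_GPU, LLAMA_ARG_TIMEOUT), which is convenient for container deployments. A minimal sketch of the environment-fallback idea is shown below; the helper, the precedence, and the flag semantics are illustrative assumptions, not the actual llama_arg implementation:

#include <cstdlib>
#include <optional>
#include <string>

// Hypothetical helper: read an option from its LLAMA_ARG_* environment variable.
// An explicit command-line flag is generally expected to win over the environment;
// that precedence is assumed here, not taken from this diff.
static std::optional<std::string> env_value(const char * name) {
    if (const char * v = std::getenv(name)) {
        return std::string(v);
    }
    return std::nullopt;
}

int main() {
    // e.g. run as: LLAMA_ARG_MLOCK=1 LLAMA_ARG_MAIN_GPU=1 ./llama-server ...
    const bool use_mlock = env_value("LLAMA_ARG_MLOCK").has_value();                 // flag-style option
    const int  main_gpu  = std::stoi(env_value("LLAMA_ARG_MAIN_GPU").value_or("0")); // value-style option
    (void) use_mlock;
    (void) main_gpu;
    return 0;
}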

common/log.cpp

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ struct gpt_log_entry {
             }
         }
 
-        if (level != GGML_LOG_LEVEL_NONE && prefix) {
+        if (level != GGML_LOG_LEVEL_NONE && level != GGML_LOG_LEVEL_CONT && prefix) {
             if (timestamp) {
                 // [M.s.ms.us]
                 fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",

common/log.h

Lines changed: 2 additions & 0 deletions
@@ -83,8 +83,10 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w
 #define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__)
 #define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__)
 #define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
+#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, 0, __VA_ARGS__)
 
 #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__)
 #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__)
 #define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
 #define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
+#define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT, verbosity, __VA_ARGS__)
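Note: together with the common/log.cpp change above, which skips the prefix and timestamp for GGML_LOG_LEVEL_CONT, the new LOG_CNT/LOG_CNTV macros are intended for continuing a line that a previous log call started. A small usage sketch (the function and its parameter are made up for illustration):

#include "log.h" // common/log.h

// Illustrative only: emit one progress line in pieces. The LOG_INF call starts the
// line with the usual prefix/timestamp; the LOG_CNT calls continue it without
// emitting a new prefix, thanks to the GGML_LOG_LEVEL_CONT handling in log.cpp.
static void report_progress(int n_steps) {
    LOG_INF("processing %d steps ", n_steps);
    for (int i = 0; i < n_steps; i++) {
        LOG_CNT(".");
    }
    LOG_CNT(" done\n");
}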

common/sampling.cpp

Lines changed: 9 additions & 1 deletion
@@ -225,7 +225,15 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
             GGML_ASSERT(false && "unknown mirostat version");
         }
     } else {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
+        if (params.n_probs > 0) {
+            // some use cases require to sample greedily, but still obtain the probabilities of the top tokens
+            // ref: https://github.com/ggerganov/llama.cpp/pull/9605
+            //
+            // the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but
+            // it is much faster, since we avoid sorting all tokens and should give a good approximation
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
+        }
         llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
     }
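Note: the new branch keeps greedy decoding but still exposes per-token probabilities: when params.n_probs > 0 the chain becomes top-k(n_probs) → softmax → greedy, so only the n_probs highest-logit candidates are normalized instead of the full vocabulary. A standalone sketch of that chain, using the llama_sampler_* calls that appear in the diff (the chain-initialization and free calls are assumed from the public llama.h API):

#include "llama.h"

// Sketch of the greedy-with-probabilities chain added above; `n_probs` plays the
// role of params.n_probs in gpt_sampler_init().
static struct llama_sampler * make_greedy_with_probs(int n_probs) {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    if (n_probs > 0) {
        // keep and normalize only the top n_probs candidates: an approximation of
        // softmax over the full vocabulary, but much cheaper (no full sort)
        llama_sampler_chain_add(chain, llama_sampler_init_top_k(n_probs));
        llama_sampler_chain_add(chain, llama_sampler_init_softmax());
    }
    llama_sampler_chain_add(chain, llama_sampler_init_greedy());
    return chain; // the caller releases it with llama_sampler_free(chain)
}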
