Commit df080b0

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   README.md
#   examples/server/README.md
#   examples/speculative/speculative.cpp
#   flake.lock
#   ggml/src/CMakeLists.txt
#   scripts/sync-ggml.last
#   tests/test-backend-ops.cpp
2 parents bfa118e + 2a82891 commit df080b0


41 files changed: 147,491 additions, 145,726 deletions

common/common.h

Lines changed: 1 addition & 1 deletion
```diff
@@ -174,7 +174,7 @@ struct common_params {
     float   yarn_beta_fast = 32.0f; // YaRN low correction dim
     float   yarn_beta_slow = 1.0f;  // YaRN high correction dim
     int32_t yarn_orig_ctx  = 0;     // YaRN original context length
-    float   defrag_thold   = -1.0f; // KV cache defragmentation threshold
+    float   defrag_thold   = 0.1f;  // KV cache defragmentation threshold

     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
```
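This flips KV-cache defragmentation from disabled by default (a negative threshold turns the check off) to enabled at a 10% threshold. As a rough illustration of the semantics, here is a minimal C++ sketch; the `kv_cache_stats` struct and `should_defrag` helper are hypothetical stand-ins, not llama.cpp's actual cache code:

```cpp
#include <cstdint>

// Hypothetical stand-in for the cache bookkeeping (not llama.cpp's types).
struct kv_cache_stats {
    int64_t n_cells; // total cells in the KV cache
    int64_t n_used;  // cells currently holding tokens
};

// A negative threshold disables the check entirely; otherwise defragment
// once the fraction of fragmented cells exceeds the threshold.
bool should_defrag(const kv_cache_stats & kv, float defrag_thold) {
    if (defrag_thold < 0.0f) {
        return false; // old default (-1.0f): never defragment
    }
    const float fragmentation = kv.n_cells > 0
        ? 1.0f - (float) kv.n_used / (float) kv.n_cells
        : 0.0f;
    return fragmentation > defrag_thold; // new default (0.1f): defrag above 10%
}
```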

examples/chat-persistent.sh

Lines changed: 5 additions & 7 deletions
```diff
@@ -23,8 +23,9 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
 NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
 NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"

-SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
-SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
+SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
+'|'\
+'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
 SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"

 CTX_SIZE=2048
@@ -129,15 +130,12 @@ while read -e line; do

     printf ' '

-    # HACK get num tokens from debug message
-    # TODO get both messages in one go
-    if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
-       ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
+    if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
        echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
        exit 1
    fi

-    n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
+    n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")

    if ((n_tokens > CTX_ROTATE_POINT)); then
        tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
```
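The rewrite merges the two grep patterns into a single alternation and sums the token counts that follow the last `/` of each matched line via `cut` and `awk`. For clarity, here is the same extract-and-sum logic mirrored as a self-contained C++ sketch; the sample log lines are fabricated and only shaped like llama-cli output:

```cpp
#include <iostream>
#include <regex>
#include <string>
#include <vector>

// Mirror of the script's one-pass extraction: match either log line,
// take the integer after the last '/', and sum the counts.
int main() {
    const std::vector<std::string> log = {
        "main: session file matches 120 / 512",
        "sampling time =    34.56 ms /    48 tokens",
    };
    const std::regex pattern(
        "main: session file matches [0-9]+ / [0-9]+"
        "|sampling time =\\s+[0-9]+\\.[0-9]+ ms /\\s+[0-9]+");

    long n_tokens = 0;
    for (const std::string & line : log) {
        std::smatch m;
        if (std::regex_search(line, m, pattern)) {
            const std::string s = m.str();
            // everything after the last '/' in the match is the count
            n_tokens += std::stol(s.substr(s.rfind('/') + 1));
        }
    }
    std::cout << n_tokens << '\n'; // prints 560 for these sample lines
    return 0;
}
```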

examples/convert_legacy_llama.py

Lines changed: 24 additions & 2 deletions
```diff
@@ -840,6 +840,8 @@ def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None
                     self.gguf.add_base_model_version(key, base_model_entry["version"])
                 if "organization" in base_model_entry:
                     self.gguf.add_base_model_organization(key, base_model_entry["organization"])
+                if "description" in base_model_entry:
+                    self.gguf.add_base_model_description(key, base_model_entry["description"])
                 if "url" in base_model_entry:
                     self.gguf.add_base_model_url(key, base_model_entry["url"])
                 if "doi" in base_model_entry:
@@ -849,12 +851,32 @@ def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None
                 if "repo_url" in base_model_entry:
                     self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])

+        if metadata.datasets is not None:
+            self.gguf.add_dataset_count(len(metadata.datasets))
+            for key, dataset_entry in enumerate(metadata.datasets):
+                if "name" in dataset_entry:
+                    self.gguf.add_dataset_name(key, dataset_entry["name"])
+                if "author" in dataset_entry:
+                    self.gguf.add_dataset_author(key, dataset_entry["author"])
+                if "version" in dataset_entry:
+                    self.gguf.add_dataset_version(key, dataset_entry["version"])
+                if "organization" in dataset_entry:
+                    self.gguf.add_dataset_organization(key, dataset_entry["organization"])
+                if "description" in dataset_entry:
+                    self.gguf.add_dataset_description(key, dataset_entry["description"])
+                if "url" in dataset_entry:
+                    self.gguf.add_dataset_url(key, dataset_entry["url"])
+                if "doi" in dataset_entry:
+                    self.gguf.add_dataset_doi(key, dataset_entry["doi"])
+                if "uuid" in dataset_entry:
+                    self.gguf.add_dataset_uuid(key, dataset_entry["uuid"])
+                if "repo_url" in dataset_entry:
+                    self.gguf.add_dataset_repo_url(key, dataset_entry["repo_url"])
+
         if metadata.tags is not None:
             self.gguf.add_tags(metadata.tags)
         if metadata.languages is not None:
             self.gguf.add_languages(metadata.languages)
-        if metadata.datasets is not None:
-            self.gguf.add_datasets(metadata.datasets)

     def add_meta_arch(self, params: Params) -> None:
         # Metadata About The Neural Architecture Itself
```
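Instead of writing all dataset names as one flat list via `add_datasets(...)`, the converter now emits per-entry keyed metadata, mirroring the existing `base_model` layout. As a sketch of what this plausibly looks like at the GGUF level, here is a hedged C++ example built on ggml's GGUF C API; the `general.dataset.N.*` key names are an assumption inferred from the `base_model` pattern, so treat the gguf-py constants as authoritative:

```cpp
#include <cstdint>
#include <string>
#include <vector>

#include "ggml.h" // GGUF C API: gguf_init_empty, gguf_set_val_*, gguf_write_to_file

// Illustrative subset of the dataset fields the converter now writes.
struct dataset_entry {
    std::string name;
    std::string url;
};

// Assumed key layout (general.dataset.N.*), following the base_model pattern.
static void add_datasets(struct gguf_context * ctx, const std::vector<dataset_entry> & datasets) {
    gguf_set_val_u32(ctx, "general.dataset.count", (uint32_t) datasets.size());
    for (size_t i = 0; i < datasets.size(); i++) {
        const std::string prefix = "general.dataset." + std::to_string(i) + ".";
        if (!datasets[i].name.empty()) {
            gguf_set_val_str(ctx, (prefix + "name").c_str(), datasets[i].name.c_str());
        }
        if (!datasets[i].url.empty()) {
            gguf_set_val_str(ctx, (prefix + "url").c_str(), datasets[i].url.c_str());
        }
    }
}

int main() {
    struct gguf_context * ctx = gguf_init_empty();
    add_datasets(ctx, {{"wikitext-103", "https://example.com/wikitext-103"}}); // example data
    gguf_write_to_file(ctx, "meta-only.gguf", /*only_meta=*/true);
    gguf_free(ctx);
    return 0;
}
```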

examples/server/public/index.html

Lines changed: 97 additions & 9 deletions
```diff
@@ -200,23 +200,38 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
             <div class="label">System Message</div>
             <textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
           </label>
-          <template v-for="key in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
-            <label class="input input-bordered flex items-center gap-2 mb-2">
-              <b>{{ key }}</b>
-              <input type="text" class="grow" :placeholder="'Default: ' + (configDefault[key] || 'none')" v-model="config[key]" />
-            </label>
+          <template v-for="configKey in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
+            <settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
           </template>
           <!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
-          <div class="collapse collapse-arrow bg-base-200 mb-2">
-            <input type="checkbox" />
-            <div class="collapse-title font-bold">Advanced config</div>
+          <!-- Section: Other sampler settings -->
+          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
+            <summary class="collapse-title font-bold">Other sampler settings</summary>
+            <div class="collapse-content">
+              <template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
+                <settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
+              </template>
+            </div>
+          </details>
+          <!-- Section: Penalties settings -->
+          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
+            <summary class="collapse-title font-bold">Penalties settings</summary>
+            <div class="collapse-content">
+              <template v-for="configKey in ['repeat_last_n', 'repeat_penalty', 'presence_penalty', 'frequency_penalty', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n']">
+                <settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
+              </template>
+            </div>
+          </details>
+          <!-- Section: Advanced config -->
+          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
+            <summary class="collapse-title font-bold">Advanced config</summary>
             <div class="collapse-content">
               <label class="form-control mb-2">
                 <div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
                 <textarea class="textarea textarea-bordered h-24" placeholder="Example: { &quot;mirostat&quot;: 1, &quot;min_p&quot;: 0.1 }" v-model="config.custom"></textarea>
               </label>
             </div>
-          </div>
+          </details>
         </div>

         <!-- action buttons -->
@@ -229,6 +244,21 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
     </dialog>
   </div>

+  <!-- Template to be used by settings modal -->
+  <template id="settings-modal-numeric-input">
+    <label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
+      <!-- Show help message on hovering on the input label -->
+      <div class="dropdown dropdown-hover">
+        <div tabindex="0" role="button" class="font-bold">{{ configKey }}</div>
+        <div class="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
+          {{ configInfo[configKey] || '(no help message available)' }}
+        </div>
+      </div>
+      <!-- Here we forward v-model from parent to child component, see: https://stackoverflow.com/questions/47311936/v-model-and-child-components -->
+      <input type="text" class="grow" :placeholder="'Default: ' + (configDefault[configKey] || 'none')" :value="modelValue" @input="$emit('update:modelValue', $event.target.value)" />
+    </label>
+  </template>
+
   <script src="./deps_markdown-it.js"></script>
   <script type="module">
     import { createApp, defineComponent, shallowRef, computed, h } from './deps_vue.esm-browser.js';
@@ -245,12 +275,48 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
      systemMessage: 'You are a helpful assistant.',
      // make sure these default values are in sync with `common.h`
      temperature: 0.8,
+      dynatemp_range: 0.0,
+      dynatemp_exponent: 1.0,
      top_k: 40,
      top_p: 0.95,
      min_p: 0.05,
+      xtc_probability: 0.0,
+      xtc_threshold: 0.1,
+      typical_p: 1.0,
+      repeat_last_n: 64,
+      repeat_penalty: 1.0,
+      presence_penalty: 0.0,
+      frequency_penalty: 0.0,
+      dry_multiplier: 0.0,
+      dry_base: 1.75,
+      dry_allowed_length: 2,
+      dry_penalty_last_n: -1,
      max_tokens: -1,
      custom: '', // custom json-stringified object
    };
+    const CONFIG_INFO = {
+      apiKey: '',
+      systemMessage: 'The starting message that defines how the model should behave.',
+      temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
+      dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
+      dynatemp_exponent: 'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.',
+      top_k: 'Keeps only k top tokens.',
+      top_p: 'Limits tokens to those that together have a cumulative probability of at least p.',
+      min_p: 'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
+      xtc_probability: 'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
+      xtc_threshold: 'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
+      typical_p: 'Sorts and limits tokens based on the difference between log-probability and entropy.',
+      repeat_last_n: 'Last n tokens to consider for penalizing repetition.',
+      repeat_penalty: 'Controls the repetition of token sequences in the generated text.',
+      presence_penalty: 'Limits tokens based on whether they appear in the output or not.',
+      frequency_penalty: 'Limits tokens based on how often they appear in the output.',
+      dry_multiplier: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
+      dry_base: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
+      dry_allowed_length: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
+      dry_penalty_last_n: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY penalty for the last n tokens.',
+      max_tokens: 'The maximum number of tokens per output.',
+      custom: '', // custom json-stringified object
+    };
    // config keys having numeric value (i.e. temperature, top_k, top_p, etc)
    const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
    // list of themes supported by daisyui
@@ -269,6 +335,12 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
      { props: ["source", "options", "plugins"] }
    );

+    // input field to be used by the settings modal
+    const SettingsModalNumericInput = defineComponent({
+      template: document.getElementById('settings-modal-numeric-input').innerHTML,
+      props: ['configKey', 'configDefault', 'configInfo', 'modelValue'],
+    });
+
    // conversations are stored in localStorage
    // format: { [convId]: { id: string, lastModified: number, messages: [...] } }
    // convId is a string prefixed with 'conv-'
@@ -359,6 +431,7 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
    const mainApp = createApp({
      components: {
        VueMarkdown,
+        SettingsModalNumericInput,
      },
      data() {
        return {
@@ -376,6 +449,7 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
          // const
          themes: THEMES,
          configDefault: {...CONFIG_DEFAULT},
+          configInfo: {...CONFIG_INFO},
        }
      },
      computed: {},
@@ -452,8 +526,22 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
        stream: true,
        cache_prompt: true,
        temperature: this.config.temperature,
+        dynatemp_range: this.config.dynatemp_range,
+        dynatemp_exponent: this.config.dynatemp_exponent,
        top_k: this.config.top_k,
        top_p: this.config.top_p,
+        min_p: this.config.min_p,
+        typical_p: this.config.typical_p,
+        xtc_probability: this.config.xtc_probability,
+        xtc_threshold: this.config.xtc_threshold,
+        repeat_last_n: this.config.repeat_last_n,
+        repeat_penalty: this.config.repeat_penalty,
+        presence_penalty: this.config.presence_penalty,
+        frequency_penalty: this.config.frequency_penalty,
+        dry_multiplier: this.config.dry_multiplier,
+        dry_base: this.config.dry_base,
+        dry_allowed_length: this.config.dry_allowed_length,
+        dry_penalty_last_n: this.config.dry_penalty_last_n,
        max_tokens: this.config.max_tokens,
        ...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
        ...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),
```
examples/server/server.cpp

Lines changed: 10 additions & 5 deletions
```diff
@@ -656,11 +656,16 @@ struct server_context {
     }

     bool validate_model_chat_template() const {
-        llama_chat_message chat[] = {{"user", "test"}};
-
-        const int res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
-
-        return res > 0;
+        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
+        std::string template_key = "tokenizer.chat_template";
+        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+        if (res >= 0) {
+            llama_chat_message chat[] = {{"user", "test"}};
+            std::string tmpl = std::string(model_template.data(), model_template.size());
+            int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
+            return chat_res > 0;
+        }
+        return false;
     }

     void init() {
```
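The old check passed `tmpl = nullptr`, which tells `llama_chat_apply_template` to fall back to the model's built-in template; the new code fetches `tokenizer.chat_template` from the GGUF metadata explicitly and only then test-applies it. Below is a hedged sketch of the same two calls that actually captures the rendered prompt instead of probing with a null buffer; the buffer sizes are guesses, and `render_test_prompt` is an illustrative helper, not server code:

```cpp
#include <cstdint>
#include <string>
#include <vector>

#include "llama.h"

// Assumes `model` was loaded elsewhere (e.g. via llama_load_model_from_file).
std::string render_test_prompt(const llama_model * model) {
    // step 1: read the embedded chat template out of the GGUF metadata
    std::vector<char> tmpl(2048, 0);
    if (llama_model_meta_val_str(model, "tokenizer.chat_template", tmpl.data(), tmpl.size()) < 0) {
        return ""; // model ships no chat template
    }

    // step 2: apply it to a one-message conversation
    llama_chat_message chat[] = {{"user", "test"}};
    std::vector<char> out(1024, 0);
    int32_t n = llama_chat_apply_template(model, tmpl.data(), chat, 1, /*add_ass=*/true, out.data(), out.size());
    if (n < 0) {
        return ""; // template exists but llama.cpp cannot apply it
    }
    if ((size_t) n > out.size()) {
        // return value is the required size; grow the buffer and retry
        out.resize(n);
        llama_chat_apply_template(model, tmpl.data(), chat, 1, true, out.data(), out.size());
    }
    return std::string(out.data(), n);
}
```

Passing `nullptr, 0` as the output buffer, as the diff itself does, is the probe-only form of the same call: the return value alone says whether the template applies.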

ggml/src/ggml-cuda/count-equal.cu

Lines changed: 1 addition & 1 deletion
```diff
@@ -44,7 +44,7 @@ void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int");
-    const int64_t dne = GGML_PAD(ne / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);
+    const int64_t dne = GGML_PAD((ne + 4*nsm - 1) / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);

     CUDA_CHECK(cudaMemsetAsync(dst_d, 0, ggml_nbytes(dst), stream));
```
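The fix replaces truncating division with ceiling division when splitting `ne` elements across `4*nsm` slices: for a tensor smaller than `4*nsm`, `ne / (4*nsm)` rounds down to 0 and `GGML_PAD` keeps it at 0, so no slice would cover any elements. A standalone arithmetic check of the two expressions (the constants are stand-ins; `PAD` reproduces ggml's `GGML_PAD` macro):

```cpp
#include <cstdint>
#include <cstdio>

// Reproduction of ggml's GGML_PAD: round x up to a multiple of n.
#define PAD(x, n) (((x) + (n) - 1) / (n) * (n))

int main() {
    const int64_t chunk = 128; // stand-in for CUDA_COUNT_EQUAL_CHUNK_SIZE
    const int64_t nsm   = 48;  // stand-in for the GPU's SM count
    const int64_t ne    = 100; // fewer elements than 4*nsm

    // before the fix: truncating division rounds to 0 elements per slice
    const int64_t dne_old = PAD(ne / (4*nsm), chunk);

    // after the fix: ceiling division guarantees at least one chunk
    const int64_t dne_new = PAD((ne + 4*nsm - 1) / (4*nsm), chunk);

    printf("old: %lld, new: %lld\n", (long long) dne_old, (long long) dne_new); // old: 0, new: 128
    return 0;
}
```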

ggml/src/ggml-metal.m

Lines changed: 3 additions & 6 deletions
```diff
@@ -596,17 +596,12 @@ @implementation GGMLMetalClass
         ctx->kernels[i].pipeline = nil;
     }

-/*
-        GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
-                (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
-                (int) kernel->pipeline.threadExecutionWidth); \
-*/
 #define GGML_METAL_ADD_KERNEL(e, name, supported) \
         if (supported) { \
             struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
             id<MTLFunction> metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \
             kernel->pipeline = [device newComputePipelineStateWithFunction:metal_function error:&error]; \
-            GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
+            GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
                 (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
                 (int) kernel->pipeline.threadExecutionWidth); \
             [metal_function release]; \
@@ -3046,6 +3041,8 @@ static void ggml_metal_encode_node(

     bool use_vec_kernel = false;

+    // TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0)
+    //       for now avoiding mainly to keep the number of templates/kernels a bit lower
     if (ne01 >= 4 || (ne00%128 != 0)) {
         switch (src1->type) {
             case GGML_TYPE_F16:
```
