Skip to content

Commit 5c8aa73

Browse files
authored
Merge branch 'layla-build' into merge
2 parents d79d8f3 + 5ab16ce commit 5c8aa73

19 files changed

+356
-7
lines changed

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,6 @@ autogen-*.md
7474
!.github/workflows/*.yml
7575

7676
# Models
77-
7877
models/*
7978
models-mnt
8079
!models/.editorconfig

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,4 +217,4 @@ endif()
217217
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
218218
add_subdirectory(examples)
219219
add_subdirectory(pocs)
220-
endif()
220+
endif()

common/common.cpp

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1953,3 +1953,213 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
19531953
return result;
19541954
}
19551955

1956+
//
1957+
// YAML utils
1958+
//
1959+
1960+
// Write a float vector as a YAML flow sequence: "name: [a, b, ...]".
// An empty vector is emitted as a bare "name:" line (YAML null).
void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
    if (data.empty()) {
        fprintf(stream, "%s:\n", prop_name);
        return;
    }

    fprintf(stream, "%s: [", prop_name);
    const char * sep = "";
    for (const float value : data) {
        fprintf(stream, "%s%e", sep, value);
        sep = ", ";
    }
    fprintf(stream, "]\n");
}
1972+
1973+
// Write an int vector as a YAML flow sequence: "name: [a, b, ...]".
// An empty vector is emitted as a bare "name:" line (YAML null).
void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
    if (data.empty()) {
        fprintf(stream, "%s:\n", prop_name);
        return;
    }

    fprintf(stream, "%s: [", prop_name);
    const char * sep = "";
    for (const int value : data) {
        fprintf(stream, "%s%d", sep, value);
        sep = ", ";
    }
    fprintf(stream, "]\n");
}
1985+
1986+
// Write a string as a YAML scalar under `prop_name`, choosing the safest form:
//  - empty/NULL            -> bare "name:" line
//  - leading/trailing ws   -> double-quoted scalar with \n, \" and \ escaped
//                             (block style would silently strip that whitespace)
//  - single line           -> plain "name: value"
//  - multi-line            -> literal block scalar ("name: |") with 2-space indent
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
    std::string data_str(data == NULL ? "" : data);

    if (data_str.empty()) {
        fprintf(stream, "%s:\n", prop_name);
        return;
    }

    size_t pos_start = 0;
    size_t pos_found = 0;

    // cast to unsigned char: std::isspace has undefined behavior for
    // negative char values (e.g. UTF-8 bytes on platforms with signed char)
    if (std::isspace(static_cast<unsigned char>(data_str[0])) || std::isspace(static_cast<unsigned char>(data_str.back()))) {
        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
        data_str = "\"" + data_str + "\"";
        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
        return;
    }

    if (data_str.find('\n') == std::string::npos) {
        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
        return;
    }

    fprintf(stream, "%s: |\n", prop_name);
    while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
        fprintf(stream, "  %s\n", data_str.substr(pos_start, pos_found - pos_start).c_str());
        pos_start = pos_found + 1;
    }
    // emit the segment after the last newline — in this branch the string never
    // ends with '\n' (a trailing '\n' is whitespace and takes the quoted branch),
    // so there is always a final segment; previously it was silently dropped
    if (pos_start < data_str.size()) {
        fprintf(stream, "  %s\n", data_str.substr(pos_start).c_str());
    }
}
2017+
2018+
void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
2019+
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
2020+
ggml_cpu_init(); // some ARM features are detected at runtime
2021+
2022+
const auto & sparams = params.sampling;
2023+
2024+
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
2025+
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
2026+
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
2027+
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
2028+
fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
2029+
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
2030+
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
2031+
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
2032+
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
2033+
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
2034+
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
2035+
fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
2036+
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
2037+
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
2038+
fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
2039+
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
2040+
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
2041+
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
2042+
fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
2043+
2044+
#ifdef NDEBUG
2045+
fprintf(stream, "debug: false\n");
2046+
#else
2047+
fprintf(stream, "debug: true\n");
2048+
#endif // NDEBUG
2049+
2050+
fprintf(stream, "model_desc: %s\n", model_desc);
2051+
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
2052+
2053+
#ifdef __OPTIMIZE__
2054+
fprintf(stream, "optimize: true\n");
2055+
#else
2056+
fprintf(stream, "optimize: false\n");
2057+
#endif // __OPTIMIZE__
2058+
2059+
fprintf(stream, "time: %s\n", timestamp.c_str());
2060+
2061+
fprintf(stream, "\n");
2062+
fprintf(stream, "###############\n");
2063+
fprintf(stream, "# User Inputs #\n");
2064+
fprintf(stream, "###############\n");
2065+
fprintf(stream, "\n");
2066+
2067+
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
2068+
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
2069+
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
2070+
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
2071+
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
2072+
fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
2073+
fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
2074+
fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
2075+
fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
2076+
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
2077+
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
2078+
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
2079+
yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
2080+
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
2081+
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
2082+
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
2083+
fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
2084+
2085+
yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
2086+
fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
2087+
yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
2088+
fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
2089+
fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
2090+
fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
2091+
//fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
2092+
2093+
fprintf(stream, "logit_bias:\n");
2094+
for (const auto & logit_bias : sparams.logit_bias) {
2095+
fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias);
2096+
}
2097+
2098+
fprintf(stream, "lora:\n");
2099+
for (auto & la : params.lora_adapters) {
2100+
if (la.scale == 1.0f) {
2101+
fprintf(stream, " - %s\n", la.path.c_str());
2102+
}
2103+
}
2104+
fprintf(stream, "lora_scaled:\n");
2105+
for (auto & la : params.lora_adapters) {
2106+
if (la.scale != 1.0f) {
2107+
fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
2108+
}
2109+
}
2110+
fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
2111+
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
2112+
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
2113+
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
2114+
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
2115+
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
2116+
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
2117+
fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
2118+
//fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
2119+
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
2120+
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
2121+
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
2122+
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
2123+
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
2124+
fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
2125+
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
2126+
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
2127+
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
2128+
yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
2129+
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
2130+
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
2131+
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
2132+
yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
2133+
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
2134+
2135+
fprintf(stream, "reverse_prompt:\n");
2136+
for (std::string ap : params.antiprompt) {
2137+
size_t pos = 0;
2138+
while ((pos = ap.find('\n', pos)) != std::string::npos) {
2139+
ap.replace(pos, 1, "\\n");
2140+
pos += 1;
2141+
}
2142+
2143+
fprintf(stream, " - %s\n", ap.c_str());
2144+
}
2145+
2146+
fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
2147+
fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
2148+
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
2149+
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
2150+
fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
2151+
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
2152+
2153+
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
2154+
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
2155+
2156+
fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
2157+
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
2158+
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
2159+
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
2160+
fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
2161+
fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
2162+
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
2163+
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
2164+
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
2165+
}

common/common.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -640,3 +640,15 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
640640
static const char * const LLM_KV_SPLIT_NO = "split.no";
641641
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
642642
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
643+
644+
//
645+
// YAML utils
646+
//
647+
648+
void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
649+
void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
650+
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
651+
652+
void yaml_dump_non_result_info(
653+
FILE * stream, const common_params & params, const llama_context * lctx,
654+
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

common/sampling.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,17 @@ struct ring_buffer {
6060
return value;
6161
}
6262

63+
// Remove and return the most recently pushed element.
// Throws std::runtime_error when the buffer is empty.
T pop_back() {
    if (sz == 0) {
        throw std::runtime_error("ring buffer is empty");
    }
    // step the write cursor back one slot, wrapping at the front
    // (capacity > 0 is guaranteed here since sz > 0)
    pos = (pos + capacity - 1) % capacity;
    T value = data[pos];
    sz--;
    return value;
}
73+
6374
const T & rat(size_t i) const {
6475
if (i >= sz) {
6576
throw std::runtime_error("ring buffer: index out of bounds");
@@ -161,6 +172,18 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
161172
params.logit_bias.size(),
162173
params.logit_bias.data()));
163174

175+
llama_sampler_chain_add(result->chain,
176+
llama_sampler_init_penalties(
177+
llama_n_vocab (model),
178+
llama_token_eos(model),
179+
llama_token_nl (model),
180+
params.penalty_last_n,
181+
params.penalty_repeat,
182+
params.penalty_freq,
183+
params.penalty_present,
184+
params.penalize_nl,
185+
params.ignore_eos));
186+
164187
if (params.mirostat == 0) {
165188
for (const auto & cnstr : params.samplers) {
166189
switch (cnstr) {
@@ -243,6 +266,12 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
243266
llama_sampler_reset(gsmpl->chain);
244267
}
245268

269+
void common_sampler_reinit_grammar(struct common_sampler * gsmpl, const struct llama_model * model, const char * grammar) {
270+
llama_sampler_reset(gsmpl->grmr);
271+
272+
gsmpl->grmr = llama_sampler_init_grammar(model, grammar, "root");
273+
}
274+
246275
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
247276
return new common_sampler {
248277
/* .params = */ gsmpl->params,
@@ -396,6 +425,21 @@ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_
396425
return result;
397426
}
398427

428+
// Return a snapshot of the accepted-token history as a vector.
const std::vector<llama_token> common_sampler_prev(common_sampler * gsmpl) {
    const auto & history = gsmpl->prev;
    return history.to_vector();
}
431+
432+
// Drop the last `rollback_num` tokens from the accepted-token history,
// clamped to the number of tokens actually stored. Non-positive counts are
// a no-op: previously a negative value was converted to unsigned by the
// comparison against prev.size(), clamped to the full size, and wiped the
// entire history.
void common_sampler_rollback(common_sampler * gsmpl, int rollback_num) {
    if (rollback_num <= 0) {
        return;
    }

    const int n_prev = (int) gsmpl->prev.size();
    if (rollback_num > n_prev) {
        rollback_num = n_prev;
    }

    // continuously pop the last token
    for (int i = 0; i < rollback_num; i++) {
        gsmpl->prev.pop_back();
    }
}
442+
399443
char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
400444
switch (cnstr) {
401445
case COMMON_SAMPLER_TYPE_DRY: return 'd';

common/sampling.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ void common_sampler_free(struct common_sampler * gsmpl);
4343
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
4444
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
4545
void common_sampler_reset (struct common_sampler * gsmpl);
46+
void common_sampler_reinit_grammar(struct common_sampler * gsmpl, const struct llama_model * model, const char * grammar);
4647
struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
4748

4849
// arguments can be nullptr to skip printing
@@ -96,6 +97,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl);
9697

9798
// get a string representation of the last accepted tokens
9899
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
100+
const std::vector<llama_token> common_sampler_prev(common_sampler * gsmpl);
101+
void common_sampler_rollback(common_sampler * gsmpl, int rollback_num);
99102

100103
char common_sampler_type_to_chr(enum common_sampler_type cnstr);
101104
std::string common_sampler_type_to_str(enum common_sampler_type cnstr);

ggml/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,13 @@ set(GGML_PUBLIC_HEADERS
246246
include/ggml-vulkan.h)
247247

248248
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
249+
250+
# link android log library
251+
if(ANDROID)
252+
find_library(log-lib log)
253+
target_link_libraries(ggml PRIVATE ${log-lib})
254+
endif()
255+
249256
#if (GGML_METAL)
250257
# set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
251258
#endif()

ggml/src/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,14 @@ add_library(ggml-base
217217
ggml-quants.c
218218
ggml-quants.h)
219219

220+
# Search for the 'log' library on Android
221+
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
222+
find_library(log-lib log)
223+
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${log-lib})
224+
225+
target_link_libraries(ggml-base PUBLIC ${GGML_EXTRA_LIBS})
226+
endif()
227+
220228
target_include_directories(ggml-base PRIVATE .)
221229

222230
add_library(ggml

ggml/src/ggml-quants.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5235,4 +5235,4 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
52355235
}
52365236

52375237
return true;
5238-
}
5238+
}

0 commit comments

Comments
 (0)