Commit 289e208

Merge branch 'layla-build' into merge
2 parents: 46c69e0 + 17b9dd5

20 files changed: +361 -17 lines

.gitignore

Lines changed: 0 additions & 1 deletion
@@ -74,7 +74,6 @@ autogen-*.md
 !.github/workflows/*.yml
 
 # Models
-
 models/*
 models-mnt
 !models/.editorconfig

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -223,4 +223,4 @@ endif()
 if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
     add_subdirectory(examples)
     add_subdirectory(pocs)
-endif()
+endif()

common/common.cpp

Lines changed: 210 additions & 0 deletions
@@ -1959,3 +1959,213 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
     return result;
 }
 
+//
+// YAML utils
+//
+
+void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
+    if (data.empty()) {
+        fprintf(stream, "%s:\n", prop_name);
+        return;
+    }
+
+    fprintf(stream, "%s: [", prop_name);
+    for (size_t i = 0; i < data.size() - 1; ++i) {
+        fprintf(stream, "%e, ", data[i]);
+    }
+    fprintf(stream, "%e]\n", data.back());
+}
+
+void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
+    if (data.empty()) {
+        fprintf(stream, "%s:\n", prop_name);
+        return;
+    }
+
+    fprintf(stream, "%s: [", prop_name);
+    for (size_t i = 0; i < data.size() - 1; ++i) {
+        fprintf(stream, "%d, ", data[i]);
+    }
+    fprintf(stream, "%d]\n", data.back());
+}
+
+void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
+    std::string data_str(data == NULL ? "" : data);
+
+    if (data_str.empty()) {
+        fprintf(stream, "%s:\n", prop_name);
+        return;
+    }
+
+    size_t pos_start = 0;
+    size_t pos_found = 0;
+
+    if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
+        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
+        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
+        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
+        data_str = "\"" + data_str + "\"";
+        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
+        return;
+    }
+
+    if (data_str.find('\n') == std::string::npos) {
+        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
+        return;
+    }
+
+    fprintf(stream, "%s: |\n", prop_name);
+    while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
+        fprintf(stream, "  %s\n", data_str.substr(pos_start, pos_found - pos_start).c_str());
+        pos_start = pos_found + 1;
+    }
+    if (pos_start < data_str.size()) {
+        // flush the trailing text after the last newline, which the loop above skips
+        fprintf(stream, "  %s\n", data_str.substr(pos_start).c_str());
+    }
+}
+
+void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
+                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
+    ggml_cpu_init(); // some ARM features are detected at runtime
+
+    const auto & sparams = params.sparams;
+
+    fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
+    fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
+    fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
+    fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
+    fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
+    fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
+    fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
+    fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
+    fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
+    fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
+    fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
+    fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
+    fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
+
+#ifdef NDEBUG
+    fprintf(stream, "debug: false\n");
+#else
+    fprintf(stream, "debug: true\n");
+#endif // NDEBUG
+
+    fprintf(stream, "model_desc: %s\n", model_desc);
+    fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
+
+#ifdef __OPTIMIZE__
+    fprintf(stream, "optimize: true\n");
+#else
+    fprintf(stream, "optimize: false\n");
+#endif // __OPTIMIZE__
+
+    fprintf(stream, "time: %s\n", timestamp.c_str());
+
+    fprintf(stream, "\n");
+    fprintf(stream, "###############\n");
+    fprintf(stream, "# User Inputs #\n");
+    fprintf(stream, "###############\n");
+    fprintf(stream, "\n");
+
+    fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
+    fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
+    fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
+    fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
+    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
+    fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
+    fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
+    fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
+    fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
+    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
+    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
+    fprintf(stream, "frequency_penalty: %f # default: 0.0\n", sparams.penalty_freq);
+    yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
+    fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
+    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
+    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
+    fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
+
+    yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
+    fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
+    yaml_dump_string_multiline(stream, "in_suffix", params.input_suffix.c_str());
+    fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
+    fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
+    fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
+    fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
+
+    fprintf(stream, "logit_bias:\n");
+    for (const auto & logit_bias : sparams.logit_bias) {
+        fprintf(stream, "  %d: %f\n", logit_bias.token, logit_bias.bias);
+    }
+
+    fprintf(stream, "lora:\n");
+    for (auto & la : params.lora_adapters) {
+        if (la.scale == 1.0f) {
+            fprintf(stream, "  - %s\n", la.path.c_str());
+        }
+    }
+    fprintf(stream, "lora_scaled:\n");
+    for (auto & la : params.lora_adapters) {
+        if (la.scale != 1.0f) {
+            fprintf(stream, "  - %s: %f\n", la.path.c_str(), la.scale);
+        }
+    }
+    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
+    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
+    fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
+    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
+    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
+    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
+    fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
+    fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
+    fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
+    fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
+    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
+    fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
+    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
+    fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
+    fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
+    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
+    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
+    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
+    yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
+    fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
+    fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
+    fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
+    yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
+    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
+
+    fprintf(stream, "reverse_prompt:\n");
+    for (std::string ap : params.antiprompt) {
+        size_t pos = 0;
+        while ((pos = ap.find('\n', pos)) != std::string::npos) {
+            ap.replace(pos, 1, "\\n");
+            pos += 1;
+        }
+
+        fprintf(stream, "  - %s\n", ap.c_str());
+    }
+
+    fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
+    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
+    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
+    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
+    fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
+    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
+
+    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
+    yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
+
+    fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
+    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
+    fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
+    fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
+    fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
+    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
+    fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
+}
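
For orientation, a minimal sketch of how these helpers could be driven from caller code. This is not part of the commit; the output file name, sample strings, and token values are invented for illustration, and the declarations are the ones added to common/common.h below.

// hypothetical caller of the yaml_dump_* helpers added above
#include <cstdio>
#include <vector>
#include "common.h"

int main() {
    FILE * f = fopen("dump.yaml", "w"); // hypothetical output path
    if (f == NULL) {
        return 1;
    }
    // a multi-line string without leading/trailing whitespace is emitted
    // as a YAML block scalar: "prompt: |" followed by indented lines
    yaml_dump_string_multiline(f, "prompt", "first line\nsecond line");
    // a vector is emitted as a flow sequence: "prompt_tokens: [1, 15043, 3186]"
    yaml_dump_vector_int(f, "prompt_tokens", {1, 15043, 3186});
    fclose(f);
    return 0;
}

yaml_dump_string_multiline picks its encoding from the content: quoted with escapes when the string starts or ends with whitespace, a plain scalar when it contains no newline, and a block scalar otherwise.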

common/common.h

Lines changed: 14 additions & 0 deletions
@@ -610,3 +610,17 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 static const char * const LLM_KV_SPLIT_NO = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+//
+// YAML utils
+//
+
+void yaml_dump_vector_float    (FILE * stream, const char * prop_name, const std::vector<float> & data);
+void yaml_dump_vector_int      (FILE * stream, const char * prop_name, const std::vector<int> & data);
+void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
+
+void yaml_dump_non_result_info(
+    FILE * stream, const common_params & params, const llama_context * lctx,
+    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

common/sampling.cpp

Lines changed: 45 additions & 9 deletions
@@ -60,6 +60,17 @@ struct ring_buffer {
         return value;
     }
 
+    T pop_back() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        // move pos backwards, wrapping around if necessary
+        pos = (pos == 0) ? capacity - 1 : pos - 1;
+        T value = data[pos];
+        sz--;
+        return value;
+    }
+
     const T & rat(size_t i) const {
         if (i >= sz) {
             throw std::runtime_error("ring buffer: index out of bounds");
@@ -163,15 +174,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     llama_sampler_chain_add(result->chain,
             llama_sampler_init_penalties(
-                llama_n_vocab  (model),
-                llama_token_eos(model),
-                llama_token_nl (model),
-                params.penalty_last_n,
-                params.penalty_repeat,
-                params.penalty_freq,
-                params.penalty_present,
-                params.penalize_nl,
-                params.ignore_eos));
+                    llama_n_vocab  (model),
+                    llama_token_eos(model),
+                    llama_token_nl (model),
+                    params.penalty_last_n,
+                    params.penalty_repeat,
+                    params.penalty_freq,
+                    params.penalty_present,
+                    params.penalize_nl,
+                    params.ignore_eos));
 
     if (params.mirostat == 0) {
         for (const auto & cnstr : params.samplers) {
@@ -252,6 +263,16 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
     llama_sampler_reset(gsmpl->chain);
 }
 
+void common_sampler_reinit_grammar(struct common_sampler * gsmpl, const struct llama_model * model, const char * grammar) {
+    llama_sampler_free(gsmpl->grmr); // free the old grammar sampler so it is not leaked
+
+    gsmpl->grmr = llama_sampler_init_grammar(model, grammar, "root");
+}
+
+void common_sampler_reset_grammar(struct common_sampler * gsmpl) {
+    llama_sampler_reset(gsmpl->grmr);
+}
+
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
     return new common_sampler {
         /* .params = */ gsmpl->params,
@@ -405,6 +426,21 @@ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_
     return result;
 }
 
+const std::vector<llama_token> common_sampler_prev(common_sampler * gsmpl) {
+    return gsmpl->prev.to_vector();
+}
+
+void common_sampler_rollback(common_sampler * gsmpl, int rollback_num) {
+    if (rollback_num > (int) gsmpl->prev.size()) {
+        rollback_num = gsmpl->prev.size();
+    }
+
+    // continuously pop the last token
+    for (int i = 0; i < rollback_num; i++) {
+        gsmpl->prev.pop_back();
+    }
+}
+
 char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
     switch (cnstr) {
         case COMMON_SAMPLER_TYPE_DRY: return 'd';
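
The new ring_buffer::pop_back and the rollback helpers exist so a caller can rewind the sampler's recent-token history, for example after discarding a rejected continuation. A hedged sketch of the intended call pattern; the wrapper function is hypothetical, and only the common_sampler_* calls come from this commit:

// hypothetical backtracking step for an initialized common_sampler
#include <vector>
#include "sampling.h"

void backtrack_last_tokens(common_sampler * gsmpl, int n_bad) {
    // drop the last n_bad tokens from the sampler's history;
    // rollback clamps n_bad to the number of tokens actually stored
    common_sampler_rollback(gsmpl, n_bad);

    // the remaining accepted tokens, oldest first
    const std::vector<llama_token> prev = common_sampler_prev(gsmpl);
    // ... re-evaluate or truncate the llama_context to prev.size() tokens
}

Note that this rewinds only the sampler-side history; tokens already evaluated in the llama_context must be removed separately (for example with llama_kv_cache_seq_rm).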

common/sampling.h

Lines changed: 4 additions & 0 deletions
@@ -43,6 +43,8 @@ void common_sampler_free(struct common_sampler * gsmpl);
 // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
 void common_sampler_reset (struct common_sampler * gsmpl);
+void common_sampler_reinit_grammar(struct common_sampler * gsmpl, const struct llama_model * model, const char * grammar);
+void common_sampler_reset_grammar(struct common_sampler * gsmpl);
 struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 
 // arguments can be nullptr to skip printing
@@ -96,6 +98,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl);
 
 // get a string representation of the last accepted tokens
 std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
+const std::vector<llama_token> common_sampler_prev(common_sampler * gsmpl);
+void common_sampler_rollback(common_sampler * gsmpl, int rollback_num);
 
 char common_sampler_type_to_chr(enum common_sampler_type cnstr);
 std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
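
A plausible use for common_sampler_reinit_grammar declared here is swapping the active grammar between generation phases without rebuilding the whole sampler chain. A sketch under that assumption; the GBNF string is illustrative and not from this commit:

// hypothetical: constrain the next generation phase with a fresh grammar
#include "sampling.h"

void switch_grammar(common_sampler * gsmpl, const llama_model * model) {
    // toy GBNF rule; any valid grammar text could be passed here
    const char * grammar = "root ::= \"{\" [^}]* \"}\"";
    common_sampler_reinit_grammar(gsmpl, model, grammar);
}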

ggml/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
@@ -234,6 +234,13 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-vulkan.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
+
+# link android log library
+if(ANDROID)
+    find_library(log-lib log)
+    target_link_libraries(ggml PRIVATE ${log-lib})
+endif()
+
 #if (GGML_METAL)
 #    set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
 #endif()

ggml/src/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
@@ -224,6 +224,14 @@ add_library(ggml-base
             ggml-aarch64.c
             ggml-aarch64.h)
 
+# Search for the 'log' library on Android
+if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
+    find_library(log-lib log)
+    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${log-lib})
+
+    target_link_libraries(ggml-base PUBLIC ${GGML_EXTRA_LIBS})
+endif()
+
 target_include_directories(ggml-base PRIVATE .)
 
 add_library(ggml
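
Both CMake hunks address the same Android link failure: anything that calls the NDK logging facility must link against liblog. As a hedged illustration of the kind of call site that creates the dependency (routing ggml/llama log output to logcat via llama_log_set is an assumption about the host app, not something this commit adds):

// hypothetical Android host-app code that pulls in liblog
#include <android/log.h>
#include "llama.h"

static void logcat_sink(ggml_log_level level, const char * text, void * /* user_data */) {
    const int prio = (level == GGML_LOG_LEVEL_ERROR) ? ANDROID_LOG_ERROR
                   : (level == GGML_LOG_LEVEL_WARN)  ? ANDROID_LOG_WARN
                   :                                   ANDROID_LOG_INFO;
    __android_log_write(prio, "llama", text); // __android_log_write lives in liblog
}

// during app startup:
//     llama_log_set(logcat_sink, nullptr);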

ggml/src/ggml-aarch64.c

Lines changed: 1 addition & 1 deletion
@@ -126,4 +126,4 @@ size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_
 size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     UNUSED(quant_weights);
     return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
-}
+}

ggml/src/ggml-quants.c

Lines changed: 1 addition & 1 deletion
@@ -5244,4 +5244,4 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
     }
 
     return true;
-}
+}
