diff --git a/common/arg.cpp b/common/arg.cpp
index 33ed7ae857269..a25743c899862 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3435,7 +3435,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index ed99dc8477231..7b49969c02149 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -29,12 +29,29 @@
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 from gguf.vocab import MistralTokenizerType, MistralVocab
-from mistral_common.tokens.tokenizers.base import TokenizerVersion
-from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD
-from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-from mistral_common.tokens.tokenizers.sentencepiece import (
-    SentencePieceTokenizer,
-)
+
+try:
+    from mistral_common.tokens.tokenizers.base import TokenizerVersion  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.tekken import Tekkenizer  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.sentencepiece import (  # pyright: ignore[reportMissingImports]
+        SentencePieceTokenizer,
+    )
+
+    _mistral_common_installed = True
+    _mistral_import_error_msg = ""
+except ImportError:
+    _MISTRAL_COMMON_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+    _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+    _mistral_common_installed = False
+    TokenizerVersion = None
+    Tekkenizer = None
+    SentencePieceTokenizer = None
+    _mistral_import_error_msg = (
+        "Mistral format requires `mistral-common` to be installed. Please run "
+        "`pip install mistral-common[image,audio]` to install it."
+    )
 
 
 logger = logging.getLogger("hf-to-gguf")
@@ -107,6 +124,9 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
                 type(self) is MmprojModel:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
 
+        if self.is_mistral_format and not _mistral_common_installed:
+            raise ImportError(_mistral_import_error_msg)
+
         self.dir_model = dir_model
         self.ftype = ftype
         self.fname_out = fname_out
@@ -1363,8 +1383,8 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
 
         # preprocessor config
-        image_mean = DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
-        image_std = DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
+        image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
+        image_std = _MISTRAL_COMMON_DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
 
         self.gguf_writer.add_vision_image_mean(image_mean)
         self.gguf_writer.add_vision_image_std(image_std)
@@ -2033,6 +2053,9 @@ def __init__(self, *args, **kwargs):
         self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
 
     def _set_vocab_mistral(self):
+        if not _mistral_common_installed:
+            raise ImportError(_mistral_import_error_msg)
+
         vocab = MistralVocab(self.dir_model)
         logger.info(
             f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
         )
@@ -9212,7 +9235,7 @@ class MistralModel(LlamaModel):
 
     @staticmethod
     def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
-        assert TokenizerVersion is not None, "mistral_common is not installed"
+        assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
         assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), (
             f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}"
         )
@@ -9594,6 +9617,8 @@ def main() -> None:
         fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")
 
     is_mistral_format = args.mistral_format
+    if is_mistral_format and not _mistral_common_installed:
+        raise ImportError(_mistral_import_error_msg)
     disable_mistral_community_chat_template = args.disable_mistral_community_chat_template
 
     with torch.inference_mode():
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 6e7c5aedbc55a..f5a6a751acfd5 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2826,7 +2826,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);
 
     if (ops.size() == topk_moe_ops_with_norm.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_with_norm, { node_idx + 3, node_idx + 8 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 8 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx];
         ggml_tensor * weights = cgraph->nodes[node_idx+8];
 
@@ -2836,7 +2836,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
 
     if (ops.size() == topk_moe_ops.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops, { node_idx + 3, node_idx + 4 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx];
         ggml_tensor * weights = cgraph->nodes[node_idx+4];
         if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
@@ -2845,7 +2845,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     }
 
     if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_delayed_softmax, { node_idx + 2, node_idx + 5 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2, node_idx + 5 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
         ggml_tensor * weights = cgraph->nodes[node_idx + 5];
 
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 33f9035075ba7..b695ba051b025 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -30,6 +30,9 @@
 #include
 #include
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+# include
+#endif
 #include
 #include "ggml-sycl.h"
@@ -54,6 +57,7 @@ int g_ggml_sycl_disable_optimize = 0;
 int g_ggml_sycl_disable_graph = 0;
 int g_ggml_sycl_disable_dnn = 0;
 int g_ggml_sycl_prioritize_dmmv = 0;
+int g_ggml_sycl_use_async_mem_op = 0;
 
 static ggml_sycl_device_info ggml_sycl_init() {
     ggml_sycl_device_info info = {};
@@ -237,7 +241,20 @@ static void ggml_check_sycl() try {
         fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
 #endif
 */
-
+    // Currently, we only use async malloc / free when graphs are enabled, as this is required for the calls to be
+    // properly recorded. As this SYCL extension matures, it may be beneficial to enable it as the default path and
+    // in other places.
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+    g_ggml_sycl_use_async_mem_op = !g_ggml_sycl_disable_graph;
+    if (g_ggml_sycl_use_async_mem_op) {
+        for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); ++i) {
+            if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) {
+                g_ggml_sycl_use_async_mem_op = 0;
+                break;
+            }
+        }
+    }
+#endif
     if (CHECK_TRY_ERROR(g_all_sycl_device_count =
                         dpct::dev_mgr::instance().device_count()) != 0) {
         initialized = true;
@@ -3031,19 +3048,51 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     }
 }
 
+// Helper functions to unify device memory allocation for both async and sync paths
+static inline void * sycl_ext_malloc_device(dpct::queue_ptr stream, size_t size) {
+    bool use_async = g_ggml_sycl_use_async_mem_op;
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+    if (use_async) {
+        return syclex::async_malloc(*stream, sycl::usm::alloc::device, size);
+    }
+#else
+    // If async allocation extension is not available, use_async should always be false.
+    GGML_ASSERT(!use_async);
+#endif
+    return sycl::malloc(size, *stream, sycl::usm::alloc::device);
+}
+
+static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
+    bool use_async = g_ggml_sycl_use_async_mem_op;
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+    if (use_async) {
+        syclex::async_free(*stream, ptr);
+        return;
+    }
+#else
+    // If async allocation extension is not available, use_async should always be false.
+    GGML_ASSERT(!use_async);
+#endif
+    sycl::free(ptr, *stream);
+}
+
 static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
                             dpct::queue_ptr stream) {
-    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
-    SYCL_CHECK(
-        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
-            .wait()));
+    uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
+
+    sycl::event copy_event;
+    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
+    if (!g_ggml_sycl_use_async_mem_op) {
+        copy_event.wait();
+    }
+
     GGML_ASSERT((size % sizeof(block_q4_0) == 0));
     GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
     int offset_blks = offset / sizeof(block_q4_0);
     auto qs_ptr = data_device + offset_blks * QK4_0 / 2;
     auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
 
-    stream->parallel_for(
+    auto reorder_event = stream->parallel_for(
         size / sizeof(block_q4_0),
         [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
             const block_q4_0* x = (const block_q4_0*)tmp_buf;
@@ -3054,9 +3103,11 @@ static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nr
                 *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
             }
             *(d_ptr + ib) = x[ib].d;
-        }).wait_and_throw();
-
-    sycl::free(tmp_buf, *stream);
+        });
+    if (!g_ggml_sycl_use_async_mem_op) {
+        reorder_event.wait_and_throw();
+    }
+    sycl_ext_free(stream, tmp_buf);
 }
 
 static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
@@ -3065,14 +3116,19 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
 
     const int nblocks = size / sizeof(block_q4_K);
 
-    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
-    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
+    uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
+
+    sycl::event copy_event;
+    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
+    if (!g_ggml_sycl_use_async_mem_op) {
+        copy_event.wait();
+    }
 
     auto * qs_ptr = data_device;
     auto * scales_ptr = qs_ptr + QK_K / 2 * nblocks;
     auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
 
-    stream->parallel_for(nblocks, [=](auto i) {
+    auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
         const block_q4_K * x = (const block_q4_K *) tmp_buf;
         const int ib = i;
@@ -3085,9 +3141,11 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
         }
 
         dm_ptr[ib] = x[ib].dm;
-    }).wait_and_throw();
-
-    sycl::free(tmp_buf, *stream);
+    });
+    if (!g_ggml_sycl_use_async_mem_op) {
+        reorder_event.wait_and_throw();
+    }
+    sycl_ext_free(stream, tmp_buf);
 }
 
 static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
@@ -3096,42 +3154,46 @@ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, d
 
     const int nblocks = size / sizeof(block_q6_K);
 
-    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
-    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
+    uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
+
+    sycl::event copy_event;
+    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
+    if (!g_ggml_sycl_use_async_mem_op) {
+        copy_event.wait();
+    }
 
     auto * ql_ptr = data_device;
     auto * qh_ptr = ql_ptr + (QK_K / 2) * nblocks;
     auto * scales_ptr = qh_ptr + (QK_K / 4) * nblocks;
     sycl::half * dm_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * nblocks);
 
-    stream
-        ->parallel_for(nblocks,
-                       [=](auto i) {
-                           const block_q6_K * x = (const block_q6_K *) tmp_buf;
-                           const int ib = i;
-
-                           const uint8_t * ql = x[ib].ql;
-                           const uint8_t * qh = x[ib].qh;
-                           uint8_t * base_ql_ptr = ql_ptr + (QK_K / 2) * ib;
-                           uint8_t * base_qh_ptr = qh_ptr + (QK_K / 4) * ib;
-                           uint8_t * base_scales_ptr = scales_ptr + (QK_K / 16) * ib;
+    auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
+        const block_q6_K * x = (const block_q6_K *) tmp_buf;
+        const int ib = i;
 
-                           for (int j = 0; j < QK_K / 2; ++j) {
-                               base_ql_ptr[j] = ql[j];
-                           }
-                           for (int j = 0; j < QK_K / 4; ++j) {
-                               base_qh_ptr[j] = qh[j];
-                           }
+        const uint8_t * ql = x[ib].ql;
+        const uint8_t * qh = x[ib].qh;
+        uint8_t * base_ql_ptr = ql_ptr + (QK_K / 2) * ib;
+        uint8_t * base_qh_ptr = qh_ptr + (QK_K / 4) * ib;
+        uint8_t * base_scales_ptr = scales_ptr + (QK_K / 16) * ib;
 
-                           for (int j = 0; j < QK_K / 16; ++j) {
-                               base_scales_ptr[j] = x[ib].scales[j];
-                           }
+        for (int j = 0; j < QK_K / 2; ++j) {
+            base_ql_ptr[j] = ql[j];
+        }
+        for (int j = 0; j < QK_K / 4; ++j) {
+            base_qh_ptr[j] = qh[j];
+        }
 
-                           dm_ptr[ib] = x[ib].d;
-                       })
-        .wait_and_throw();
+        for (int j = 0; j < QK_K / 16; ++j) {
+            base_scales_ptr[j] = x[ib].scales[j];
+        }
 
-    sycl::free(tmp_buf, *stream);
+        dm_ptr[ib] = x[ib].d;
+    });
+    if (!g_ggml_sycl_use_async_mem_op) {
+        reorder_event.wait_and_throw();
+    }
+    sycl_ext_free(stream, tmp_buf);
 }
 
 static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
@@ -4056,6 +4118,18 @@ static bool check_graph_compatibility(ggml_cgraph * cgraph) {
                 GGML_LOG_INFO("%s: disabling SYCL graphs due to unsupported node type %s\n", __func__, ggml_op_name(node_op));
                 return false;
+            case GGML_OP_MUL_MAT:
+                // We cannot use graphs with ggml_sycl_mul_mat() when the SYCL async memory allocation extension is not
+                // available, as the reorder path relies on SYCL malloc / free and host wait calls, which are not
+                // supported when recording to a graph.
+                if (!g_ggml_sycl_use_async_mem_op) {
+                    GGML_LOG_INFO(
+                        "%s: disabling SYCL graphs due to unsupported node type when using a compiler without the "
+                        "oneAPI async memory allocation extension "
+                        "%s\n",
+                        __func__, ggml_op_name(node_op));
+                    return false;
+                }
         }
     }
     return true;
 }
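
For reviewers, a minimal sketch (not part of the patch) of the allocation pattern the new SYCL helpers are meant to enforce. The function name reorder_sketch is hypothetical and stands in for the real reorder_qw_* kernels; it assumes the sycl_ext_malloc_device / sycl_ext_free helpers and the g_ggml_sycl_use_async_mem_op flag introduced above, plus the file's usual dpct/SYCL context.

// Illustrative only: allocate, copy, launch, free, with host waits issued only on the
// synchronous fallback path. On the async path nothing blocks, so the whole sequence
// can be recorded into a SYCL graph.
static void reorder_sketch(uint8_t * data_device, size_t size, dpct::queue_ptr stream) {
    // async_malloc when graphs + extension are available, plain USM malloc otherwise
    uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));

    sycl::event copy_event = stream->memcpy(tmp_buf, data_device, size);
    if (!g_ggml_sycl_use_async_mem_op) {
        copy_event.wait();
    }

    auto kernel_event = stream->parallel_for(size, [=](auto i) {
        const size_t idx = i;
        data_device[idx] = tmp_buf[idx]; // placeholder for the actual reorder logic
    });
    if (!g_ggml_sycl_use_async_mem_op) {
        kernel_event.wait_and_throw();
    }

    // async_free on the stream, or sycl::free in the sync path
    sycl_ext_free(stream, tmp_buf);
}
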
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index 7111557bfdd8c..5c6817109ba57 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -14,12 +14,12 @@
     SentencePieceProcessor = None
 
 try:
-    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
-    from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-    from mistral_common.tokens.tokenizers.utils import (
+    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.tekken import Tekkenizer  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.utils import (  # pyright: ignore[reportMissingImports]
         _filter_valid_tokenizer_files,
     )
-    from mistral_common.tokens.tokenizers.sentencepiece import (
+    from mistral_common.tokens.tokenizers.sentencepiece import (  # pyright: ignore[reportMissingImports]
         SentencePieceTokenizer,
     )
 except ImportError:
diff --git a/requirements/requirements-convert_hf_to_gguf.txt b/requirements/requirements-convert_hf_to_gguf.txt
index 90c98c3ffe526..122b4788d9199 100644
--- a/requirements/requirements-convert_hf_to_gguf.txt
+++ b/requirements/requirements-convert_hf_to_gguf.txt
@@ -1,5 +1,3 @@
-mistral-common>=1.8.3
-
 -r ./requirements-convert_legacy_llama.txt
 
 --extra-index-url https://download.pytorch.org/whl/cpu
diff --git a/tools/imatrix/CMakeLists.txt b/tools/imatrix/CMakeLists.txt
index 22f2fe5fdb828..5af6263f9851d 100644
--- a/tools/imatrix/CMakeLists.txt
+++ b/tools/imatrix/CMakeLists.txt
@@ -6,3 +6,8 @@ target_compile_features(${TARGET} PRIVATE cxx_std_17)
 if(LLAMA_TOOLS_INSTALL)
     install(TARGETS ${TARGET} RUNTIME)
 endif()
+
+if (CMAKE_SYSTEM_NAME MATCHES "AIX")
+    # AIX's flock() function comes from libbsd.a
+    target_link_libraries(${TARGET} PRIVATE -lbsd)
+endif()
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 5fde6ca0c32ae..fd1fb6581b163 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -76,9 +76,11 @@ struct mtmd_cli_context {
 
     mtmd::bitmaps bitmaps;
 
-    // note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
-    // so here we don't need to keep track of chat history
+    // chat template
     common_chat_templates_ptr tmpls;
+    std::vector<common_chat_msg> chat_history;
+    bool use_jinja = false;
+    // TODO: support for --system-prompt with /clear command
 
     // support for legacy templates (models not having EOT token)
     llama_tokens antiprompt_tokens;
@@ -108,6 +110,8 @@ struct mtmd_cli_context {
         }
 
         tmpls = common_chat_templates_init(model, params.chat_template);
+        use_jinja = params.use_jinja;
+        chat_history.clear();
         LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());
 
         init_vision_context(params);
@@ -193,19 +197,33 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) {
             return 1;
         }
     }
+
+    std::string generated_text = common_detokenize(ctx.lctx, generated_tokens);
+    common_chat_msg msg;
+    msg.role    = "assistant";
+    msg.content = generated_text;
+    ctx.chat_history.push_back(std::move(msg));
+
     return 0;
 }
 
-static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_bos = false) {
-    common_chat_templates_inputs tmpl_inputs;
-    tmpl_inputs.messages = {msg};
-    tmpl_inputs.add_generation_prompt = true;
-    tmpl_inputs.use_jinja = false; // jinja is buggy here
-    auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
-    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
+static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & new_msg) {
+    LOG_DBG("chat_add_and_format: new_msg.role='%s', new_msg.content='%s'\n",
+            new_msg.role.c_str(), new_msg.content.c_str());
+    auto formatted = common_chat_format_single(ctx.tmpls.get(), ctx.chat_history,
+                                               new_msg, new_msg.role == "user",
+                                               ctx.use_jinja);
+    ctx.chat_history.push_back(new_msg);
+    return formatted;
+}
+
+static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
+    bool add_bos = ctx.chat_history.empty();
+    auto formatted_chat = chat_add_and_format(ctx, msg);
+    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
 
     mtmd_input_text text;
-    text.text          = formatted_chat.prompt.c_str();
+    text.text          = formatted_chat.c_str();
     text.add_special   = add_bos;
     text.parse_special = true;
@@ -303,7 +321,7 @@ int main(int argc, char ** argv) {
                 return 1; // error is already printed by libmtmd
             }
         }
-        if (eval_message(ctx, msg, true)) {
+        if (eval_message(ctx, msg)) {
             return 1;
         }
         if (!g_is_interrupted && generate_response(ctx, n_predict)) {
@@ -322,7 +340,6 @@ int main(int argc, char ** argv) {
         LOG("\n /quit or /exit   exit the program");
         LOG("\n");
 
-        bool is_first_msg = true;
        std::string content;
 
         while (!g_is_interrupted) {
@@ -342,7 +359,8 @@
             }
             if (line == "/clear") {
                 ctx.n_past = 0;
-                llama_memory_seq_rm(llama_get_memory(ctx.lctx), 0, 1, -1); // keep BOS
+                ctx.chat_history.clear();
+                llama_memory_clear(llama_get_memory(ctx.lctx), true);
                 LOG("Chat history cleared\n\n");
                 continue;
             }
@@ -367,7 +385,7 @@
             common_chat_msg msg;
             msg.role = "user";
             msg.content = content;
-            int ret = eval_message(ctx, msg, is_first_msg);
+            int ret = eval_message(ctx, msg);
             if (ret) {
                 return 1;
             }
@@ -376,7 +394,6 @@
                 return 1;
             }
             content.clear();
-            is_first_msg = false;
         }
     }
     if (g_is_interrupted) LOG("\nInterrupted by user\n");
diff --git a/tools/run/CMakeLists.txt b/tools/run/CMakeLists.txt
index e52294ccc0ef9..6ad7534e290bc 100644
--- a/tools/run/CMakeLists.txt
+++ b/tools/run/CMakeLists.txt
@@ -13,5 +13,11 @@ endif ()
 if(LLAMA_TOOLS_INSTALL)
     install(TARGETS ${TARGET} RUNTIME)
 endif()
+
+if (CMAKE_SYSTEM_NAME MATCHES "AIX")
+    # AIX's flock() function comes from libbsd.a
+    target_link_libraries(${TARGET} PRIVATE -lbsd)
+endif()
+
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 8737fba124d50..4124bffa40f85 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2839,7 +2839,7 @@ struct server_context {
                         slot.generated_text.begin() + pos + stop_pos,
                         slot.generated_text.end());
                     pos = std::min(slot.n_sent_text, slot.generated_text.size());
-                } else if (slot.has_next_token) {
+                } else if (slot.has_next_token && !llama_vocab_is_eog(vocab, result.tok)) {
                     stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false);
                     send_text = stop_pos == std::string::npos;
                 }
@@ -5714,6 +5714,7 @@ int main(int argc, char ** argv) {
 
     clean_up();
     t.join();
+    llama_memory_breakdown_print(ctx_server.ctx);
 
     return 0;
 }
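
A usage note on the mtmd-cli change (illustrative sketch, not part of the patch): the tool now keeps an accumulated chat history and formats only the per-turn delta with common_chat_format_single, mirroring llama-cli. In the sketch below, tmpls, lctx and generated_text stand for state the tool already owns; the helper signatures are the ones the patch calls.

// Illustrative only: the multi-turn flow mtmd-cli now follows instead of formatting
// each turn in isolation.
std::vector<common_chat_msg> chat_history;

// user turn: format the new message against the accumulated history, then record it
common_chat_msg user_msg;
user_msg.role    = "user";
user_msg.content = "describe this image";
std::string prompt = common_chat_format_single(tmpls.get(), chat_history, user_msg,
                                               /* add_ass   */ true,
                                               /* use_jinja */ true);
chat_history.push_back(user_msg);
// ... tokenize and evaluate `prompt`, then sample a response ...

// assistant turn: append the generated text so the next turn sees the full history
common_chat_msg asst_msg;
asst_msg.role    = "assistant";
asst_msg.content = generated_text;
chat_history.push_back(asst_msg);

// "/clear" now resets both the chat history and the whole KV cache
chat_history.clear();
llama_memory_clear(llama_get_memory(lctx), true);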