Commit bb26957

cpp: fix memory bug causing leak when reloading models

1 parent 243605e

File tree

3 files changed: +50 -63 lines

smollm/src/main/cpp/CMakeLists.txt
Lines changed: 1 addition & 1 deletion

@@ -100,11 +100,11 @@ function(build_library target_name)
 target_include_directories(
     ${target_name}
     PUBLIC
+    ${COMMON_DIR}
     ${GGML_DIR}/include
     ${GGML_DIR}/src
     ${GGML_DIR}/src/ggml-cpu
     ${LLAMA_DIR}/include
-    ${COMMON_DIR}
     ${VENDOR_DIR}
 )

smollm/src/main/cpp/LLMInference.cpp
Lines changed: 46 additions & 60 deletions
@@ -1,6 +1,4 @@
 #include "LLMInference.h"
-#include "llama.h"
-#include "gguf.h"
 #include <android/log.h>
 #include <cstring>
 #include <iostream>
@@ -9,20 +7,9 @@
 #define LOGi(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
 #define LOGe(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__)

-std::vector<llama_token> common_tokenize(
-        const struct llama_vocab * vocab,
-        const std::string & text,
-        bool add_special,
-        bool parse_special = false);
-
-std::string common_token_to_piece(
-        const struct llama_context * ctx,
-        llama_token token,
-        bool special = true);
-
 void
-LLMInference::loadModel(const char* model_path, float minP, float temperature, bool storeChats, long contextSize,
-                        const char* chatTemplate, int nThreads, bool useMmap, bool useMlock) {
+LLMInference::loadModel(const char *model_path, float minP, float temperature, bool storeChats, long contextSize,
+                        const char *chatTemplate, int nThreads, bool useMmap, bool useMlock) {
     LOGi("loading model with"
          "\n\tmodel_path = %s"
          "\n\tminP = %f"
@@ -35,56 +22,57 @@ LLMInference::loadModel(const char* model_path, float minP, float temperature, b
          "\n\tuseMlock = %d",
          model_path, minP, temperature, storeChats, contextSize, chatTemplate, nThreads, useMmap, useMlock);

+    // load dynamic backends
+    ggml_backend_load_all();
+
     // create an instance of llama_model
     llama_model_params model_params = llama_model_default_params();
-    model_params.use_mmap = useMmap;
-    model_params.use_mlock = useMlock;
-    _model = llama_model_load_from_file(model_path, model_params);
-
+    model_params.use_mmap = useMmap;
+    model_params.use_mlock = useMlock;
+    _model = llama_model_load_from_file(model_path, model_params);
     if (!_model) {
         LOGe("failed to load model from %s", model_path);
         throw std::runtime_error("loadModel() failed");
     }

     // create an instance of llama_context
     llama_context_params ctx_params = llama_context_default_params();
-    ctx_params.n_ctx = contextSize;
-    ctx_params.n_threads = nThreads;
-    ctx_params.no_perf = true; // disable performance metrics
-    _ctx = llama_init_from_model(_model, ctx_params);
-
+    ctx_params.n_ctx = contextSize;
+    ctx_params.n_batch = contextSize;
+    ctx_params.n_threads = nThreads;
+    ctx_params.no_perf = true; // disable performance metrics
+    _ctx = llama_init_from_model(_model, ctx_params);
     if (!_ctx) {
         LOGe("llama_new_context_with_model() returned null)");
         throw std::runtime_error("llama_new_context_with_model() returned null");
     }

-    // initialize sampler
+    // create an instance of llama_sampler
     llama_sampler_chain_params sampler_params = llama_sampler_chain_default_params();
-    sampler_params.no_perf = true; // disable performance metrics
-    _sampler = llama_sampler_chain_init(sampler_params);
-
+    sampler_params.no_perf = true; // disable performance metrics
+    _sampler = llama_sampler_chain_init(sampler_params);
     llama_sampler_chain_add(_sampler, llama_sampler_init_temp(temperature));
     llama_sampler_chain_add(_sampler, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
-    if (minP >= 0.01f) {
-        // minP = 0.0 (disabled)
-        // minP can be adjusted across 100 steps between [0.0,1.0], the smallest step being 0.01
-        llama_sampler_chain_add(_sampler, llama_sampler_init_min_p(minP, 1));
-    }

     _formattedMessages = std::vector<char>(llama_n_ctx(_ctx));
     _messages.clear();
-    _chatTemplate = strdup(chatTemplate);
+
+    if (chatTemplate == nullptr) {
+        _chatTemplate = llama_model_chat_template(_model, nullptr);
+    } else {
+        _chatTemplate = strdup(chatTemplate);
+    }
     this->_storeChats = storeChats;
 }

 void
-LLMInference::addChatMessage(const char* message, const char* role) {
-    _messages.push_back({ strdup(role), strdup(message) });
+LLMInference::addChatMessage(const char *message, const char *role) {
+    _messages.push_back({strdup(role), strdup(message)});
 }

 float
 LLMInference::getResponseGenerationTime() const {
-    return (float)_responseNumTokens / (_responseGenerationTime / 1e6);
+    return (float) _responseNumTokens / (_responseGenerationTime / 1e6);
 }

 int
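A note on the new ctx_params.n_batch setting above: in llama.cpp, n_batch is the maximum number of tokens that llama_decode() accepts in a single call, so matching it to the context size lets the whole formatted prompt be decoded in one batch. A minimal sketch of that constraint; the guard and the names used here are illustrative, not part of this commit:

    // llama_n_batch() reports the logical batch limit configured via ctx_params.n_batch
    if (_promptTokens.size() > llama_n_batch(_ctx)) {
        // a prompt longer than n_batch would have to be split across several llama_decode() calls
        throw std::runtime_error("prompt exceeds n_batch; decode it in chunks");
    }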
@@ -93,19 +81,19 @@ LLMInference::getContextSizeUsed() const {
 }

 void
-LLMInference::startCompletion(const char* query) {
+LLMInference::startCompletion(const char *query) {
     if (!_storeChats) {
         _prevLen = 0;
         _formattedMessages.clear();
         _formattedMessages = std::vector<char>(llama_n_ctx(_ctx));
     }
     _responseGenerationTime = 0;
-    _responseNumTokens = 0;
+    _responseNumTokens = 0;
     addChatMessage(query, "user");
     // apply the chat-template
     int newLen = llama_chat_apply_template(_chatTemplate, _messages.data(), _messages.size(), true,
                                            _formattedMessages.data(), _formattedMessages.size());
-    if (newLen > (int)_formattedMessages.size()) {
+    if (newLen > (int) _formattedMessages.size()) {
         // resize the output buffer `_formattedMessages`
         // and re-apply the chat template
         _formattedMessages.resize(newLen);
@@ -120,19 +108,20 @@ LLMInference::startCompletion(const char* query) {

     // create a llama_batch containing a single sequence
     // see llama_batch_init for more details
-    _batch.token = _promptTokens.data();
-    _batch.n_tokens = _promptTokens.size();
+    _batch = new llama_batch();
+    _batch->token = _promptTokens.data();
+    _batch->n_tokens = _promptTokens.size();
 }

 // taken from:
 // https://github.com/ggerganov/llama.cpp/blob/master/examples/llama.android/llama/src/main/cpp/llama-android.cpp#L38
 bool
-LLMInference::_isValidUtf8(const char* response) {
+LLMInference::_isValidUtf8(const char *response) {
     if (!response) {
         return true;
     }
-    const unsigned char* bytes = (const unsigned char*)response;
-    int num;
+    const unsigned char *bytes = (const unsigned char *) response;
+    int num;
     while (*bytes != 0x00) {
         if ((*bytes & 0x80) == 0x00) {
             // U+0000 to U+007F
@@ -166,14 +155,14 @@ LLMInference::completionLoop() {
     // check if the length of the inputs to the model
     // have exceeded the context size of the model
     uint32_t contextSize = llama_n_ctx(_ctx);
-    _nCtxUsed = llama_kv_self_used_cells(_ctx);
-    if (_nCtxUsed + _batch.n_tokens > contextSize) {
+    _nCtxUsed = llama_memory_seq_pos_max(llama_get_memory(_ctx), 0) + 1;
+    if (_nCtxUsed + _batch->n_tokens > contextSize) {
         throw std::runtime_error("context size reached");
     }

     auto start = ggml_time_us();
     // run the model
-    if (llama_decode(_ctx, _batch) < 0) {
+    if (llama_decode(_ctx, *_batch) < 0) {
         throw std::runtime_error("llama_decode() failed");
     }
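The context-usage check above replaces llama_kv_self_used_cells() with the newer llama.cpp memory API: llama_memory_seq_pos_max() returns the highest position stored for a sequence, or -1 when the sequence is empty, so adding 1 yields the number of positions already occupied. A standalone sketch of the same check, with illustrative variable names:

    // assuming a single sequence with id 0, as completionLoop() uses
    llama_memory_t mem    = llama_get_memory(ctx);
    int32_t usedPositions = llama_memory_seq_pos_max(mem, /*seq_id=*/0) + 1;  // -1 + 1 == 0 for an empty cache
    if (usedPositions + nNewTokens > (int32_t) llama_n_ctx(ctx)) {
        // the new tokens would not fit into the remaining context window
    }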

@@ -186,7 +175,6 @@ LLMInference::completionLoop() {
         return "[EOG]";
     }
     std::string piece = common_token_to_piece(_ctx, _currToken, true);
-    LOGi("common_token_to_piece: %s", piece.c_str());
     auto end = ggml_time_us();
     _responseGenerationTime += (end - start);
     _responseNumTokens += 1;
@@ -195,8 +183,8 @@ LLMInference::completionLoop() {
     // re-init the batch with the newly predicted token
     // key, value pairs of all previous tokens have been cached
     // in the KV cache
-    _batch.token = &_currToken;
-    _batch.n_tokens = 1;
+    _batch->token = &_currToken;
+    _batch->n_tokens = 1;

     if (_isValidUtf8(_cacheResponseTokens.c_str())) {
         _response += _cacheResponseTokens;
@@ -214,23 +202,21 @@ LLMInference::stopCompletion() {
         addChatMessage(_response.c_str(), "assistant");
     }
     _response.clear();
-    const char* tmpl = llama_model_chat_template(_model, nullptr);
-    _prevLen = llama_chat_apply_template(tmpl, _messages.data(), _messages.size(), false, nullptr, 0);
+    _prevLen = llama_chat_apply_template(_chatTemplate, _messages.data(), _messages.size(), false, nullptr, 0);
     if (_prevLen < 0) {
         throw std::runtime_error("llama_chat_apply_template() in LLMInference::stopCompletion() failed");
     }
 }

 LLMInference::~LLMInference() {
-    LOGi("deallocating LLMInference instance");
     // free memory held by the message text in messages
     // (as we had used strdup() to create a malloc'ed copy)
-    for (llama_chat_message& message : _messages) {
-        free(const_cast<char*>(message.role));
-        free(const_cast<char*>(message.content));
+    for (llama_chat_message &message: _messages) {
+        free(const_cast<char *>(message.role));
+        free(const_cast<char *>(message.content));
     }
-    free(const_cast<char*>(_chatTemplate));
-    llama_sampler_free(_sampler);
     llama_free(_ctx);
     llama_model_free(_model);
-}
+    delete _batch;
+    llama_sampler_free(_sampler);
+}
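With the destructor now also releasing the heap-allocated llama_batch and the sampler chain, a load, run, destroy, reload cycle should no longer leak native memory, which is what the commit title refers to. A minimal usage sketch, assuming LLMInference can be default-constructed and driven directly from C++; the paths and sampling values below are illustrative:

    #include "LLMInference.h"

    int main() {
        auto *llm = new LLMInference();
        llm->loadModel("/path/to/model-a.gguf", /*minP=*/0.05f, /*temperature=*/0.8f,
                       /*storeChats=*/true, /*contextSize=*/4096, /*chatTemplate=*/nullptr,
                       /*nThreads=*/4, /*useMmap=*/true, /*useMlock=*/false);
        // ... startCompletion() / completionLoop() / stopCompletion() ...
        delete llm;   // ~LLMInference() frees the context, model, batch and sampler

        // reloading: a fresh instance shares no native state with the previous one
        llm = new LLMInference();
        llm->loadModel("/path/to/model-b.gguf", 0.05f, 0.8f, true, 4096, nullptr, 4, true, false);
        // ... run another chat ...
        delete llm;
        return 0;
    }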

smollm/src/main/cpp/LLMInference.h
Lines changed: 3 additions & 2 deletions
@@ -1,5 +1,6 @@
+#pragma once
 #include "llama.h"
-#include <jni.h>
+#include "common.h"
 #include <string>
 #include <vector>

@@ -9,7 +10,7 @@ class LLMInference {
     llama_model* _model;
     llama_sampler* _sampler;
     llama_token _currToken;
-    llama_batch _batch;
+    llama_batch* _batch;

     // container to store user/assistant messages in the chat
     std::vector<llama_chat_message> _messages;
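The header now pulls in "common.h" (made resolvable by the COMMON_DIR include-path change in CMakeLists.txt), which is what lets LLMInference.cpp drop its hand-written forward declarations of common_tokenize() and common_token_to_piece(). A minimal sketch of calling those helpers directly, assuming a llama_vocab and llama_context obtained elsewhere:

    #include "common.h"   // declares common_tokenize() and common_token_to_piece()
    #include "llama.h"
    #include <string>
    #include <vector>

    // tokenize a prompt and convert it back to text, piece by piece
    static std::string roundTrip(const llama_vocab *vocab, llama_context *ctx, const std::string &text) {
        std::vector<llama_token> tokens = common_tokenize(vocab, text, /*add_special=*/true, /*parse_special=*/true);
        std::string out;
        for (llama_token token : tokens) {
            out += common_token_to_piece(ctx, token, /*special=*/true);
        }
        return out;
    }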
