From 3fb52db5b76bd3ef5f827163bf87611e8425b251 Mon Sep 17 00:00:00 2001
From: sakirr
Date: Thu, 19 Feb 2026 17:49:48 +0000
Subject: [PATCH 1/3] fix: clear KV cache and reset batch state between
 sequential decode calls on arm64

---
 .../src/backends/llamacpp/llamacpp_backend.cpp | 7 +++++++
 .../src/backends/llamacpp/rac_llm_llamacpp.cpp | 5 +++++
 .../src/jni/runanywhere_commons_jni.cpp        | 8 ++++++++
 3 files changed, 20 insertions(+)

diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
index fd546712d..0feb00468 100644
--- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
+++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
@@ -548,6 +548,13 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
         return false;
     }
 
+    // Clear KV cache before each new generation to avoid position conflicts on
+    // sequential calls (fixes #356: SIGABRT on second decode on Android arm64).
+    llama_memory_t mem = llama_get_memory(context_);
+    if (mem) {
+        llama_memory_clear(mem, true);
+    }
+
     cancel_requested_.store(false);
 
     std::string prompt = build_prompt(request);
diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp
index ac6b40955..03d37dc22 100644
--- a/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp
+++ b/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp
@@ -207,6 +207,11 @@ rac_result_t rac_llm_llamacpp_generate(rac_handle_t handle, const char* prompt,
     }
     RAC_LOG_INFO("LLM.LlamaCpp", "rac_llm_llamacpp_generate: generate() returned, tokens=%d", result.tokens_generated);
 
+    if (result.finish_reason == "error") {
+        RAC_LOG_ERROR("LLM.LlamaCpp", "rac_llm_llamacpp_generate: generation failed (e.g. llama_decode error)");
+        return RAC_ERROR_GENERATION_FAILED;
+    }
+
     // Fill RAC result struct
     out_result->text = result.text.empty() ? nullptr : strdup(result.text.c_str());
     out_result->completion_tokens = result.tokens_generated;
diff --git a/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp b/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
index 44a90df41..0997c7477 100644
--- a/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
+++ b/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
@@ -567,6 +567,14 @@ Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentGenerate
 
     if (status != RAC_SUCCESS) {
         LOGe("racLlmComponentGenerate failed with status=%d", status);
+        const char* msg = rac_error_message(status);
+        if (msg && *msg) {
+            jclass exClass = env->FindClass("java/lang/RuntimeException");
+            if (exClass) {
+                env->ThrowNew(exClass, msg);
+                env->DeleteLocalRef(exClass);
+            }
+        }
         return nullptr;
     }
 

From 2473e1bb3459dda223a058ce01006757f1504f60 Mon Sep 17 00:00:00 2001
From: sakirr
Date: Fri, 20 Feb 2026 00:18:26 +0000
Subject: [PATCH 2/3] fix: address bot review comments - null guard, decode
 failure flag, error details, and JNI exception fallback

---
 .../backends/llamacpp/llamacpp_backend.cpp    | 10 +++++--
 .../src/backends/llamacpp/llamacpp_backend.h  |  1 +
 .../backends/llamacpp/rac_llm_llamacpp.cpp    |  2 ++
 .../src/jni/runanywhere_commons_jni.cpp       | 26 +++++++++++++++----
 4 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
index 0feb00468..8295d75d2 100644
--- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
+++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
@@ -529,7 +529,9 @@ TextGenerationResult LlamaCppTextGeneration::generate(const TextGenerationReques
     result.prompt_tokens = prompt_tokens;
     result.inference_time_ms = duration.count();
 
-    if (cancel_requested_.load()) {
+    if (decode_failed_) {
+        result.finish_reason = "error";
+    } else if (cancel_requested_.load()) {
         result.finish_reason = "cancelled";
     } else if (success) {
         result.finish_reason = tokens_generated >= request.max_tokens ? "length" : "stop";
@@ -556,6 +558,7 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
     }
 
     cancel_requested_.store(false);
+    decode_failed_ = false;
 
     std::string prompt = build_prompt(request);
     LOGI("Generating with prompt length: %zu", prompt.length());
@@ -724,6 +727,7 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
 
         if (llama_decode(context_, batch) != 0) {
             LOGE("llama_decode failed during generation");
+            decode_failed_ = true;
             break;
         }
     }
@@ -732,7 +736,9 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
         callback(stop_window);
     }
 
-    llama_memory_clear(llama_get_memory(context_), true);
+    if (llama_memory_t post_mem = llama_get_memory(context_)) {
+        llama_memory_clear(post_mem, true);
+    }
 
     llama_batch_free(batch);
 
diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h
index 1387d5491..8b7d3cb33 100644
--- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h
+++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h
@@ -134,6 +134,7 @@ class LlamaCppTextGeneration {
 
     bool model_loaded_ = false;
     std::atomic<bool> cancel_requested_{false};
+    bool decode_failed_ = false;
 
     std::string model_path_;
    nlohmann::json model_config_;
diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp
index 03d37dc22..babfbf286 100644
--- a/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp
+++ b/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp
@@ -207,8 +207,10 @@ rac_result_t rac_llm_llamacpp_generate(rac_handle_t handle, const char* prompt,
     }
     RAC_LOG_INFO("LLM.LlamaCpp", "rac_llm_llamacpp_generate: generate() returned, tokens=%d", result.tokens_generated);
 
+    // finish_reason is std::string; TODO: migrate to enum if TextGenerationResult gains one
     if (result.finish_reason == "error") {
         RAC_LOG_ERROR("LLM.LlamaCpp", "rac_llm_llamacpp_generate: generation failed (e.g. llama_decode error)");
+        rac_error_set_details("Generation failed: llama_decode returned non-zero");
         return RAC_ERROR_GENERATION_FAILED;
     }
 
diff --git a/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp b/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
index 0997c7477..fa4b69da0 100644
--- a/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
+++ b/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
@@ -17,6 +17,7 @@
 #include
 #include
+#include <cstdio>
 #include
 #include
 #include
@@ -567,13 +568,17 @@ Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentGenerate
 
     if (status != RAC_SUCCESS) {
         LOGe("racLlmComponentGenerate failed with status=%d", status);
+        rac_llm_result_free(&result);
         const char* msg = rac_error_message(status);
-        if (msg && *msg) {
-            jclass exClass = env->FindClass("java/lang/RuntimeException");
-            if (exClass) {
-                env->ThrowNew(exClass, msg);
-                env->DeleteLocalRef(exClass);
+        jclass exClass = env->FindClass("java/lang/RuntimeException");
+        if (exClass) {
+            char fallback[64];
+            if (!msg || !*msg) {
+                snprintf(fallback, sizeof(fallback), "LLM generation failed (status=%d)", status);
+                msg = fallback;
             }
+            env->ThrowNew(exClass, msg);
+            env->DeleteLocalRef(exClass);
         }
         return nullptr;
     }
@@ -849,6 +854,17 @@ Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentGenerate
 
     if (status != RAC_SUCCESS) {
         LOGe("rac_llm_component_generate_stream failed with status=%d", status);
+        const char* msg = rac_error_message(status);
+        jclass exClass = env->FindClass("java/lang/RuntimeException");
+        if (exClass) {
+            char fallback[64];
+            if (!msg || !*msg) {
+                snprintf(fallback, sizeof(fallback), "LLM stream generation failed (status=%d)", status);
+                msg = fallback;
+            }
+            env->ThrowNew(exClass, msg);
+            env->DeleteLocalRef(exClass);
+        }
         return nullptr;
     }

From 8a163cad18a1dbbf249d481182b164850fee68d4 Mon Sep 17 00:00:00 2001
From: sakirr
Date: Fri, 20 Feb 2026 04:26:11 +0000
Subject: [PATCH 3/3] fix: make decode_failed_ std::atomic for thread safety
 (review)

---
 .../src/backends/llamacpp/llamacpp_backend.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h
index 8b7d3cb33..dc348a595 100644
--- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h
+++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h
@@ -134,7 +134,7 @@ class LlamaCppTextGeneration {
 
     bool model_loaded_ = false;
     std::atomic<bool> cancel_requested_{false};
-    bool decode_failed_ = false;
+    std::atomic<bool> decode_failed_{false};
 
     std::string model_path_;
     nlohmann::json model_config_;