From 3fb52db5b76bd3ef5f827163bf87611e8425b251 Mon Sep 17 00:00:00 2001
From: sakirr
Date: Thu, 19 Feb 2026 17:49:48 +0000
Subject: [PATCH 1/3] fix: clear KV cache and reset batch state between
 sequential decode calls on arm64

---
 .../src/backends/llamacpp/llamacpp_backend.cpp | 7 +++++++
 .../src/backends/llamacpp/rac_llm_llamacpp.cpp | 5 +++++
 .../src/jni/runanywhere_commons_jni.cpp        | 8 ++++++++
 3 files changed, 20 insertions(+)

diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
index fd546712d..0feb00468 100644
--- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
+++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
@@ -548,6 +548,13 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
         return false;
     }
 
+    // Clear KV cache before each new generation to avoid position conflicts on
+    // sequential calls (fixes #356: SIGABRT on second decode on Android arm64).
+    llama_memory_t mem = llama_get_memory(context_);
+    if (mem) {
+        llama_memory_clear(mem, true);
+    }
+
     cancel_requested_.store(false);
 
     std::string prompt = build_prompt(request);
diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp
index ac6b40955..03d37dc22 100644
--- a/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp
+++ b/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp
@@ -207,6 +207,11 @@ rac_result_t rac_llm_llamacpp_generate(rac_handle_t handle, const char* prompt,
     }
     RAC_LOG_INFO("LLM.LlamaCpp", "rac_llm_llamacpp_generate: generate() returned, tokens=%d", result.tokens_generated);
 
+    if (result.finish_reason == "error") {
+        RAC_LOG_ERROR("LLM.LlamaCpp", "rac_llm_llamacpp_generate: generation failed (e.g. llama_decode error)");
+        return RAC_ERROR_GENERATION_FAILED;
+    }
+
     // Fill RAC result struct
     out_result->text = result.text.empty() ? nullptr : strdup(result.text.c_str());
     out_result->completion_tokens = result.tokens_generated;
diff --git a/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp b/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
index 44a90df41..0997c7477 100644
--- a/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
+++ b/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
@@ -567,6 +567,14 @@ Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentGenerate
 
     if (status != RAC_SUCCESS) {
         LOGe("racLlmComponentGenerate failed with status=%d", status);
+        const char* msg = rac_error_message(status);
+        if (msg && *msg) {
+            jclass exClass = env->FindClass("java/lang/RuntimeException");
+            if (exClass) {
+                env->ThrowNew(exClass, msg);
+                env->DeleteLocalRef(exClass);
+            }
+        }
         return nullptr;
     }
 

From 2473e1bb3459dda223a058ce01006757f1504f60 Mon Sep 17 00:00:00 2001
From: sakirr
Date: Fri, 20 Feb 2026 00:18:26 +0000
Subject: [PATCH 2/3] fix: address bot review comments - null guard, decode
 failure flag, error details, and JNI exception fallback

---
 .../backends/llamacpp/llamacpp_backend.cpp    | 10 +++++--
 .../src/backends/llamacpp/llamacpp_backend.h  |  1 +
 .../backends/llamacpp/rac_llm_llamacpp.cpp    |  2 ++
 .../src/jni/runanywhere_commons_jni.cpp       | 26 +++++++++++++++----
 4 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
index 0feb00468..8295d75d2 100644
--- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
+++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
@@ -529,7 +529,9 @@ TextGenerationResult LlamaCppTextGeneration::generate(const TextGenerationReques
     result.prompt_tokens = prompt_tokens;
     result.inference_time_ms = duration.count();
 
-    if (cancel_requested_.load()) {
+    if (decode_failed_) {
+        result.finish_reason = "error";
+    } else if (cancel_requested_.load()) {
         result.finish_reason = "cancelled";
     } else if (success) {
         result.finish_reason = tokens_generated >= request.max_tokens ? "length" : "stop";
@@ -556,6 +558,7 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
     }
 
     cancel_requested_.store(false);
+    decode_failed_ = false;
 
     std::string prompt = build_prompt(request);
     LOGI("Generating with prompt length: %zu", prompt.length());
@@ -724,6 +727,7 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
 
         if (llama_decode(context_, batch) != 0) {
             LOGE("llama_decode failed during generation");
+            decode_failed_ = true;
             break;
         }
     }
@@ -732,7 +736,9 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
         callback(stop_window);
     }
 
-    llama_memory_clear(llama_get_memory(context_), true);
+    if (llama_memory_t post_mem = llama_get_memory(context_)) {
+        llama_memory_clear(post_mem, true);
+    }
 
     llama_batch_free(batch);
 
diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h
index 1387d5491..8b7d3cb33 100644
--- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h
+++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h
@@ -134,6 +134,7 @@ class LlamaCppTextGeneration {
 
     bool model_loaded_ = false;
     std::atomic<bool> cancel_requested_{false};
+    bool decode_failed_ = false;
 
     std::string model_path_;
    nlohmann::json model_config_;
diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp
index 03d37dc22..babfbf286 100644
--- a/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp
+++ b/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp
@@ -207,8 +207,10 @@ rac_result_t rac_llm_llamacpp_generate(rac_handle_t handle, const char* prompt,
     }
     RAC_LOG_INFO("LLM.LlamaCpp", "rac_llm_llamacpp_generate: generate() returned, tokens=%d", result.tokens_generated);
 
+    // finish_reason is std::string; TODO: migrate to enum if TextGenerationResult gains one
     if (result.finish_reason == "error") {
         RAC_LOG_ERROR("LLM.LlamaCpp", "rac_llm_llamacpp_generate: generation failed (e.g. llama_decode error)");
+        rac_error_set_details("Generation failed: llama_decode returned non-zero");
         return RAC_ERROR_GENERATION_FAILED;
     }
 
diff --git a/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp b/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
index 0997c7477..fa4b69da0 100644
--- a/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
+++ b/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
@@ -17,6 +17,7 @@
 #include
 #include
+#include <cstdio>
 #include
 #include
 #include
@@ -567,13 +568,17 @@ Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentGenerate
 
     if (status != RAC_SUCCESS) {
         LOGe("racLlmComponentGenerate failed with status=%d", status);
+        rac_llm_result_free(&result);
         const char* msg = rac_error_message(status);
-        if (msg && *msg) {
-            jclass exClass = env->FindClass("java/lang/RuntimeException");
-            if (exClass) {
-                env->ThrowNew(exClass, msg);
-                env->DeleteLocalRef(exClass);
+        jclass exClass = env->FindClass("java/lang/RuntimeException");
+        if (exClass) {
+            char fallback[64];
+            if (!msg || !*msg) {
+                snprintf(fallback, sizeof(fallback), "LLM generation failed (status=%d)", status);
+                msg = fallback;
             }
+            env->ThrowNew(exClass, msg);
+            env->DeleteLocalRef(exClass);
         }
         return nullptr;
     }
@@ -849,6 +854,17 @@ Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentGenerate
 
     if (status != RAC_SUCCESS) {
         LOGe("rac_llm_component_generate_stream failed with status=%d", status);
+        const char* msg = rac_error_message(status);
+        jclass exClass = env->FindClass("java/lang/RuntimeException");
+        if (exClass) {
+            char fallback[64];
+            if (!msg || !*msg) {
+                snprintf(fallback, sizeof(fallback), "LLM stream generation failed (status=%d)", status);
+                msg = fallback;
+            }
+            env->ThrowNew(exClass, msg);
+            env->DeleteLocalRef(exClass);
+        }
         return nullptr;
     }

From 8a163cad18a1dbbf249d481182b164850fee68d4 Mon Sep 17 00:00:00 2001
From: sakirr
Date: Fri, 20 Feb 2026 04:26:11 +0000
Subject: [PATCH 3/3] fix: make decode_failed_ std::atomic for thread safety
 (review)

---
 .../src/backends/llamacpp/llamacpp_backend.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h
index 8b7d3cb33..dc348a595 100644
--- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h
+++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h
@@ -134,7 +134,7 @@ class LlamaCppTextGeneration {
 
     bool model_loaded_ = false;
     std::atomic<bool> cancel_requested_{false};
-    bool decode_failed_ = false;
+    std::atomic<bool> decode_failed_{false};
 
     std::string model_path_;
     nlohmann::json model_config_;