Skip to content

Commit c4c4cf3

Browse files
committed
Revert "memory : move the recurrent state into the memory context"
This reverts commit 00f115f.
1 parent a0a0d28 commit c4c4cf3

File tree

4 files changed

+18
-26
lines changed

4 files changed

+18
-26
lines changed

src/llama-graph.cpp

Lines changed: 5 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -235,12 +235,6 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
235235
}
236236
}
237237

238-
llm_graph_input_rs::llm_graph_input_rs(const llama_memory_recurrent_context * mctx) :
239-
mctx(mctx),
240-
head(mctx->get_head()),
241-
rs_z(mctx->get_rs_z()) {
242-
}
243-
244238
void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
245239
GGML_UNUSED(ubatch);
246240

@@ -269,8 +263,8 @@ bool llm_graph_input_rs::can_reuse(const llm_graph_params & params) {
269263
res &= s_copy_main->ne[0] == params.ubatch.n_seqs;
270264
res &= s_copy_extra->ne[0] == mctx->get_n_rs() - params.ubatch.n_seqs;
271265

272-
res &= this->head == mctx->get_head();
273-
res &= this->rs_z == mctx->get_rs_z();
266+
res &= head == mctx->get_head();
267+
res &= rs_z == mctx->get_rs_z();
274268

275269
return res;
276270
}
@@ -1906,6 +1900,9 @@ static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
19061900
inp->s_copy_main = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
19071901
inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);
19081902

1903+
inp->head = mctx_cur->get_head();
1904+
inp->rs_z = mctx_cur->get_rs_z();
1905+
19091906
return inp;
19101907
}
19111908

src/llama-graph.h

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -219,7 +219,7 @@ class llm_graph_input_cls : public llm_graph_input_i {
219219

220220
class llm_graph_input_rs : public llm_graph_input_i {
221221
public:
222-
llm_graph_input_rs(const llama_memory_recurrent_context * mctx);
222+
llm_graph_input_rs(const llama_memory_recurrent_context * mctx) : mctx(mctx) {}
223223
virtual ~llm_graph_input_rs() = default;
224224

225225
void set_input(const llama_ubatch * ubatch) override;
@@ -235,9 +235,9 @@ class llm_graph_input_rs : public llm_graph_input_i {
235235

236236
const llama_memory_recurrent_context * mctx;
237237

238-
// need to match for valid graph reuse
239-
const uint32_t head;
240-
const int32_t rs_z;
238+
// used in view offsets, need to match for valid graph reuse
239+
uint32_t head;
240+
int32_t rs_z;
241241
};
242242

243243
class llm_graph_input_cross_embd : public llm_graph_input_i {

src/llama-memory-recurrent.cpp

Lines changed: 7 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1093,15 +1093,12 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
10931093
llama_memory_recurrent_context::llama_memory_recurrent_context(llama_memory_status status) : status(status) {}
10941094

10951095
llama_memory_recurrent_context::llama_memory_recurrent_context(
1096-
llama_memory_recurrent * mem) : status(LLAMA_MEMORY_STATUS_SUCCESS), mem(mem),
1097-
n_rs(mem->size), head(0), rs_z(0), size(mem->size) {
1096+
llama_memory_recurrent * mem) : status(LLAMA_MEMORY_STATUS_SUCCESS), mem(mem), is_full(true) {
10981097
}
10991098

11001099
llama_memory_recurrent_context::llama_memory_recurrent_context(
11011100
llama_memory_recurrent * mem,
1102-
std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), mem(mem), ubatches(std::move(ubatches)),
1103-
n_rs(mem->n), head(mem->head), rs_z(mem->rs_z), size(mem->size) {
1104-
}
1101+
std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), mem(mem), ubatches(std::move(ubatches)) {}
11051102

11061103
llama_memory_recurrent_context::~llama_memory_recurrent_context() = default;
11071104

@@ -1142,19 +1139,19 @@ const llama_ubatch & llama_memory_recurrent_context::get_ubatch() const {
11421139
}
11431140

11441141
uint32_t llama_memory_recurrent_context::get_n_rs() const {
1145-
return n_rs;
1142+
return is_full ? mem->size : mem->n;
11461143
}
11471144

11481145
uint32_t llama_memory_recurrent_context::get_head() const {
1149-
return head;
1146+
return is_full ? 0 : mem->head;
11501147
}
11511148

11521149
int32_t llama_memory_recurrent_context::get_rs_z() const {
1153-
return rs_z;
1150+
return is_full ? 0 : mem->rs_z;
11541151
}
11551152

11561153
uint32_t llama_memory_recurrent_context::get_size() const {
1157-
return size;
1154+
return mem->size;
11581155
}
11591156

11601157
ggml_tensor * llama_memory_recurrent_context::get_r_l(int32_t il) const {
@@ -1166,5 +1163,5 @@ ggml_tensor * llama_memory_recurrent_context::get_s_l(int32_t il) const {
11661163
}
11671164

11681165
int32_t llama_memory_recurrent_context::s_copy(int i) const {
1169-
return mem->cells[i + head].src0;
1166+
return mem->cells[i + mem->head].src0;
11701167
}

src/llama-memory-recurrent.h

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -175,10 +175,8 @@ class llama_memory_recurrent_context : public llama_memory_context_i {
175175

176176
//
177177
// data needed for building the compute graph for the current ubatch:
178+
// TODO: extract all the state like `head` and `n` here
178179
//
179180

180-
const uint32_t n_rs = 0;
181-
const uint32_t head = 0;
182-
const int32_t rs_z = -1;
183-
const uint32_t size = 0;
181+
const bool is_full = false;
184182
};

0 commit comments

Comments (0)