fix: Fix off-by-one for limiting cached tokens to before alora start

gabe-l-hart · gabe-l-hart · commit 52b37e13bc8a · 2025-08-22T10:00:10.000-06:00
This off-by-one was the cause of the inconsistent results from the dummy
test script: its output differed depending on whether the script included
the turn that runs the prompt without the adapter before running it with
the adapter.

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
@@ -3338,7 +3338,7 @@ struct server_context {
                                 // if there is an alora invoked, don't cache after the invocation start
                                 if (slot.alora_invocation_start >= 0) {
                                     SLT_DBG(slot, "only caching to alora invocation start (n_past=%d, alora_invocation_start=%d)\n", slot.n_past, slot.alora_invocation_start);
-                                    slot.n_past = std::min(slot.n_past, slot.alora_invocation_start);
+                                    slot.n_past = std::min(slot.n_past, slot.alora_invocation_start - 1);
                                 }
 
                                 // reuse chunks from the cached prompt by shifting their KV cache in the new position

Original file line number	Diff line number	Diff line change
`@@ -3338,7 +3338,7 @@ struct server_context {`
`3338`	`3338`	`// if there is an alora invoked, don't cache after the invocation start`
`3339`	`3339`	`if (slot.alora_invocation_start >= 0) {`
`3340`	`3340`	`SLT_DBG(slot, "only caching to alora invocation start (n_past=%d, alora_invocation_start=%d)\n", slot.n_past, slot.alora_invocation_start);`
`3341`		`- slot.n_past = std::min(slot.n_past, slot.alora_invocation_start);`
	`3341`	`+ slot.n_past = std::min(slot.n_past, slot.alora_invocation_start - 1);`
`3342`	`3342`	`}`
`3343`	`3343`
`3344`	`3344`	`// reuse chunks from the cached prompt by shifting their KV cache in the new position`