
Commit 1196526

fix: bugs
1 parent 06c0d60 commit 1196526

2 files changed: +12 -4 lines changed

llama/addon/AddonContext.cpp

Lines changed: 1 addition & 1 deletion
@@ -403,7 +403,7 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<Ad
     }

     if (options.Has("batchSize")) {
-        context_params.n_batch = options.Get("batchSize").As<Napi::Number>().Uint32Value();
+        context_params.n_batch = options.Get("batchSize").As<Napi::Number>().Uint32Value() + 1; // +1 to handle edge cases with SWA KV cache
         context_params.n_ubatch = context_params.n_batch; // the batch queue is managed in the JS side, so there's no need for managing it on the C++ side
     }
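For orientation, here is a rough sketch of where the `batchSize` value consumed above comes from on the JS side. The `getLlama`/`loadModel`/`createContext` calls and the placeholder model path are assumptions based on the library's usual usage rather than part of this commit; `swaFullCache` is the option the updated doc comment in LlamaContext.ts below refers to.

import {getLlama} from "node-llama-cpp";

// assumed usage; "model.gguf" is a placeholder path
const llama = await getLlama();
const model = await llama.loadModel({modelPath: "model.gguf"});

// `batchSize` is forwarded to `context_params.n_batch` in AddonContext.cpp
// (now padded with +1 for the SWA KV cache edge case);
// `swaFullCache: true` disables SWA, per the doc comment below
const context = await model.createContext({
    batchSize: 512,
    swaFullCache: true
});
const sequence = context.getSequence();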

src/evaluator/LlamaContext/LlamaContext.ts

Lines changed: 11 additions & 3 deletions
@@ -1055,7 +1055,9 @@ export class LlamaContextSequence {
      *
      * This index can be greater than `0` only when SWA (Sliding Window Attention) is used (only on supported models).
      *
-     * When SWA is used, this index will usually be `Math.max(0, .nextTokenIndex - .model.fileInsights.swaSize)` or larger.
+     * When SWA is used, this index will usually be `Math.max(-1, .nextTokenIndex - .model.fileInsights.swaSize)` or larger.
+     *
+     * When the KV cache is empty, this index will be `-1`.
      *
      * You can disable SWA by setting the `swaFullCache` option to `true` when creating a context.
      */
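To make the updated bound concrete, a small worked example with made-up numbers (the names mirror the doc comment above):

// illustrative values only
const nextTokenIndex = 1000;  // sequence.nextTokenIndex
const swaSize = 512;          // model.fileInsights.swaSize

// with SWA, the minimum KV cache position is usually this value or larger
const usualLowerBound = Math.max(-1, nextTokenIndex - swaSize);  // 488

// with an empty KV cache, the reported index is now -1 instead of 0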
@@ -1207,6 +1209,8 @@
     ) {
         this._ensureNotDisposed();

+        let awaitPromise: Promise<void> | undefined;
+
         await withLock(this._context, "context", async () => {
             this._ensureNotDisposed();

@@ -1250,7 +1254,7 @@

             const minKvCachePosition = (this._contextTokens.length === 0 && this._loadedTokenPredictions.length === 0)
                 ? 0
-                : this._context._ctx.getSequenceKvCacheMinPosition(this._sequenceId);
+                : Math.max(0, this._context._ctx.getSequenceKvCacheMinPosition(this._sequenceId));
             if (resolvedRanges[0] != null && resolvedRanges[0].start <= minKvCachePosition)
                 // we have to drop the cache and reevaluate the sequence due to missing KV cache
                 deletionSuccessful = false;
@@ -1310,8 +1314,12 @@
             this._nextTokenIndex = 0;
             this._context._ctx.disposeSequence(this._sequenceId);

-            await this.evaluateWithoutGeneratingNewTokens(newSequenceTokens, {_skipLock: skipLock});
+            // wait for the evaluation outside the "context" lock to avoid deadlocks
+            awaitPromise = this.evaluateWithoutGeneratingNewTokens(newSequenceTokens, {_skipLock: skipLock});
         });
+
+        if (awaitPromise != null)
+            await awaitPromise;
     }

     /**
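The last hunk moves the `await` of the re-evaluation outside the `"context"` lock. Below is a self-contained sketch of why that matters, with a promise-chain `Mutex` written purely as a stand-in for the library's lock utility, and a hypothetical `evaluateTokens` function that, like `evaluateWithoutGeneratingNewTokens` presumably does when `_skipLock` is not set, acquires the same lock internally.

// a promise-chain mutex, written only for this sketch
class Mutex {
    private _queue: Promise<void> = Promise.resolve();

    public withLock<T>(callback: () => Promise<T>): Promise<T> {
        const result = this._queue.then(callback);
        // keep the chain alive even if the callback rejects
        this._queue = result.then(() => undefined, () => undefined);
        return result;
    }
}

const contextLock = new Mutex();

// hypothetical operation that takes the same lock internally,
// analogous to evaluateWithoutGeneratingNewTokens() without _skipLock
async function evaluateTokens(): Promise<void> {
    await contextLock.withLock(async () => {
        // ...process the batch...
    });
}

async function resetAndReevaluate(): Promise<void> {
    let awaitPromise: Promise<void> | undefined;

    await contextLock.withLock(async () => {
        // ...mutate the sequence state while holding the lock...

        // `await evaluateTokens()` here would deadlock: evaluateTokens() queues up
        // behind this callback and cannot start until the lock is released
        awaitPromise = evaluateTokens();
    });

    // awaiting after the lock is released lets the queued evaluation run to completion
    if (awaitPromise != null)
        await awaitPromise;
}

Calling `resetAndReevaluate()` completes normally; moving the `await` back inside the lock callback reproduces the hang that the in-diff comment refers to.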
