Commit 632a7bf

feat: token prediction (speculative decoding) (#405)

* feat: token prediction (speculative decoding)
* feat: `DraftSequenceTokenPredictor`
* feat: `InputLookupTokenPredictor`
* feat: `controlledEvaluate`
* feat: reranking (`LlamaRankingContext`)
* feat: `evaluateWithMetadata`
* feat: token confidence
* feat: `experimentalChunkDocument`
* feat: build on arm64 using LLVM, use Visual Studio's CMake when available
* feat: try compiling with LLVM on Windows x64 when available
* feat(minor): dynamically load `llama.cpp` backends
* feat(minor): more token values support in `SpecialToken`
* feat(minor): improve memory usage estimation
* fix: check for Rosetta usage on macOS x64 when using the `inspect gpu` command
* fix: detect running under Rosetta on Apple Silicon and show an error message instead of crashing
* fix: switch from `"nextTick"` to `"nextCycle"` for the default batch dispatcher
* fix: remove deprecated CLS token
* fix: pipe error logs in `inspect gpu` command
* docs: improve building from source
* docs: CUDA in Docker troubleshooting
* docs: reranking
* docs: context shift strategy
* docs: remove Intel AMX trick, since it's automatically used in the prebuilt binaries now
1 parent e2c5c3f commit 632a7bf
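As background for the headline feature: in speculative decoding, a fast draft model speculates the next few tokens and the main model verifies them in a single evaluation, so accepted drafts speed up generation while the output stays identical. Below is a minimal sketch of wiring the new `DraftSequenceTokenPredictor` into a chat session; the model file names are illustrative placeholders, and the wiring follows the API names this commit introduces rather than a verbatim example from it:

```typescript
import {fileURLToPath} from "url";
import path from "path";
import {
    getLlama, LlamaChatSession, DraftSequenceTokenPredictor
} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const llama = await getLlama();

// a small, fast model that drafts tokens for the main model to verify
const draftModel = await llama.loadModel({
    modelPath: path.join(__dirname, "models", "draft-model.gguf") // illustrative path
});
const model = await llama.loadModel({
    modelPath: path.join(__dirname, "models", "main-model.gguf") // illustrative path
});

const draftContext = await draftModel.createContext();
const context = await model.createContext();

// attach the draft sequence as a token predictor for the main sequence
const session = new LlamaChatSession({
    contextSequence: context.getSequence({
        tokenPredictor: new DraftSequenceTokenPredictor(draftContext.getSequence())
    })
});

console.log(await session.prompt("Where do llamas come from?"));
```

Since the main model verifies every drafted token, an accepted draft only saves time and a rejected one costs a little extra compute; the generated text itself is unchanged.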

123 files changed: +7863 -938 lines


.config/typedoc.json

Lines changed: 1 addition & 1 deletion
@@ -27,6 +27,6 @@
     "interfacePropertiesFormat": "list",
     "sort": ["source-order"],
     "docsRoot": "../docs",
-    "intentionallyNotExported": ["MergeOptionalUnionTypes", "GbnfJsonSchemaToTSType", "_LlamaText"],
+    "intentionallyNotExported": ["MergeOptionalUnionTypes", "PickOptions", "GbnfJsonSchemaToTSType", "_LlamaText"],
     "useHTMLEncodedBrackets": true
 }

.github/workflows/build.yml

Lines changed: 1 addition & 2 deletions
@@ -23,8 +23,7 @@ jobs:
       - name: Download latest llama.cpp release
         env:
           CI: true
-        # pinned to `b4291` temporarily until the Windows on Arm64 build is fixed
-        run: node ./dist/cli/cli.js source download --release b4291 --skipBuild --noBundle --noUsageExample --updateBinariesReleaseMetadataAndSaveGitBundle
+        run: node ./dist/cli/cli.js source download --release latest --skipBuild --noBundle --noUsageExample --updateBinariesReleaseMetadataAndSaveGitBundle
       - name: Upload build artifact
         uses: actions/upload-artifact@v4
         with:

.vitepress/config.ts

Lines changed: 10 additions & 1 deletion
@@ -132,13 +132,16 @@ export default defineConfig({
             item.lastmod = new Date(buildDate);
             item.changefreq = "daily";
             item.priority = 0.9;
+        } else if (item.url === "guide/") {
+            item.changefreq = "daily";
+            item.priority = 0.7;
         } else if (item.url.startsWith("api/") || item.url.startsWith("cli/")) {
             item = {
                 ...item,
                 lastmod: new Date(buildDate),
                 changefreq: "weekly",
                 priority: item.url.startsWith("cli/")
-                    ? 0.7
+                    ? 0.6
                     : 0.5
             };
         } else if (item.lastmod == null && item.url.startsWith("blog/")) {
@@ -358,6 +361,9 @@ export default defineConfig({
         }
     },
     markdown: {
+        languageAlias: {
+            "js-highlight": "javascript"
+        },
         codeTransformers: [
             transformerTwoslash({
                 explicitTrigger: false,
@@ -482,7 +488,10 @@ export default defineConfig({
                 {text: "External Chat State", link: "/external-chat-state"},
                 {text: "Token Bias", link: "/token-bias"},
                 {text: "Objects Lifecycle", link: "/objects-lifecycle"},
+                {text: "Chat Context Shift", link: "/chat-context-shift"},
                 {text: "Batching", link: "/batching"},
+                {text: "Token Prediction", link: "/token-prediction"},
+                {text: "Low Level API", link: "/low-level-api"},
                 {text: "Awesome List", link: "/awesome"},
                 {text: "Troubleshooting", link: "/troubleshooting"},
                 {text: "Tips and Tricks", link: "/tips-and-tricks"}

.vitepress/config/apiReferenceSidebar.ts

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,6 @@
 import {DefaultTheme} from "vitepress";
 /* eslint import/no-unresolved: "off" */
-import typedocSidebar from "../../docs/api/typedoc-sidebar.json"; // if this import fails, run `npm run docs:generateTypedoc`
+import typedocSidebar from "../../docs/api/typedoc-sidebar.json";
 
 const categoryOrder = [
     "Functions",
@@ -28,6 +28,7 @@ const classesOrder = [
     "LlamaCompletion",
     "LlamaEmbeddingContext",
     "LlamaEmbedding",
+    "LlamaRankingContext",
     "LlamaGrammar",
     "LlamaJsonSchemaGrammar",
     "LlamaText",

.vitepress/theme/style.css

Lines changed: 2 additions & 1 deletion
@@ -354,7 +354,8 @@ div.search-keyboard-shortcuts[class] kbd:last-of-type {
 }
 
 .language-ts > .lang,
-.language-shell > .lang {
+.language-shell > .lang,
+.language-js-highlight > .lang {
     display: none;
 }
 

.vitepress/utils/parseCmakeListsTxtOptions.ts

Lines changed: 3 additions & 3 deletions
@@ -1,5 +1,7 @@
 const maxLinesSpan = 10;
 
+const cmakeOptionRegex =
+    /^\s*option\([\s\t\n\r]*(?<key>\S+)[\s\t\n\r]+"(?<description>(?:\\"|[^"])*)"[\s\t\n\r]+(?<defaultValue>\S+)[\s\t\n\r]*\)/;
 export function parseCmakeListsTxtOptions(cmakeListsTxtString: string) {
     const lines = cmakeListsTxtString.split("\n");
 
@@ -8,9 +10,7 @@ export function parseCmakeListsTxtOptions(cmakeListsTxtString: string) {
     const match = lines
         .slice(index, index + maxLinesSpan)
         .join("\n")
-        .match(
-            /^option\([\s\t\n\r]*(?<key>\S+)[\s\t\n\r]+"(?<description>(?:\\"|[^"])*)"[\s\t\n\r]+(?<defaultValue>\S+)[\s\t\n\r]*\)/
-        );
+        .match(cmakeOptionRegex);
     if (match == null || match.groups == null || match?.index !== 0)
         return null;
 
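For context on the refactor above, the extracted `cmakeOptionRegex` captures the name, description, and default value of a CMake `option()` declaration. A small self-contained sketch of what it matches (the option line shown is illustrative):

```typescript
// the same regex as in the diff above, applied to a typical CMake option line
const cmakeOptionRegex =
    /^\s*option\([\s\t\n\r]*(?<key>\S+)[\s\t\n\r]+"(?<description>(?:\\"|[^"])*)"[\s\t\n\r]+(?<defaultValue>\S+)[\s\t\n\r]*\)/;

const match = 'option(GGML_METAL "ggml: use Metal" ON)'.match(cmakeOptionRegex);
console.log(match?.groups);
// -> {key: "GGML_METAL", description: "ggml: use Metal", defaultValue: "ON"}
```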

README.md

Lines changed: 2 additions & 1 deletion
@@ -26,8 +26,9 @@
 * [Use the CLI to chat with a model without writing any code](#try-it-without-installing)
 * Up-to-date with the latest `llama.cpp`. Download and compile the latest release with a [single CLI command](https://node-llama-cpp.withcat.ai//guide/building-from-source#downloading-a-release)
 * Enforce a model to generate output in a parseable format, [like JSON](https://node-llama-cpp.withcat.ai/guide/chat-session#json-response), or even force it to [follow a specific JSON schema](https://node-llama-cpp.withcat.ai/guide/chat-session#response-json-schema)
-* [Provide a model with functions it can call on demand](https://node-llama-cpp.withcat.ai/guide/chat-session#function-calling) to retrieve information of perform actions
+* [Provide a model with functions it can call on demand](https://node-llama-cpp.withcat.ai/guide/chat-session#function-calling) to retrieve information or perform actions
 * [Embedding support](https://node-llama-cpp.withcat.ai/guide/embedding)
+* [Safe against special token injection attacks](https://node-llama-cpp.withcat.ai/guide/llama-text#input-safety-in-node-llama-cpp)
 * Great developer experience with full TypeScript support, and [complete documentation](https://node-llama-cpp.withcat.ai/guide/)
 * Much more
 

docs/guide/building-from-source.md

Lines changed: 51 additions & 2 deletions
@@ -25,13 +25,62 @@ This is useful for building from source on machines that aren't connected to the
 :::
 
 ::: info
-
 If `cmake` is not installed on your machine, `node-llama-cpp` will automatically download `cmake` to an internal directory and try to use it to build `llama.cpp` from source.
 
 If the build fails, make sure you have the required dependencies of `cmake` installed on your machine. More info is available [here](https://github.com/cmake-js/cmake-js#:~:text=projectRoot/build%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%5Bstring%5D-,Requirements%3A,-CMake) (you don't have to install `cmake` or `cmake-js`, just the dependencies).
+:::
+
+::: details Dependencies for macOS
+If the build fails on macOS with the error `"/usr/bin/cc" is not able to compile a simple test program`,
+try running this command to install the Xcode command line tools:
+```shell
+xcode-select --install
+```
+:::
+
+::: details Dependencies for Windows x64
+If the build fails on your machine, ensure you have all the necessary build tools installed.
+
+You can install all the dependencies via [WinGet](https://learn.microsoft.com/en-us/windows/package-manager/winget/) using this command:
+```shell
+winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--add Microsoft.VisualStudio.Component.VC.CMake.Project Microsoft.VisualStudio.Component.VC.CoreBuildTools Microsoft.VisualStudio.Component.VC.Tools.x86.x64 Microsoft.VisualStudio.Component.VC.ATL Microsoft.VisualStudio.Component.VC.ATLMFC Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset Microsoft.VisualStudio.Component.VC.Llvm.Clang Microsoft.VisualStudio.Component.VC.Redist.14.Latest Microsoft.Component.VC.Runtime.UCRTSDK Microsoft.VisualStudio.Component.Windows10SDK Microsoft.VisualStudio.Component.Windows10SDK.20348"
+```
+> WinGet is built-in on Windows 11 and modern Windows 10 versions
 
-If the build fails on macOS with the error `"/usr/bin/cc" is not able to compile a simple test program`, try running `xcode-select --install` to install the Xcode command line tools.
+---
+
+You can also install all the dependencies manually using the [Visual C++ Build Tools installer](https://visualstudio.microsoft.com/visual-cpp-build-tools/):
+* **`Workloads` tab:** select `Desktop development with C++`
+* **`Individual components` tab**: select the following:
+    * C++ ATL for latest v143 build tools (x86 & x64)
+    * C++ MFC for latest v143 build tools (x86 & x64)
+    * C++ CMake tools for Windows
+    * C++ Clang Compiler for Windows
+    * MSBuild support for LLVM (clang-cl) toolset
+    * Windows Universal CRT SDK
+:::
+
+::: details Dependencies for Windows on Arm
+On Windows on Arm you need to install additional build tools to build `llama.cpp` from source.
+
+You can install all the dependencies via [WinGet](https://learn.microsoft.com/en-us/windows/package-manager/winget/) using this command:
+```shell
+winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--add Microsoft.VisualStudio.Component.VC.CMake.Project Microsoft.VisualStudio.Component.VC.CoreBuildTools Microsoft.VisualStudio.Component.VC.Tools.x86.x64 Microsoft.VisualStudio.Component.VC.Tools.ARM64 Microsoft.VisualStudio.Component.VC.ATL Microsoft.VisualStudio.Component.VC.ATL.ARM64 Microsoft.VisualStudio.Component.VC.ATLMFC Microsoft.VisualStudio.Component.VC.MFC.ARM64 Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset Microsoft.VisualStudio.Component.VC.Llvm.Clang Microsoft.VisualStudio.Component.VC.Redist.14.Latest Microsoft.Component.VC.Runtime.UCRTSDK Microsoft.VisualStudio.Component.Windows10SDK Microsoft.VisualStudio.Component.Windows10SDK.20348"
+```
+> WinGet is built-in on Windows 11 and modern Windows 10 versions
+
+---
 
+You can also install all the dependencies manually using the [Visual C++ Build Tools installer](https://visualstudio.microsoft.com/visual-cpp-build-tools/):
+* **`Workloads` tab:** select `Desktop development with C++`
+* **`Individual components` tab**: select the following:
+    * MSVC v143 - VS 2022 C++ ARM64 build tools (latest)
+    * C++ ATL for latest v143 build tools (ARM64/ARM64EC)
+    * C++ MFC for latest v143 build tools (ARM64/ARM64EC)
+    * C++ CMake tools for Windows
+    * C++ Clang Compiler for Windows
+    * MSBuild support for LLVM (clang-cl) toolset
+    * Windows Universal CRT SDK
 :::
 
 ## `source download` and `source build` Commands

docs/guide/chat-context-shift.md

Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+# Chat Context Shift Strategy {#background}
+When the chat history gets longer than the sequence's context size, we have to remove the oldest tokens from the context state to make room for new tokens to be generated.
+This is called a context shift.
+
+`node-llama-cpp` has a smart mechanism to handle context shifts on the chat level, so the oldest messages are truncated (from their beginning) or removed from the context state, while keeping the system prompt in place to ensure the model follows the guidelines you set for it.
+
+You can override `node-llama-cpp`'s default context shift strategy
+when using [`LlamaChatSession`](../api/classes/LlamaChatSession.md) or [`LlamaChat`](../api/classes/LlamaChat.md)
+by providing a custom context shift strategy.
+
+## The Default Context Shift Strategy {#default-strategy}
+The [default context shift strategy](../api/type-aliases/LLamaChatContextShiftOptions.md#strategy) is `eraseFirstResponseAndKeepFirstSystem`.
+
+This strategy attempts to truncate the oldest model responses (from their beginning) or remove them completely from the chat history while keeping the first system prompt in place.
+If a response is completely removed, the prompt that came before it will be removed as well.
+
+## Implementing a Custom Context Shift Strategy {#custom-strategy}
+A [custom context shift strategy](../api/type-aliases/LLamaChatContextShiftOptions.md#strategy) is a function that receives the full chat history as input and
+returns a new chat history that when tokenized will result in an array of tokens shorter than the desired max size.
+
+The context shift strategy will be called only when the context state needs to be shifted.
+
+If the context shift strategy returns an invalid chat history (e.g., a chat history that is too long),
+the prompting function will abort the evaluation and throw an error.
+
+A custom context shift strategy can be a simple logic that prioritizes which data to remove,
+or it can even use a language model to summarize information to shorten the chat history.
+
+It's important to keep the last user prompt and model response as-is to prevent infinite generation loops.
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {getLlama, LlamaChatSession} from "node-llama-cpp";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const llama = await getLlama();
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf")
+});
+const context = await model.createContext();
+
+// ---cut---
+const session = new LlamaChatSession({
+    contextSequence: context.getSequence(),
+    contextShift: {
+        strategy({
+            chatHistory, chatWrapper, maxTokensCount, tokenizer,
+            lastShiftMetadata
+        }) {
+            // clone the chat history to not mutate the original
+            const newChatHistory = chatHistory.map(
+                (item) => structuredClone(item)
+            );
+
+            function getTokensLeftToRemove() {
+                const {
+                    contextText
+                } = chatWrapper.generateContextState({chatHistory});
+                const tokenUsage = contextText.tokenize(tokenizer).length;
+
+                return Math.max(0, tokenUsage - maxTokensCount);
+            }
+
+            while (getTokensLeftToRemove() > 0 && newChatHistory.length > 2) {
+                for (let i = 0; i < newChatHistory.length - 2; i++) {
+                    const chatItem = newChatHistory[i]!;
+
+                    if (i === 0 && chatItem.type === "system")
+                        // don't remove the first system message
+                        continue;
+                    else if (chatItem.type === "model") {
+                        // remove the model response
+                        newChatHistory.splice(i, 1);
+                        i--;
+
+                        // remove the user messages that
+                        // came before the model response
+                        while (
+                            i > 0 &&
+                            newChatHistory[i - 1]?.type === "user"
+                        ) {
+                            newChatHistory.splice(i - 1, 1);
+                            i--;
+                        }
+                    } else if (chatItem.type === "system") {
+                        // don't remove system messages on their own
+                        continue;
+                    } else if (chatItem.type === "user") {
+                        // don't remove user messages on their own
+                        continue;
+                    } else {
+                        // ensure we handle all message types.
+                        // otherwise, this will error
+                        void (chatItem satisfies never);
+                    }
+                }
+            }
+
+            return {
+                chatHistory: newChatHistory,
+
+                // this metadata will be passed to the next context shift
+                // strategy call as the `lastShiftMetadata` argument
+                metadata: {}
+            };
+        }
+    }
+});
+```

docs/guide/choosing-a-model.md

Lines changed: 14 additions & 0 deletions
@@ -124,6 +124,20 @@ Here are a few concepts to be aware of when choosing a model:
 
   Many embedding models include terms like `embed` in their name.
 
+* **Reranking models** - models that are trained to rerank (sort) a list of documents
+  based on their relevance to a given query.
+
+  Reranking models are often significantly smaller (sometimes as small as 500MB), faster,
+  and consume less memory than general-purpose models,
+  making them more efficient and practical for reranking tasks.
+
+  While general-purpose models can also be used for reranking,
+  doing this requires prompting the model, which is more cumbersome and inefficient than
+  using a specialized model with a [ranking context](./embedding.md#reranking) for this task.
+
+  Many reranking models include terms like `rerank` or `reranker` in their name.
+
 ### How much data do you plan to feed the model at once with?
 If you plan to feed the model with a lot of data at once, you'll need a model that supports a large context size.
 The larger the context size is, the more data the model can process at once.
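To illustrate the reranking support that this documentation change points to, here is a minimal sketch using the new `LlamaRankingContext`; the model path is an illustrative placeholder, and the `createRankingContext`/`rankAndSort` calls are assumed from the reranking guide this commit adds:

```typescript
import {fileURLToPath} from "url";
import path from "path";
import {getLlama} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const llama = await getLlama();
const model = await llama.loadModel({
    // illustrative path to a dedicated reranking model
    modelPath: path.join(__dirname, "models", "bge-reranker-v2-m3-Q8_0.gguf")
});
const rankingContext = await model.createRankingContext();

const documents = [
    "The sky is clear and blue today",
    "Llamas are camelids native to South America",
    "I ate an apple for breakfast"
];

// sort the documents by their relevance to the query
const rankedDocuments = await rankingContext.rankAndSort(
    "Where do llamas live?",
    documents
);

// the most relevant document, with its relevance score
console.log(rankedDocuments[0]);
```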
