10 changes: 4 additions & 6 deletions .github/ISSUE_TEMPLATE/bug-report.yml
@@ -35,11 +35,10 @@ body:
id: steps
attributes:
label: Steps to reproduce
description: >-
description: |-
Your bug can be investigated much faster if your code can be run without any dependencies other than `node-llama-cpp`.
Issues without reproduction steps or code examples may be closed as not actionable.
Please try to provide a Minimal, Complete, and Verifiable example ([link](http://stackoverflow.com/help/mcve)).
Please include a link to the model file you used if possible.
Please try to provide a Minimal, Complete, and Verifiable example ([link](http://stackoverflow.com/help/mcve)), including a link to the model file you used if possible.
Also, please enable debug logs by using `getLlama({debug: true})` to get more information.
placeholder: >-
Please try to provide a Minimal, Complete, and Verifiable example.
@@ -50,10 +49,9 @@ body:
id: env
attributes:
label: My Environment
description: >-
description: |-
Please include the result of the command `npx --yes node-llama-cpp inspect gpu`.
Please also add any other relevant dependencies to this table at the end.
For example: Electron, Bun, Webpack.
Please also add any other relevant dependencies to this table at the end. For example: Electron, Bun, Webpack.
value: |
| Dependency | Version |
| --- | --- |
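For reference, the debug flag mentioned in the template is passed to `getLlama` — a minimal sketch (the surrounding code is illustrative):

```typescript
import {getLlama} from "node-llama-cpp";

// enable debug logs to get more detailed output when reproducing a bug
const llama = await getLlama({debug: true});
```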
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/documentation-issue.yml
@@ -13,7 +13,7 @@ body:
id: details
attributes:
label: What was unclear or otherwise insufficient?
description: >-
description: |-
If relevant, please be clear about the documentation URL, as well as the location within the page.
Add a link to the relevant documentation you're referring to.
placeholder: >-
4 changes: 4 additions & 0 deletions .github/ISSUE_TEMPLATE/feature-request.yml
@@ -51,8 +51,12 @@ body:
required: false
- label: CUDA support
required: false
- label: Vulkan support
required: false
- label: Grammar
required: false
- label: Function calling
required: false
- type: dropdown
id: pr
attributes:
34 changes: 30 additions & 4 deletions .vitepress/config.ts
@@ -34,7 +34,8 @@ const packageVersion = env.get("DOCS_PACKAGE_VERSION")
.default(packageJson.version)
.asString();

const hostname = "https://node-llama-cpp.withcat.ai/";
const hostname = "https://node-llama-cpp.withcat.ai/"
const buildDate = new Date();

const socialPosterLink = hostname + "social.poster.jpg";
const defaultPageTitle = "node-llama-cpp - node.js bindings for llama.cpp";
@@ -90,7 +91,7 @@ export default defineConfig({
base: urlBase,
sitemap: {
hostname,
transformItems(items) {
async transformItems(items) {
function priorityMatch(a: {url: string}, b: {url: string}, matchers: ((url: string) => boolean)[]): number {
for (const matcher of matchers) {
const aMatch = matcher(a.url);
@@ -105,13 +106,38 @@
return 0;
}

const blogPosts = await createContentLoader("blog/*.md", {
excerpt: true,
render: true
})
.load();
const blogPostMap = new Map<string, typeof blogPosts[number]>();
for (const blogPost of blogPosts) {
let url = blogPost.url;
if (url.startsWith("/"))
url = url.slice("/".length);

blogPostMap.set(url, blogPost);
}

return items
.map((item) => {
if (item.url.startsWith("api/") || item.url.startsWith("cli/")) {
if (item.url === "" || item.url === "blog/") {
item.lastmod = new Date(buildDate);
} else if (item.url.startsWith("api/") || item.url.startsWith("cli/")) {
item = {
...item,
lastmod: undefined
lastmod: new Date(buildDate)
};
} else if (item.lastmod == null && item.url.startsWith("blog/")) {
const postDate = blogPostMap.get(item.url)?.frontmatter.date;
if (postDate != null) {
const parsedDate = new Date(postDate);
if (Number.isFinite(parsedDate.getTime()))
item.lastmod = parsedDate;
}
} else if (item.lastmod == null) {
item.lastmod = new Date(buildDate);
}

return item;
2 changes: 1 addition & 1 deletion README.md
@@ -1,5 +1,5 @@
<div align="center">
<img alt="node-llama-cpp Logo" src="https://raw.githubusercontent.com/withcatai/node-llama-cpp/master/assets/logo.v3.roundEdges.avif" width="360px" />
<a href="https://node-llama-cpp.withcat.ai" target="_blank"><img alt="node-llama-cpp Logo" src="https://raw.githubusercontent.com/withcatai/node-llama-cpp/master/assets/logo.v3.roundEdges.avif" width="360px" /></a>
<h1>node-llama-cpp</h1>
<p>Run AI models locally on your machine</p>
<sub>Pre-built bindings are provided with a fallback to building from source with cmake</sub>
31 changes: 31 additions & 0 deletions docs/guide/chat-session.md
@@ -671,3 +671,34 @@ await new Promise(resolve => setTimeout(resolve, 1500));
const cachedCompletion = completionEngine.complete("Hi there! How");
console.log("Cached completion:", cachedCompletion);
```

## Response Prefix {#response-prefix}
You can force the model's response to start with a specific prefix
to steer the response in a certain direction.

```typescript
import {fileURLToPath} from "url";
import path from "path";
import {getLlama, LlamaChatSession, GeneralChatWrapper} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const llama = await getLlama();
const model = await llama.loadModel({
modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf")
});
const context = await model.createContext();
const session = new LlamaChatSession({
contextSequence: context.getSequence(),
chatWrapper: new GeneralChatWrapper()
});


const q1 = "Hi there, how are you?";
console.log("User: " + q1);

const a1 = await session.prompt(q1, {
responsePrefix: "The weather today is"
});
console.log("AI: " + a1);
```
24 changes: 24 additions & 0 deletions docs/guide/electron.md
@@ -37,3 +37,27 @@ so that `node-llama-cpp` can find them.
Cross packaging from one platform to another is not supported, since binaries for other platforms are not downloaded to your machine when you run `npm install`.

Packaging an `arm64` app on an `x64` machine is supported, but packaging an `x64` app on an `arm64` machine is not.

## Bundling
When bundling your code for Electron using [Electron Vite](https://electron-vite.org) or Webpack,
ensure that `node-llama-cpp` is not bundled, and is instead treated as an external module.

Marking `node-llama-cpp` as an external module will prevent its code from being bundled with your application code,
and instead, it'll be loaded from the `node_modules` directory at runtime (which should be packed into a `.asar` archive).

The file structure of `node-llama-cpp` is crucial for it to function correctly,
so bundling it will break its functionality.
Moreover, since `node-llama-cpp` includes prebuilt binaries (and also local builds from source),
those files must be retained in their original structure for it to work.

Electron has [its own bundling solution called ASAR](https://www.electronjs.org/docs/latest/tutorial/asar-archives) that is designed to work with node modules.
ASAR retains the original file structure of node modules by packing all the files into a single `.asar` archive file that Electron will read from at runtime like it would from the file system.
This method ensures node modules work as intended in Electron applications, even though they are bundled into a single file.

Using ASAR is the recommended way to bundle `node-llama-cpp` in your Electron app.

If you're using the scaffolded Electron app, this is already taken care of.

::: tip NOTE
We recommend using [Electron Vite](https://electron-vite.org) over Webpack for your Electron app due to Vite's speed and Webpack's lack of proper ESM support in the output bundle, which complicates the bundling process.
:::
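To illustrate the guidance above, here is a minimal sketch of marking `node-llama-cpp` as an external module in an [Electron Vite](https://electron-vite.org) config, assuming the default electron-vite project layout (the exact options in your config may differ):

```typescript
// electron.vite.config.ts
import {defineConfig, externalizeDepsPlugin} from "electron-vite";

export default defineConfig({
    main: {
        // keep dependencies, including node-llama-cpp, out of the bundle
        // so they are loaded from node_modules (inside the .asar archive) at runtime
        plugins: [externalizeDepsPlugin()],
        build: {
            rollupOptions: {
                external: ["node-llama-cpp"]
            }
        }
    },
    preload: {
        plugins: [externalizeDepsPlugin()]
    },
    renderer: {}
});
```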
34 changes: 34 additions & 0 deletions docs/guide/tips-and-tricks.md
@@ -85,3 +85,37 @@ npx --no node-llama-cpp source download
```

Now, just use `node-llama-cpp` as you normally would.

## Intel AMX {#intel-amx}
> Intel AMX (Advanced Matrix Extensions) is a dedicated hardware block found on Intel Xeon processors
> that helps optimize and accelerate matrix multiplication operations.
>
> It's available on the 4th Gen and newer Intel Xeon processors.

Intel AMX can improve CPU inference performance [by 2x and even up to 14x](https://github.com/ggerganov/llama.cpp/pull/7707) on supported CPUs (under specific conditions).

If you're using a 4th Gen or newer Intel Xeon processor,
you might want to [build `llama.cpp` from source](./building-from-source.md) to utilize these hardware-specific optimizations.

To do this, run this command inside your project directory on the machine that will run it:
```shell
npx --no node-llama-cpp source download
```

Alternatively, you can force `node-llama-cpp` to not use its prebuilt binaries
and instead build from source when calling [`getLlama`](../api/functions/getLlama.md) for the first time on a Xeon CPU:

```typescript
import os from "os";
import {getLlama} from "node-llama-cpp";

const llama = await getLlama({
usePrebuiltBinaries: !os.cpus().some((cpu) => (
cpu.model.toLowerCase().includes("Xeon".toLowerCase())
))
});
```
::: info NOTE
Building from source can take some time (when using CUDA even up to an hour in extreme cases),
so ensure you dedicate some time for this as part of the deployment process.
:::
6 changes: 6 additions & 0 deletions llama/CMakeLists.txt
@@ -22,6 +22,12 @@ execute_process(COMMAND node -p "require('node-addon-api').include.slice(1,-1)"
OUTPUT_VARIABLE NODE_ADDON_API_DIR
OUTPUT_STRIP_TRAILING_WHITESPACE)

set(LLAMA_BUILD_COMMON ON)

if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
add_compile_options(-Wno-c++17-extensions)
endif()

include_directories(${NODE_ADDON_API_DIR} ${CMAKE_JS_INC})

add_subdirectory("llama.cpp")
2 changes: 1 addition & 1 deletion llama/addon/AddonContext.cpp
@@ -447,7 +447,7 @@ Napi::Value AddonContext::AddToBatch(const Napi::CallbackInfo& info) {
GGML_ASSERT(batch.n_tokens + tokensLength <= batch_n_tokens);

for (size_t i = 0; i < tokensLength; i++) {
llama_batch_add(batch, static_cast<llama_token>(tokens[i]), firstTokenContextIndex + i, { sequenceId }, false);
common_batch_add(batch, static_cast<llama_token>(tokens[i]), firstTokenContextIndex + i, { sequenceId }, false);
}

if (generateLogitAtTheEnd) {
8 changes: 4 additions & 4 deletions llama/addon/AddonModel.cpp
@@ -426,7 +426,7 @@ Napi::Value AddonModel::Tokenize(const Napi::CallbackInfo& info) {
std::string text = info[0].As<Napi::String>().Utf8Value();
bool specialTokens = info[1].As<Napi::Boolean>().Value();

std::vector<llama_token> tokens = llama_tokenize(model, text, false, specialTokens);
std::vector<llama_token> tokens = common_tokenize(model, text, false, specialTokens);

Napi::Uint32Array result = Napi::Uint32Array::New(info.Env(), tokens.size());
for (size_t i = 0; i < tokens.size(); ++i) {
@@ -539,23 +539,23 @@ Napi::Value AddonModel::PrefixToken(const Napi::CallbackInfo& info) {
return info.Env().Undefined();
}

return getNapiToken(info, model, llama_token_prefix(model));
return getNapiToken(info, model, llama_token_fim_pre(model));
}
Napi::Value AddonModel::MiddleToken(const Napi::CallbackInfo& info) {
if (disposed) {
Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException();
return info.Env().Undefined();
}

return getNapiToken(info, model, llama_token_middle(model));
return getNapiToken(info, model, llama_token_fim_mid(model));
}
Napi::Value AddonModel::SuffixToken(const Napi::CallbackInfo& info) {
if (disposed) {
Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException();
return info.Env().Undefined();
}

return getNapiToken(info, model, llama_token_suffix(model));
return getNapiToken(info, model, llama_token_fim_suf(model));
}
Napi::Value AddonModel::EotToken(const Napi::CallbackInfo& info) {
if (disposed) {
13 changes: 0 additions & 13 deletions llama/addon/AddonSampler.cpp
@@ -52,11 +52,6 @@ void AddonSampler::dispose() {
topPSampler = nullptr;
}

if (softmaxSampler != nullptr) {
llama_sampler_free(softmaxSampler);
softmaxSampler = nullptr;
}

if (seedSampler != nullptr) {
llama_sampler_free(seedSampler);
seedSampler = nullptr;
@@ -135,10 +130,6 @@ void AddonSampler::rebuildChainIfNeeded() {
llama_sampler_chain_add(chain, temperatureSampler);
}

if (softmaxSampler != nullptr) {
llama_sampler_chain_add(chain, softmaxSampler);
}

if (seedSampler != nullptr) {
llama_sampler_chain_add(chain, seedSampler);
}
@@ -206,10 +197,6 @@ Napi::Value AddonSampler::ApplyConfig(const Napi::CallbackInfo& info) {
}
}

if (softmaxSampler == nullptr) {
softmaxSampler = llama_sampler_init_softmax();
}

if (config.Has("minP")) {
auto minP = config.Get("minP").As<Napi::Number>().FloatValue();
if (minP != minPSampler_minP) {
2 changes: 0 additions & 2 deletions llama/addon/AddonSampler.h
@@ -25,8 +25,6 @@ class AddonSampler : public Napi::ObjectWrap<AddonSampler> {

llama_sampler * topPSampler = nullptr;
float topPSampler_topP = 0.0f; // Top p sampling >=1.0 = disabled

llama_sampler * softmaxSampler = nullptr;

llama_sampler * seedSampler = nullptr;
uint32_t seedSampler_seed = 0;
2 changes: 2 additions & 0 deletions llama/addon/addon.cpp
@@ -8,6 +8,7 @@
#include "globals/addonLog.h"
#include "globals/addonProgress.h"
#include "globals/getGpuInfo.h"
#include "globals/getSwapInfo.h"

bool backendInitialized = false;
bool backendDisposed = false;
@@ -203,6 +204,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) {
Napi::PropertyDescriptor::Function("getGpuVramInfo", getGpuVramInfo),
Napi::PropertyDescriptor::Function("getGpuDeviceInfo", getGpuDeviceInfo),
Napi::PropertyDescriptor::Function("getGpuType", getGpuType),
Napi::PropertyDescriptor::Function("getSwapInfo", getSwapInfo),
Napi::PropertyDescriptor::Function("init", addonInit),
Napi::PropertyDescriptor::Function("dispose", addonDispose),
});
10 changes: 8 additions & 2 deletions llama/addon/globals/getGpuInfo.cpp
@@ -26,6 +26,7 @@ void logVulkanWarning(const char* message) {
Napi::Value getGpuVramInfo(const Napi::CallbackInfo& info) {
uint64_t total = 0;
uint64_t used = 0;
uint64_t unifiedVramSize = 0;

#ifdef GPU_INFO_USE_CUDA
size_t cudaDeviceTotal = 0;
@@ -41,26 +42,31 @@
#ifdef GPU_INFO_USE_VULKAN
uint64_t vulkanDeviceTotal = 0;
uint64_t vulkanDeviceUsed = 0;
const bool vulkanDeviceSupportsMemoryBudgetExtension = gpuInfoGetTotalVulkanDevicesInfo(&vulkanDeviceTotal, &vulkanDeviceUsed, logVulkanWarning);
uint64_t vulkanDeviceUnifiedVramSize = 0;
const bool vulkanDeviceSupportsMemoryBudgetExtension = gpuInfoGetTotalVulkanDevicesInfo(&vulkanDeviceTotal, &vulkanDeviceUsed, &vulkanDeviceUnifiedVramSize, logVulkanWarning);

if (vulkanDeviceSupportsMemoryBudgetExtension) {
total += vulkanDeviceTotal;
used += vulkanDeviceUsed;
unifiedVramSize += vulkanDeviceUnifiedVramSize;
}
#endif

#ifdef GPU_INFO_USE_METAL
uint64_t metalDeviceTotal = 0;
uint64_t metalDeviceUsed = 0;
getMetalGpuInfo(&metalDeviceTotal, &metalDeviceUsed);
uint64_t metalDeviceUnifiedVramSize = 0;
getMetalGpuInfo(&metalDeviceTotal, &metalDeviceUsed, &metalDeviceUnifiedVramSize);

total += metalDeviceTotal;
used += metalDeviceUsed;
unifiedVramSize += metalDeviceUnifiedVramSize;
#endif

Napi::Object result = Napi::Object::New(info.Env());
result.Set("total", Napi::Number::From(info.Env(), total));
result.Set("used", Napi::Number::From(info.Env(), used));
result.Set("unifiedSize", Napi::Number::From(info.Env(), unifiedVramSize));

return result;
}