Commit 76dea80

fix: adapt to llama.cpp breaking changes
1 parent 3b195db commit 76dea80

7 files changed, +48 -20 lines changed


docs/guide/tips-and-tricks.md

Lines changed: 34 additions & 0 deletions
@@ -85,3 +85,37 @@ npx --no node-llama-cpp source download
```

Now, just use `node-llama-cpp` as you normally would.

## Intel AMX {#intel-amx}
> Intel AMX (Advanced Matrix Extensions) is a dedicated hardware block found on Intel Xeon processors
> that helps optimize and accelerate matrix multiplication operations.
>
> It's available on 4th Gen and newer Intel Xeon processors.

Intel AMX can improve CPU inference performance [by 2x and up to even 14x](https://github.com/ggerganov/llama.cpp/pull/7707) on supported CPUs (under specific conditions).

If you're using a 4th Gen or newer Intel Xeon processor,
you might want to [build `llama.cpp` from source](./building-from-source.md) to utilize the hardware-specific optimizations available on your machine.

To do this, run this command inside your project, on the machine you run your project on:
```shell
npx --no node-llama-cpp source download
```

Alternatively, you can force `node-llama-cpp` to not use its prebuilt binaries
and instead build from source when calling [`getLlama`](../api/functions/getLlama.md) for the first time on a Xeon CPU:

```typescript
import os from "os";
import {getLlama} from "node-llama-cpp";

const llama = await getLlama({
    usePrebuiltBinaries: !os.cpus().some((cpu) => (
        cpu.model.toLowerCase().includes("Xeon".toLowerCase())
    ))
});
```
::: info NOTE
Building from source can take some time (when using CUDA, even up to an hour in extreme cases),
so ensure you dedicate some time for it as part of your deployment process.
:::
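
For reference, once `getLlama` resolves (whether it loaded a prebuilt binary or finished building from source), usage continues as in any other `node-llama-cpp` project. A minimal sketch of typical usage follows; the model file name and path are placeholders, so point `modelPath` at any GGUF model you have locally:

```typescript
import path from "path";
import {fileURLToPath} from "url";
import {getLlama, LlamaChatSession} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Resolves to a prebuilt binary or a locally built one, per the options above
const llama = await getLlama();

// Placeholder model path: use any local GGUF model file
const model = await llama.loadModel({
    modelPath: path.join(__dirname, "models", "my-model.Q4_K_M.gguf")
});
const context = await model.createContext();
const session = new LlamaChatSession({
    contextSequence: context.getSequence()
});

const answer = await session.prompt("Hi there, how are you?");
console.log(answer);
```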

llama/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -22,6 +22,12 @@ execute_process(COMMAND node -p "require('node-addon-api').include.slice(1,-1)"
    OUTPUT_VARIABLE NODE_ADDON_API_DIR
    OUTPUT_STRIP_TRAILING_WHITESPACE)

+ set(LLAMA_BUILD_COMMON ON)
+
+ if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+     add_compile_options(-Wno-c++17-extensions)
+ endif()
+
include_directories(${NODE_ADDON_API_DIR} ${CMAKE_JS_INC})

add_subdirectory("llama.cpp")

llama/addon/AddonContext.cpp

Lines changed: 1 addition & 1 deletion
@@ -447,7 +447,7 @@ Napi::Value AddonContext::AddToBatch(const Napi::CallbackInfo& info) {
    GGML_ASSERT(batch.n_tokens + tokensLength <= batch_n_tokens);

    for (size_t i = 0; i < tokensLength; i++) {
-         llama_batch_add(batch, static_cast<llama_token>(tokens[i]), firstTokenContextIndex + i, { sequenceId }, false);
+         common_batch_add(batch, static_cast<llama_token>(tokens[i]), firstTokenContextIndex + i, { sequenceId }, false);
    }

    if (generateLogitAtTheEnd) {

llama/addon/AddonModel.cpp

Lines changed: 4 additions & 4 deletions
@@ -426,7 +426,7 @@ Napi::Value AddonModel::Tokenize(const Napi::CallbackInfo& info) {
    std::string text = info[0].As<Napi::String>().Utf8Value();
    bool specialTokens = info[1].As<Napi::Boolean>().Value();

-     std::vector<llama_token> tokens = llama_tokenize(model, text, false, specialTokens);
+     std::vector<llama_token> tokens = common_tokenize(model, text, false, specialTokens);

    Napi::Uint32Array result = Napi::Uint32Array::New(info.Env(), tokens.size());
    for (size_t i = 0; i < tokens.size(); ++i) {
@@ -539,23 +539,23 @@ Napi::Value AddonModel::PrefixToken(const Napi::CallbackInfo& info) {
        return info.Env().Undefined();
    }

-     return getNapiToken(info, model, llama_token_prefix(model));
+     return getNapiToken(info, model, llama_token_fim_pre(model));
}
Napi::Value AddonModel::MiddleToken(const Napi::CallbackInfo& info) {
    if (disposed) {
        Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException();
        return info.Env().Undefined();
    }

-     return getNapiToken(info, model, llama_token_middle(model));
+     return getNapiToken(info, model, llama_token_fim_mid(model));
}
Napi::Value AddonModel::SuffixToken(const Napi::CallbackInfo& info) {
    if (disposed) {
        Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException();
        return info.Env().Undefined();
    }

-     return getNapiToken(info, model, llama_token_suffix(model));
+     return getNapiToken(info, model, llama_token_fim_suf(model));
}
Napi::Value AddonModel::EotToken(const Napi::CallbackInfo& info) {
    if (disposed) {

llama/addon/AddonSampler.cpp

Lines changed: 0 additions & 13 deletions
@@ -52,11 +52,6 @@ void AddonSampler::dispose() {
        topPSampler = nullptr;
    }

-     if (softmaxSampler != nullptr) {
-         llama_sampler_free(softmaxSampler);
-         softmaxSampler = nullptr;
-     }
-
    if (seedSampler != nullptr) {
        llama_sampler_free(seedSampler);
        seedSampler = nullptr;
@@ -135,10 +130,6 @@ void AddonSampler::rebuildChainIfNeeded() {
        llama_sampler_chain_add(chain, temperatureSampler);
    }

-     if (softmaxSampler != nullptr) {
-         llama_sampler_chain_add(chain, softmaxSampler);
-     }
-
    if (seedSampler != nullptr) {
        llama_sampler_chain_add(chain, seedSampler);
    }
@@ -206,10 +197,6 @@ Napi::Value AddonSampler::ApplyConfig(const Napi::CallbackInfo& info) {
        }
    }

-     if (softmaxSampler == nullptr) {
-         softmaxSampler = llama_sampler_init_softmax();
-     }
-
    if (config.Has("minP")) {
        auto minP = config.Get("minP").As<Napi::Number>().FloatValue();
        if (minP != minPSampler_minP) {

llama/addon/AddonSampler.h

Lines changed: 0 additions & 2 deletions
@@ -25,8 +25,6 @@ class AddonSampler : public Napi::ObjectWrap<AddonSampler> {

    llama_sampler * topPSampler = nullptr;
    float topPSampler_topP = 0.0f; // Top p sampling >=1.0 = disabled
-
-     llama_sampler * softmaxSampler = nullptr;

    llama_sampler * seedSampler = nullptr;
    uint32_t seedSampler_seed = 0;

src/bindings/utils/compileLLamaCpp.ts

Lines changed: 3 additions & 0 deletions
@@ -94,6 +94,9 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
    if (ciMode) {
        if (!cmakeCustomOptions.has("GGML_OPENMP"))
            cmakeCustomOptions.set("GGML_OPENMP", "OFF");
+
+         if (!cmakeCustomOptions.has("GGML_AMX"))
+             cmakeCustomOptions.set("GGML_AMX", "OFF");
    }

    await fs.remove(outDirectory);
