Commit 76dea80

fix: adapt to llama.cpp breaking changes
1 parent 3b195db commit 76dea80

7 files changed, +48 -20 lines changed


docs/guide/tips-and-tricks.md

Lines changed: 34 additions & 0 deletions
@@ -85,3 +85,37 @@ npx --no node-llama-cpp source download
```

Now, just use `node-llama-cpp` as you normally would.

## Intel AMX {#intel-amx}
> Intel AMX (Advanced Matrix Extensions) is a dedicated hardware block found on Intel Xeon processors
> that helps optimize and accelerate matrix multiplication operations.
>
> It's available on 4th Gen and newer Intel Xeon processors.

Intel AMX can improve CPU inference performance [by 2x and up to even 14x](https://github.com/ggerganov/llama.cpp/pull/7707) on supported CPUs (under specific conditions).

If you're using a 4th Gen or newer Intel Xeon processor,
you might want to [build `llama.cpp` from source](./building-from-source.md) to utilize the hardware-specific optimizations available on your machine.

To do this, run this command inside your project, on the machine you run your project on:
```shell
npx --no node-llama-cpp source download
```

Alternatively, you can force `node-llama-cpp` to not use its prebuilt binaries
and instead build from source when calling [`getLlama`](../api/functions/getLlama.md) for the first time on a Xeon CPU:

```typescript
import os from "os";
import {getLlama} from "node-llama-cpp";

const llama = await getLlama({
    usePrebuiltBinaries: !os.cpus().some((cpu) => (
        cpu.model.toLowerCase().includes("Xeon".toLowerCase())
    ))
});
```
::: info NOTE
Building from source can take some time (when using CUDA, even up to an hour in extreme cases),
so ensure you dedicate some time for it as part of your deployment process.
:::
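
For reference, once `getLlama` resolves (whether it loaded a prebuilt binary or finished building from source), usage continues as in any other `node-llama-cpp` project. A minimal sketch of typical usage follows; the model file name and path are placeholders, so point `modelPath` at any GGUF model you have locally:

```typescript
import path from "path";
import {fileURLToPath} from "url";
import {getLlama, LlamaChatSession} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Resolves to a prebuilt binary or a locally built one, per the options above
const llama = await getLlama();

// Placeholder model path: use any local GGUF model file
const model = await llama.loadModel({
    modelPath: path.join(__dirname, "models", "my-model.Q4_K_M.gguf")
});
const context = await model.createContext();
const session = new LlamaChatSession({
    contextSequence: context.getSequence()
});

const answer = await session.prompt("Hi there, how are you?");
console.log(answer);
```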

llama/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -22,6 +22,12 @@ execute_process(COMMAND node -p "require('node-addon-api').include.slice(1,-1)"
    OUTPUT_VARIABLE NODE_ADDON_API_DIR
    OUTPUT_STRIP_TRAILING_WHITESPACE)

+ set(LLAMA_BUILD_COMMON ON)
+
+ if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+     add_compile_options(-Wno-c++17-extensions)
+ endif()
+
include_directories(${NODE_ADDON_API_DIR} ${CMAKE_JS_INC})

add_subdirectory("llama.cpp")

llama/addon/AddonContext.cpp

Lines changed: 1 addition & 1 deletion
@@ -447,7 +447,7 @@ Napi::Value AddonContext::AddToBatch(const Napi::CallbackInfo& info) {
    GGML_ASSERT(batch.n_tokens + tokensLength <= batch_n_tokens);

    for (size_t i = 0; i < tokensLength; i++) {
-         llama_batch_add(batch, static_cast<llama_token>(tokens[i]), firstTokenContextIndex + i, { sequenceId }, false);
+         common_batch_add(batch, static_cast<llama_token>(tokens[i]), firstTokenContextIndex + i, { sequenceId }, false);
    }

    if (generateLogitAtTheEnd) {

llama/addon/AddonModel.cpp

Lines changed: 4 additions & 4 deletions
@@ -426,7 +426,7 @@ Napi::Value AddonModel::Tokenize(const Napi::CallbackInfo& info) {
    std::string text = info[0].As<Napi::String>().Utf8Value();
    bool specialTokens = info[1].As<Napi::Boolean>().Value();

-     std::vector<llama_token> tokens = llama_tokenize(model, text, false, specialTokens);
+     std::vector<llama_token> tokens = common_tokenize(model, text, false, specialTokens);

    Napi::Uint32Array result = Napi::Uint32Array::New(info.Env(), tokens.size());
    for (size_t i = 0; i < tokens.size(); ++i) {
@@ -539,23 +539,23 @@ Napi::Value AddonModel::PrefixToken(const Napi::CallbackInfo& info) {
        return info.Env().Undefined();
    }

-     return getNapiToken(info, model, llama_token_prefix(model));
+     return getNapiToken(info, model, llama_token_fim_pre(model));
}
Napi::Value AddonModel::MiddleToken(const Napi::CallbackInfo& info) {
    if (disposed) {
        Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException();
        return info.Env().Undefined();
    }

-     return getNapiToken(info, model, llama_token_middle(model));
+     return getNapiToken(info, model, llama_token_fim_mid(model));
}
Napi::Value AddonModel::SuffixToken(const Napi::CallbackInfo& info) {
    if (disposed) {
        Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException();
        return info.Env().Undefined();
    }

-     return getNapiToken(info, model, llama_token_suffix(model));
+     return getNapiToken(info, model, llama_token_fim_suf(model));
}
Napi::Value AddonModel::EotToken(const Napi::CallbackInfo& info) {
    if (disposed) {

llama/addon/AddonSampler.cpp

Lines changed: 0 additions & 13 deletions
@@ -52,11 +52,6 @@ void AddonSampler::dispose() {
        topPSampler = nullptr;
    }

-     if (softmaxSampler != nullptr) {
-         llama_sampler_free(softmaxSampler);
-         softmaxSampler = nullptr;
-     }
-
    if (seedSampler != nullptr) {
        llama_sampler_free(seedSampler);
        seedSampler = nullptr;
@@ -135,10 +130,6 @@ void AddonSampler::rebuildChainIfNeeded() {
        llama_sampler_chain_add(chain, temperatureSampler);
    }

-     if (softmaxSampler != nullptr) {
-         llama_sampler_chain_add(chain, softmaxSampler);
-     }
-
    if (seedSampler != nullptr) {
        llama_sampler_chain_add(chain, seedSampler);
    }
@@ -206,10 +197,6 @@ Napi::Value AddonSampler::ApplyConfig(const Napi::CallbackInfo& info) {
        }
    }

-     if (softmaxSampler == nullptr) {
-         softmaxSampler = llama_sampler_init_softmax();
-     }
-
    if (config.Has("minP")) {
        auto minP = config.Get("minP").As<Napi::Number>().FloatValue();
        if (minP != minPSampler_minP) {

llama/addon/AddonSampler.h

Lines changed: 0 additions & 2 deletions
@@ -25,8 +25,6 @@ class AddonSampler : public Napi::ObjectWrap<AddonSampler> {

    llama_sampler * topPSampler = nullptr;
    float topPSampler_topP = 0.0f; // Top p sampling >=1.0 = disabled
-
-     llama_sampler * softmaxSampler = nullptr;

    llama_sampler * seedSampler = nullptr;
    uint32_t seedSampler_seed = 0;

src/bindings/utils/compileLLamaCpp.ts

Lines changed: 3 additions & 0 deletions
@@ -94,6 +94,9 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
    if (ciMode) {
        if (!cmakeCustomOptions.has("GGML_OPENMP"))
            cmakeCustomOptions.set("GGML_OPENMP", "OFF");
+
+         if (!cmakeCustomOptions.has("GGML_AMX"))
+             cmakeCustomOptions.set("GGML_AMX", "OFF");
    }

    await fs.remove(outDirectory);
