
Commit 8086c5f

fix: llama.cpp interface breaking change (#10)
* Made changes to adapt to new llama.cpp interface breaking changes
* Now saving the release that the binaries were compiled for as part of the final npm package and download it by default

BREAKING CHANGE: only `.gguf` models are supported from now on
1 parent: 54a1c6f · commit: 8086c5f

File tree: 11 files changed (+111, -63 lines)

.github/workflows/build.yml

Lines changed: 9 additions & 1 deletion
@@ -20,12 +20,17 @@ jobs:
       - name: Generate docs
         run: npm run generate-docs
       - name: Download latest llama.cpp release
-        run: node ./dist/cli/cli.js download --release latest --skipBuild
+        run: node ./dist/cli/cli.js download --release latest --skipBuild --updateBinariesReleaseMetadata
       - name: Upload build artifact
         uses: actions/upload-artifact@v3
         with:
           name: "build"
           path: "dist"
+      - name: Upload binariesGithubRelease.json artifact
+        uses: actions/upload-artifact@v3
+        with:
+          name: "binariesGithubRelease"
+          path: "llama/binariesGithubRelease.json"
       - name: Upload build artifact
         uses: actions/upload-artifact@v3
         with:
@@ -227,6 +232,9 @@ jobs:
           mv artifacts/build dist/
           mv artifacts/docs docs/
 
+          rm -f ./llama/binariesGithubRelease
+          mv artifacts/binariesGithubRelease ./llama/binariesGithubRelease.json
+
           echo "Built binaries:"
           ls llamaBins
       - name: Release

README.md

Lines changed: 3 additions & 3 deletions
@@ -30,7 +30,7 @@ import {LlamaModel, LlamaContext, LlamaChatSession} from "node-llama-cpp";
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 
 const model = new LlamaModel({
-    modelPath: path.join(__dirname, "models", "vicuna-13b-v1.5-16k.ggmlv3.q5_1.bin")
+    modelPath: path.join(__dirname, "models", "codellama-13b.Q3_K_M.gguf")
 });
 const context = new LlamaContext({model});
 const session = new LlamaChatSession({context});
@@ -73,7 +73,7 @@ export class MyCustomChatPromptWrapper extends ChatPromptWrapper {
 }
 
 const model = new LlamaModel({
-    modelPath: path.join(__dirname, "models", "vicuna-13b-v1.5-16k.ggmlv3.q5_1.bin"),
+    modelPath: path.join(__dirname, "models", "codellama-13b.Q3_K_M.gguf"),
     promptWrapper: new MyCustomChatPromptWrapper() // by default, LlamaChatPromptWrapper is used
 })
 const context = new LlamaContext({model});
@@ -103,7 +103,7 @@ import {LlamaModel, LlamaContext, LlamaChatSession} from "node-llama-cpp";
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 
 const model = new LlamaModel({
-    modelPath: path.join(__dirname, "models", "vicuna-13b-v1.5-16k.ggmlv3.q5_1.bin")
+    modelPath: path.join(__dirname, "models", "codellama-13b.Q3_K_M.gguf")
 });
 
 const context = new LlamaContext({model});
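
For reference, the updated README examples combine into this minimal end-to-end sketch; the `codellama-13b.Q3_K_M.gguf` path is just the example model name from the docs and is assumed to exist under a local `models` directory:

    import {fileURLToPath} from "url";
    import path from "path";
    import {LlamaModel, LlamaContext, LlamaChatSession} from "node-llama-cpp";

    const __dirname = path.dirname(fileURLToPath(import.meta.url));

    // Only .gguf model files are supported from this version onwards.
    const model = new LlamaModel({
        modelPath: path.join(__dirname, "models", "codellama-13b.Q3_K_M.gguf")
    });
    const context = new LlamaContext({model});
    const session = new LlamaChatSession({context});

    const answer = await session.prompt("Write a one-line hello world in TypeScript");
    console.log(answer);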

llama/addon.cpp

Lines changed: 42 additions & 39 deletions
@@ -67,6 +67,7 @@ class LLAMAModel : public Napi::ObjectWrap<LLAMAModel> {
       }
     }
 
+    llama_backend_init(false);
     model = llama_load_model_from_file(modelPath.c_str(), params);
 
     if (model == NULL) {
@@ -124,7 +125,18 @@ class LLAMAContext : public Napi::ObjectWrap<LLAMAContext> {
 
     // Decode each token and accumulate the result.
     for (size_t i = 0; i < tokens.ElementLength(); i++) {
-      const char* str = llama_token_to_str(ctx, (llama_token)tokens[i]);
+      // source: https://github.com/ggerganov/llama.cpp/blob/232caf3c1581a6cb023571780ff41dc2d66d1ca0/llama.cpp#L799-L811
+      std::vector<char> result(8, 0);
+      const int n_tokens = llama_token_to_str(ctx, (llama_token)tokens[i], result.data(), result.size());
+      if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_token_to_str(ctx, (llama_token)tokens[i], result.data(), result.size());
+        GGML_ASSERT(check == -n_tokens);
+      } else {
+        result.resize(n_tokens);
+      }
+
+      const char* str = result.data();
       if (str == nullptr) {
         Napi::Error::New(info.Env(), "Invalid token").ThrowAsJavaScriptException();
         return info.Env().Undefined();
@@ -134,6 +146,15 @@ class LLAMAContext : public Napi::ObjectWrap<LLAMAContext> {
 
     return Napi::String::New(info.Env(), ss.str());
   }
+  Napi::Value TokenBos(const Napi::CallbackInfo& info) {
+    return Napi::Number::From(info.Env(), llama_token_bos(ctx));
+  }
+  Napi::Value TokenEos(const Napi::CallbackInfo& info) {
+    return Napi::Number::From(info.Env(), llama_token_eos(ctx));
+  }
+  Napi::Value GetMaxContextSize(const Napi::CallbackInfo& info) {
+    return Napi::Number::From(info.Env(), llama_n_ctx(ctx));
+  }
   Napi::Value Eval(const Napi::CallbackInfo& info);
   static void init(Napi::Object exports) {
     exports.Set("LLAMAContext",
@@ -142,6 +163,9 @@ class LLAMAContext : public Napi::ObjectWrap<LLAMAContext> {
       {
         InstanceMethod("encode", &LLAMAContext::Encode),
         InstanceMethod("decode", &LLAMAContext::Decode),
+        InstanceMethod("tokenBos", &LLAMAContext::TokenBos),
+        InstanceMethod("tokenEos", &LLAMAContext::TokenEos),
+        InstanceMethod("getMaxContextSize", &LLAMAContext::GetMaxContextSize),
         InstanceMethod("eval", &LLAMAContext::Eval),
       }));
   }
@@ -151,7 +175,6 @@ class LLAMAContext : public Napi::ObjectWrap<LLAMAContext> {
 class LLAMAContextEvalWorker : Napi::AsyncWorker, Napi::Promise::Deferred {
   LLAMAContext* ctx;
   std::vector<llama_token> tokens;
-  std::vector<llama_token> restriction;
   llama_token result;
 
   public:
@@ -160,13 +183,6 @@ class LLAMAContextEvalWorker : Napi::AsyncWorker, Napi::Promise::Deferred {
     Napi::Uint32Array tokens = info[0].As<Napi::Uint32Array>();
     this->tokens.reserve(tokens.ElementLength());
     for (size_t i = 0; i < tokens.ElementLength(); i++) { this->tokens.push_back(static_cast<llama_token>(tokens[i])); }
-
-    if (info.Length() > 1 && info[1].IsTypedArray()) {
-      Napi::Uint32Array restriction = info[1].As<Napi::Uint32Array>();
-      this->restriction.reserve(restriction.ElementLength());
-      for (size_t i = 0; i < restriction.ElementLength(); i++) { this->restriction.push_back(static_cast<llama_token>(restriction[i])); }
-      std::sort(this->restriction.begin(), this->restriction.end());
-    }
   }
   ~LLAMAContextEvalWorker() { ctx->Unref(); }
   using Napi::AsyncWorker::Queue;
@@ -175,39 +191,30 @@ class LLAMAContextEvalWorker : Napi::AsyncWorker, Napi::Promise::Deferred {
   protected:
   void Execute() {
     // Perform the evaluation using llama_eval.
-    int r = llama_eval(ctx->ctx, tokens.data(), tokens.size(), llama_get_kv_cache_token_count(ctx->ctx), 6);
+    int r = llama_eval(ctx->ctx, tokens.data(), int(tokens.size()), llama_get_kv_cache_token_count(ctx->ctx), 6);
     if (r != 0) {
       SetError("Eval has failed");
       return;
     }
 
+    llama_token new_token_id = 0;
+
     // Select the best prediction.
-    float* logits = llama_get_logits(ctx->ctx);
-    int n_vocab = llama_n_vocab(ctx->ctx);
-    llama_token re;
-    if (restriction.empty()) {
-      float max = logits[0];
-      re = 0;
-      for (llama_token id = 1; id < n_vocab; id++) {
-        float logit = logits[id];
-        if (logit > max) {
-          max = logit;
-          re = id;
-        }
-      }
-    } else {
-      float max = logits[restriction[0]];
-      re = 0;
-      for (size_t i = 1; i < restriction.size(); i++) {
-        llama_token id = restriction[i];
-        float logit = logits[id];
-        if (logit > max) {
-          max = logit;
-          re = id;
-        }
-      }
+    auto logits = llama_get_logits(ctx->ctx);
+    auto n_vocab = llama_n_vocab(ctx->ctx);
+
+    std::vector<llama_token_data> candidates;
+    candidates.reserve(n_vocab);
+
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+      candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
     }
-    result = re;
+
+    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+    new_token_id = llama_sample_token_greedy(ctx->ctx , &candidates_p);
+
+    result = new_token_id;
   }
   void OnOK() {
     Napi::Env env = Napi::AsyncWorker::Env();
@@ -223,15 +230,11 @@ Napi::Value LLAMAContext::Eval(const Napi::CallbackInfo& info) {
   return worker->Promise();
 }
 
-Napi::Value tokenBos(const Napi::CallbackInfo& info) { return Napi::Number::From(info.Env(), llama_token_bos()); }
-Napi::Value tokenEos(const Napi::CallbackInfo& info) { return Napi::Number::From(info.Env(), llama_token_eos()); }
 Napi::Value systemInfo(const Napi::CallbackInfo& info) { return Napi::String::From(info.Env(), llama_print_system_info()); }
 
 Napi::Object registerCallback(Napi::Env env, Napi::Object exports) {
   llama_backend_init(false);
   exports.DefineProperties({
-    Napi::PropertyDescriptor::Function("tokenBos", tokenBos),
-    Napi::PropertyDescriptor::Function("tokenEos", tokenEos),
     Napi::PropertyDescriptor::Function("systemInfo", systemInfo),
   });
   LLAMAModel::init(exports);
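
In the new llama.cpp interface, `llama_token_bos`, `llama_token_eos` and `llama_n_ctx` are read from a context, so the BOS/EOS accessors moved from module-level exports to instance methods on the native context object. A rough sketch of the surface the addon now exposes to JavaScript; the `NativeLlamaContext` type name is hypothetical and only mirrors the `InstanceMethod` registrations above:

    // Hypothetical shape of the object created by the native LLAMAContext binding.
    type NativeLlamaContext = {
        encode(text: string): Uint32Array,
        decode(tokens: Uint32Array): string,
        tokenBos(): number,
        tokenEos(): number,
        getMaxContextSize(): number,
        eval(tokens: Uint32Array): Promise<number>
    };

    function describeContext(ctx: NativeLlamaContext) {
        // BOS/EOS are now per-context (llama_token_bos(ctx) / llama_token_eos(ctx))
        // instead of module-level functions.
        console.log("BOS token:", ctx.tokenBos());
        console.log("EOS token:", ctx.tokenEos());
        console.log("Max context size:", ctx.getMaxContextSize());
    }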

llama/binariesGithubRelease.json

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+{
+    "release": "latest"
+}

package.json

Lines changed: 2 additions & 3 deletions
@@ -68,15 +68,14 @@
         "node-gyp",
         "prebuilt-binaries",
         "llm",
-        "ggml",
-        "ggmlv3",
+        "gguf",
         "raspberry-pi",
         "self-hosted",
         "local",
         "catai"
     ],
     "author": "Gilad S.",
-    "license": "ISC",
+    "license": "MIT",
     "bugs": {
         "url": "https://github.com/withcatai/node-llama-cpp/issues"
     },

src/chatWrappers/LlamaChatPromptWrapper.ts

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ import {ChatPromptWrapper} from "../ChatPromptWrapper.js";
 // source: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
 export class LlamaChatPromptWrapper extends ChatPromptWrapper {
     public override wrapPrompt(prompt: string, {systemPrompt, promptIndex}: {systemPrompt: string, promptIndex: number}) {
-        if (promptIndex === 0) {
+        if (promptIndex === 0 && systemPrompt != "") {
             return "<s>[INST] <<SYS>>\n" + systemPrompt + "\n<</SYS>>\n\n" + prompt + " [/INST]\n\n";
         } else {
             return "<s>[INST] " + prompt + " [/INST]\n\n";

src/cli/commands/DownloadCommand.ts

Lines changed: 15 additions & 3 deletions
@@ -11,13 +11,15 @@ import {defaultLlamaCppGitHubRepo, defaultLlamaCppRelease, llamaCppDirectory, te
 import {compileLlamaCpp} from "../../utils/compileLLamaCpp.js";
 import withOra from "../../utils/withOra.js";
 import {clearTempFolder} from "../../utils/clearTempFolder.js";
+import {setBinariesGithubRelease} from "../../utils/binariesGithubRelease.js";
 
 type DownloadCommandArgs = {
     repo: string,
     release: "latest" | string,
     arch?: string,
     nodeTarget?: string,
-    skipBuild?: boolean
+    skipBuild?: boolean,
+    updateBinariesReleaseMetadata?: boolean
 };
 
 export const DownloadCommand: CommandModule<object, DownloadCommandArgs> = {
@@ -33,7 +35,7 @@ export const DownloadCommand: CommandModule<object, DownloadCommandArgs> = {
         .option("release", {
             type: "string",
             default: defaultLlamaCppRelease,
-            description: "The tag of the llama.cpp release to download. Can also be set via the NODE_LLAMA_CPP_REPO_RELEASE environment variable"
+            description: "The tag of the llama.cpp release to download. Set to \"latest\" to download the latest release. Can also be set via the NODE_LLAMA_CPP_REPO_RELEASE environment variable"
         })
         .option("arch", {
             type: "string",
@@ -47,12 +49,18 @@ export const DownloadCommand: CommandModule<object, DownloadCommandArgs> = {
             type: "boolean",
             default: false,
             description: "Skip building llama.cpp after downloading it"
+        })
+        .option("updateBinariesReleaseMetadata", {
+            type: "boolean",
+            hidden: true, // this for the CI to use
+            default: false,
+            description: "Update the binariesGithubRelease.json file with the release of llama.cpp that was downloaded"
         });
     },
     handler: DownloadLlamaCppCommand
 };
 
-export async function DownloadLlamaCppCommand({repo, release, arch, nodeTarget, skipBuild}: DownloadCommandArgs) {
+export async function DownloadLlamaCppCommand({repo, release, arch, nodeTarget, skipBuild, updateBinariesReleaseMetadata}: DownloadCommandArgs) {
     const octokit = new Octokit();
     const [githubOwner, githubRepo] = repo.split("/");
 
@@ -147,6 +155,10 @@ export async function DownloadLlamaCppCommand({repo, release, arch, nodeTarget,
         });
     }
 
+    if (updateBinariesReleaseMetadata) {
+        await setBinariesGithubRelease(githubRelease!.data.tag_name);
+    }
+
     console.log();
     console.log();
    console.log(`${chalk.yellow("Repo:")} ${repo}`);
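
In CI this hidden flag is passed by the workflow change above (`download --release latest --skipBuild --updateBinariesReleaseMetadata`), so the tag the binaries were actually built against gets written to `llama/binariesGithubRelease.json` and shipped with the package. A trimmed, illustrative sketch of that tail of the handler; `githubRelease` and `recordDownloadedRelease` are stand-ins, not names from the diff:

    import {setBinariesGithubRelease} from "../../utils/binariesGithubRelease.js";

    // Illustrative only: in DownloadLlamaCppCommand, the release object is the
    // one fetched via Octokit earlier in the handler.
    async function recordDownloadedRelease(
        githubRelease: {data: {tag_name: string}},
        updateBinariesReleaseMetadata?: boolean
    ) {
        if (updateBinariesReleaseMetadata) {
            // Persist the tag so published packages download the same llama.cpp
            // release that the prebuilt binaries were compiled against.
            await setBinariesGithubRelease(githubRelease.data.tag_name);
        }
    }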

src/config.ts

Lines changed: 3 additions & 1 deletion
@@ -3,6 +3,7 @@ import * as path from "path";
 import * as os from "os";
 import envVar from "env-var";
 import * as uuid from "uuid";
+import {getBinariesGithubRelease} from "./utils/binariesGithubRelease.js";
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 
@@ -14,12 +15,13 @@ export const llamaBinsDirectory = path.join(__dirname, "..", "llamaBins");
 export const llamaCppDirectory = path.join(llamaDirectory, "llama.cpp");
 export const tempDownloadDirectory = path.join(os.tmpdir(), "node-llama-cpp", uuid.v4());
 export const usedBinFlagJsonPath = path.join(llamaDirectory, "usedBin.json");
+export const binariesGithubReleasePath = path.join(llamaDirectory, "binariesGithubRelease.json");
 
 export const defaultLlamaCppGitHubRepo = env.get("NODE_LLAMA_CPP_REPO")
     .default("ggerganov/llama.cpp")
     .asString();
 export const defaultLlamaCppRelease = env.get("NODE_LLAMA_CPP_REPO_RELEASE")
-    .default("latest")
+    .default(await getBinariesGithubRelease())
     .asString();
 export const defaultSkipDownload = env.get("NODE_LLAMA_CPP_SKIP_DOWNLOAD")
     .default("false")

src/llamaEvaluator/LlamaContext.ts

Lines changed: 5 additions & 5 deletions
@@ -1,4 +1,4 @@
-import {LLAMAContext, llamaCppNode} from "./LlamaBins.js";
+import {LLAMAContext} from "./LlamaBins.js";
 import {LlamaModel} from "./LlamaModel.js";
 
 export class LlamaContext {
@@ -18,12 +18,12 @@ export class LlamaContext {
         return this._ctx.decode(tokens);
     }
 
-    public async *evaluate(tokens: Uint32Array, getRestrictions?: () => Uint32Array) {
+    public async *evaluate(tokens: Uint32Array) {
         let evalTokens = tokens;
 
         if (this._prependBos) {
             const tokenArray = Array.from(tokens);
-            tokenArray.unshift(llamaCppNode.tokenBos());
+            tokenArray.unshift(this._ctx.tokenBos());
 
             evalTokens = Uint32Array.from(tokenArray);
             this._prependBos = false;
@@ -32,10 +32,10 @@ export class LlamaContext {
         // eslint-disable-next-line no-constant-condition
         while (true) {
             // Evaluate to get the next token.
-            const nextToken = await this._ctx.eval(evalTokens, getRestrictions?.());
+            const nextToken = await this._ctx.eval(evalTokens);
 
             // the assistant finished answering
-            if (nextToken === llamaCppNode.tokenEos())
+            if (nextToken === this._ctx.tokenEos())
                 break;
 
             yield nextToken;
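
With the `getRestrictions` parameter gone, `evaluate()` is now driven purely by the token stream. A small consumption sketch, assuming `LlamaContext` exposes `encode()`/`decode()` wrappers around the native methods as in this file (roughly what `LlamaChatSession` does internally; `maxTokens` is an illustrative cutoff, not an API parameter):

    import {LlamaContext} from "node-llama-cpp";

    async function generate(context: LlamaContext, text: string, maxTokens = 128) {
        const tokens: number[] = [];

        // evaluate() yields one predicted token at a time and stops at EOS.
        for await (const token of context.evaluate(context.encode(text))) {
            tokens.push(token);

            if (tokens.length >= maxTokens)
                break;
        }

        return context.decode(Uint32Array.from(tokens));
    }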

src/utils/binariesGithubRelease.ts

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+import fs from "fs-extra";
+import {binariesGithubReleasePath} from "../config.js";
+
+type BinariesGithubReleaseFile = {
+    release: "latest" | string
+};
+
+export async function getBinariesGithubRelease() {
+    const binariesGithubRelease: BinariesGithubReleaseFile = await fs.readJson(binariesGithubReleasePath);
+
+    return binariesGithubRelease.release;
+}
+
+export async function setBinariesGithubRelease(release: BinariesGithubReleaseFile["release"]) {
+    const binariesGithubReleaseJson: BinariesGithubReleaseFile = {
+        release: release
+    };
+
+    await fs.writeJson(binariesGithubReleasePath, binariesGithubReleaseJson, {
+        spaces: 4
+    });
+}
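
A quick usage sketch of the new helper module (the release tag value here is hypothetical):

    import {getBinariesGithubRelease, setBinariesGithubRelease} from "./utils/binariesGithubRelease.js";

    // Written by the download command when --updateBinariesReleaseMetadata is passed.
    await setBinariesGithubRelease("some-release-tag");

    // Read back by config.ts at import time to pick the default release.
    const release = await getBinariesGithubRelease();
    console.log(release); // "some-release-tag"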
