
Commit 4cf1fba

feat: get embedding for text (#144)
* feat: get embedding for text
* feat(minor): improve `resolveChatWrapperBasedOnModel` logic
* style: improve GitHub release notes formatting
1 parent 36c779d commit 4cf1fba

10 files changed: +254 additions, -24 deletions

.releaserc.ts

Lines changed: 4 additions & 4 deletions
@@ -14,12 +14,12 @@ const homepageUrlWithoutTrailingSlash = homepageUrl.endsWith("/")
     ? homepageUrl.slice(0, -1)
     : homepageUrl;

-const newFooterTemplate = defaultFooterTemplate + "\n---\n" +
-    `Shipped with \`llama.cpp\` release: [\`${llamaCppRelease.split("`").join("")}\`](https://github.com/${defaultLlamaCppGitHubRepo}/releases/tag/${encodeURIComponent(llamaCppRelease)}) ` +
-    `(to use the latest \`llama.cpp\` release available, run \`npx --no ${cliBinName} download --release latest\`. [learn more](${homepageUrlWithoutTrailingSlash}/guide/building-from-source))\n`;
+const newFooterTemplate = defaultFooterTemplate + "\n---\n\n" +
+    `Shipped with \`llama.cpp\` release [\`${llamaCppRelease.split("`").join("")}\`](https://github.com/${defaultLlamaCppGitHubRepo}/releases/tag/${encodeURIComponent(llamaCppRelease)})\n\n` +
+    `> To use the latest \`llama.cpp\` release available, run \`npx --no ${cliBinName} download --release latest\`. ([learn more](${homepageUrlWithoutTrailingSlash}/guide/building-from-source#downloading-a-newer-release))\n`;

 /**
- * @type {import('semantic-release').GlobalConfig}
+ * @type {import("semantic-release").GlobalConfig}
  */
 export default {
     "branches": [

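For reference, the reworked footer renders roughly like the following in the GitHub release notes. The release tag, repository, CLI binary name, and homepage URL below are placeholder assumptions for illustration; the real values come from the project configuration and are not part of this diff:

    ---

    Shipped with `llama.cpp` release [`b1892`](https://github.com/ggerganov/llama.cpp/releases/tag/b1892)

    > To use the latest `llama.cpp` release available, run `npx --no node-llama-cpp download --release latest`. ([learn more](https://example.com/guide/building-from-source#downloading-a-newer-release))
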
llama/addon.cpp

Lines changed: 21 additions & 5 deletions
@@ -358,7 +358,9 @@ class AddonContext : public Napi::ObjectWrap<AddonContext> {
         if (info.Length() > 1 && info[1].IsObject()) {
             Napi::Object options = info[1].As<Napi::Object>();

-            if (options.Has("seed")) {
+            if (options.Has("noSeed")) {
+                context_params.seed = time(NULL);
+            } else if (options.Has("seed")) {
                 context_params.seed = options.Get("seed").As<Napi::Number>().Uint32Value();
             }

@@ -370,10 +372,6 @@ class AddonContext : public Napi::ObjectWrap<AddonContext> {
                 context_params.n_batch = options.Get("batchSize").As<Napi::Number>().Uint32Value();
             }

-            if (options.Has("logitsAll")) {
-                context_params.logits_all = options.Get("logitsAll").As<Napi::Boolean>().Value();
-            }
-
             if (options.Has("embedding")) {
                 context_params.embedding = options.Get("embedding").As<Napi::Boolean>().Value();
             }
@@ -544,6 +542,23 @@ class AddonContext : public Napi::ObjectWrap<AddonContext> {
         return info.Env().Undefined();
     }

+    Napi::Value GetEmbedding(const Napi::CallbackInfo& info) {
+        if (disposed) {
+            Napi::Error::New(info.Env(), "Context is disposed").ThrowAsJavaScriptException();
+            return info.Env().Undefined();
+        }
+
+        const int n_embd = llama_n_embd(model->model);
+        const auto * embeddings = llama_get_embeddings(ctx);
+
+        Napi::Float64Array result = Napi::Float64Array::New(info.Env(), n_embd);
+        for (size_t i = 0; i < n_embd; ++i) {
+            result[i] = embeddings[i];
+        }
+
+        return result;
+    }
+
     static void init(Napi::Object exports) {
         exports.Set(
             "AddonContext",
@@ -560,6 +575,7 @@ class AddonContext : public Napi::ObjectWrap<AddonContext> {
                 InstanceMethod("decodeBatch", &AddonContext::DecodeBatch),
                 InstanceMethod("sampleToken", &AddonContext::SampleToken),
                 InstanceMethod("acceptGrammarEvaluationStateToken", &AddonContext::AcceptGrammarEvaluationStateToken),
+                InstanceMethod("getEmbedding", &AddonContext::GetEmbedding),
                 InstanceMethod("dispose", &AddonContext::Dispose)
             }
         )
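
On the JavaScript side, the new native method surfaces as getEmbedding(): Float64Array on the AddonContext type (see the src/utils/getBin.ts hunk below). A minimal sketch of consuming it, assuming addonContext stands in for a native context instance that was created with embedding enabled (obtaining such an instance is wrapped by LlamaContext and not shown here):

    // sketch only: `addonContext` is a stand-in for a native AddonContext instance
    declare const addonContext: {getEmbedding(): Float64Array};

    const raw = addonContext.getEmbedding(); // one value per embedding dimension (n_embd)
    const vector = Array.from(raw);          // plain number[], as LlamaEmbeddingContext stores it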

src/chatWrappers/resolveChatWrapperBasedOnModel.ts

Lines changed: 4 additions & 2 deletions
@@ -29,8 +29,8 @@ export function resolveChatWrapperBasedOnModel({
     if (fileType?.toLowerCase() === "gguf") {
         const lowercaseName = name?.toLowerCase();
         const lowercaseSubType = subType?.toLowerCase();
-        const splitLowercaseSubType = lowercaseSubType?.split("-");
-        const firstSplitLowercaseSubType = splitLowercaseSubType?.[0];
+        const splitLowercaseSubType = lowercaseSubType?.split("-") ?? [];
+        const firstSplitLowercaseSubType = splitLowercaseSubType[0];

         if (lowercaseName === "llama")
             return LlamaChatWrapper;
@@ -48,6 +48,8 @@ export function resolveChatWrapperBasedOnModel({
             return AlpacaChatWrapper;
         else if (lowercaseName === "functionary")
             return FunctionaryChatWrapper;
+        else if (lowercaseName === "dolphin" && splitLowercaseSubType.includes("mistral"))
+            return ChatMLChatWrapper;
     }
 }
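
A short sketch of how the new branch behaves for a Dolphin-Mistral model, using the fields that the parseModelFileName test added in this commit produces for "dolphin-2.1-mistral-7b.Q4_K_M.gguf" (imports and any other options the resolver accepts are omitted here):

    // sketch: subType "2.1-mistral" splits into ["2.1", "mistral"], so the new
    // `includes("mistral")` check matches and the ChatML wrapper is returned
    const chatWrapper = resolveChatWrapperBasedOnModel({
        name: "dolphin",
        subType: "2.1-mistral",
        fileType: "gguf"
    });
    // chatWrapper === ChatMLChatWrapper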

src/index.ts

Lines changed: 7 additions & 0 deletions
@@ -5,6 +5,9 @@ import {LlamaJsonSchemaGrammar} from "./llamaEvaluator/LlamaJsonSchemaGrammar.js";
 import {LlamaJsonSchemaValidationError} from "./utils/gbnfJson/utils/validateObjectAgainstGbnfSchema.js";
 import {LlamaGrammarEvaluationState, LlamaGrammarEvaluationStateOptions} from "./llamaEvaluator/LlamaGrammarEvaluationState.js";
 import {LlamaContext, LlamaContextSequence} from "./llamaEvaluator/LlamaContext/LlamaContext.js";
+import {
+    LlamaEmbeddingContext, type LlamaEmbeddingContextOptions, LlamaEmbedding, type LlamaEmbeddingJSON
+} from "./llamaEvaluator/LlamaEmbeddingContext.js";
 import {
     type LlamaContextOptions, type BatchingOptions, type LlamaContextSequenceRepeatPenalty, type CustomBatchingDispatchSchedule,
     type CustomBatchingPrioritizeStrategy, type BatchItem, type PrioritizedBatchItem, type ContextShiftOptions,
@@ -70,6 +73,10 @@ export {
     type ContextTokensDeleteRange,
     type EvaluationPriority,
     type LlamaContextSequenceRepeatPenalty,
+    LlamaEmbeddingContext,
+    type LlamaEmbeddingContextOptions,
+    LlamaEmbedding,
+    type LlamaEmbeddingJSON,
     LlamaChatSession,
     defineChatSessionFunction,
     type LlamaChatSessionOptions,

src/llamaEvaluator/LlamaContext/LlamaContext.ts

Lines changed: 6 additions & 6 deletions
@@ -44,13 +44,13 @@ export class LlamaContext {
         seed = null,
         contextSize = model.trainContextSize,
         batchSize = contextSize,
-        logitsAll,
-        embedding,
         threads = 6,
         batching: {
             dispatchSchedule: batchingDispatchSchedule = "nextTick",
             itemsPrioritizingStrategy: batchingItemsPrioritizingStrategy = "maximumParallelism"
-        } = {}
+        } = {},
+        _embedding,
+        _noSeed
     }: LlamaContextOptions) {
         if (model.disposed)
             throw new DisposedError();
@@ -63,9 +63,9 @@ export class LlamaContext {
             seed: seed != null ? Math.max(-1, Math.floor(seed)) : undefined,
             contextSize: contextSize * this._totalSequences, // each sequence needs its own <contextSize> of cells
             batchSize: this._batchSize,
-            logitsAll,
-            embedding,
-            threads: Math.max(0, Math.floor(threads))
+            threads: Math.max(0, Math.floor(threads)),
+            embedding: _embedding,
+            noSeed: _noSeed
         }));
         this._batchingOptions = {
             dispatchSchedule: batchingDispatchSchedule,

src/llamaEvaluator/LlamaContext/types.ts

Lines changed: 13 additions & 7 deletions
@@ -23,20 +23,26 @@ export type LlamaContextOptions = {
     /** prompt processing batch size */
     batchSize?: number,

-    /** the llama_eval() call computes all logits, not just the last one */
-    logitsAll?: boolean,
-
-    /** embedding mode only */
-    embedding?: boolean
-
     /**
      * number of threads to use to evaluate tokens.
      * set to 0 to use the maximum threads supported by the current machine hardware
      */
     threads?: number,

     /** control the parallel sequences processing behavior */
-    batching?: BatchingOptions
+    batching?: BatchingOptions,
+
+    /**
+     * embedding mode only
+     * @internal
+     */
+    _embedding?: boolean,
+
+    /**
+     * disable the seed generation
+     * @internal
+     */
+    _noSeed?: boolean
 };
 export type LlamaContextSequenceRepeatPenalty = {
     /** Tokens to lower the predication probability of to be the next predicted token */
src/llamaEvaluator/LlamaEmbeddingContext.ts

Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
1+
import {withLock} from "lifecycle-utils";
2+
import {Token} from "../types.js";
3+
import {isLlamaText, LlamaText} from "../utils/LlamaText.js";
4+
import {LlamaModel} from "./LlamaModel.js";
5+
import {LlamaContext, LlamaContextSequence} from "./LlamaContext/LlamaContext.js";
6+
7+
export type LlamaEmbeddingContextOptions = {
8+
model: LlamaModel,
9+
10+
/** text context size */
11+
contextSize?: number,
12+
13+
/** prompt processing batch size */
14+
batchSize?: number,
15+
16+
/**
17+
* number of threads to use to evaluate tokens.
18+
* set to 0 to use the maximum threads supported by the current machine hardware
19+
*/
20+
threads?: number,
21+
};
22+
23+
export class LlamaEmbeddingContext {
24+
/** @internal */ private readonly _llamaContext: LlamaContext;
25+
/** @internal */ private readonly _sequence: LlamaContextSequence;
26+
27+
public constructor({
28+
model,
29+
contextSize = model.trainContextSize,
30+
batchSize = contextSize,
31+
threads = 6
32+
}: LlamaEmbeddingContextOptions) {
33+
const resolvedContextSize = Math.min(contextSize, model.trainContextSize);
34+
const resolvedBatchSize = Math.min(batchSize, resolvedContextSize);
35+
36+
this._llamaContext = new LlamaContext({
37+
model,
38+
contextSize: resolvedContextSize,
39+
batchSize: resolvedBatchSize,
40+
threads,
41+
_embedding: true,
42+
_noSeed: true
43+
});
44+
this._sequence = this._llamaContext.getSequence();
45+
}
46+
47+
public async getEmbeddingFor(input: Token[] | string | LlamaText) {
48+
const resolvedInput = typeof input === "string"
49+
? this._llamaContext.model.tokenize(input)
50+
: isLlamaText(input)
51+
? input.tokenize(this._llamaContext.model.tokenize)
52+
: input;
53+
54+
if (resolvedInput.length > this._llamaContext.contextSize)
55+
throw new Error(
56+
"Input is longer than the context size. " +
57+
"Try to increase the context size or use another model that supports longer contexts."
58+
);
59+
else if (resolvedInput.length === 0)
60+
return new LlamaEmbedding({vector: []});
61+
62+
return await withLock(this, "evaluate", async () => {
63+
await this._sequence.eraseContextTokenRanges([{
64+
start: 0,
65+
end: this._sequence.nextTokenIndex
66+
}]);
67+
68+
await this._sequence.evaluateWithoutGeneratingNewTokens(resolvedInput);
69+
70+
const embedding = this._llamaContext._ctx.getEmbedding();
71+
const embeddingVector = Array.from(embedding);
72+
73+
return new LlamaEmbedding({vector: embeddingVector});
74+
});
75+
}
76+
77+
public dispose() {
78+
return this._llamaContext.dispose();
79+
}
80+
81+
/** @hidden */
82+
public [Symbol.dispose]() {
83+
return this.dispose();
84+
}
85+
86+
public get disposed() {
87+
return this._llamaContext.disposed;
88+
}
89+
}
90+
91+
export type LlamaEmbeddingJSON = {
92+
type: "LlamaEmbedding",
93+
vector: number[]
94+
};
95+
96+
export class LlamaEmbedding {
97+
public readonly vector: number[];
98+
99+
public constructor({vector}: {vector: number[]}) {
100+
this.vector = vector;
101+
}
102+
103+
public toJSON(): LlamaEmbeddingJSON {
104+
return {
105+
type: "LlamaEmbedding",
106+
vector: this.vector
107+
};
108+
}
109+
110+
public static fromJSON(json: LlamaEmbeddingJSON) {
111+
if (json == null || json.type !== "LlamaEmbedding" || !(json.vector instanceof Array) ||
112+
json.vector.some(v => typeof v !== "number")
113+
)
114+
throw new Error("Invalid LlamaEmbedding JSON");
115+
116+
return new LlamaEmbedding({
117+
vector: json.vector
118+
});
119+
}
120+
}
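
A minimal usage sketch of the new embedding API. The import specifier, the model path, and the cosine-similarity helper below are illustrative assumptions and not part of this commit:

    import {LlamaModel, LlamaEmbeddingContext} from "node-llama-cpp"; // package name assumed

    // cosine similarity between two embedding vectors (helper written for this example)
    function cosineSimilarity(a: number[], b: number[]) {
        let dot = 0, normA = 0, normB = 0;
        for (let i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        return dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }

    const model = new LlamaModel({modelPath: "path/to/model.gguf"}); // placeholder path
    const embeddingContext = new LlamaEmbeddingContext({model, contextSize: 4096});

    const helloWorld = await embeddingContext.getEmbeddingFor("Hello world");
    const helloThere = await embeddingContext.getEmbeddingFor("Hello there");

    console.log(cosineSimilarity(helloWorld.vector, helloThere.vector)); // closer to 1 means more similar

    embeddingContext.dispose();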

src/utils/getBin.ts

Lines changed: 2 additions & 0 deletions
@@ -174,6 +174,8 @@ export type AddonContext = {
     shiftSequenceTokenCells(sequenceId: number, startPos: number, endPos: number, shiftDelta: number): void,

     acceptGrammarEvaluationStateToken(grammarEvaluationState: AddonGrammarEvaluationState, token: Token): void,
+
+    getEmbedding(): Float64Array
 };

 export type BatchLogitIndex = number & {
Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
1+
import {describe, expect, test} from "vitest";
2+
import {LlamaEmbeddingContext, LlamaModel} from "../../../src/index.js";
3+
import {getModelFile} from "../../utils/modelFiles.js";
4+
5+
describe("functionary", () => {
6+
describe("embedding", () => {
7+
test("deterministic", async () => {
8+
const modelPath = await getModelFile("functionary-small-v2.2.q4_0.gguf");
9+
10+
const model = new LlamaModel({
11+
modelPath
12+
});
13+
const embeddingContext = new LlamaEmbeddingContext({
14+
model,
15+
contextSize: 4096
16+
});
17+
18+
const helloWorldEmbedding = await embeddingContext.getEmbeddingFor("Hello world");
19+
20+
const helloThereEmbedding = await embeddingContext.getEmbeddingFor("Hello there");
21+
22+
expect(helloWorldEmbedding.vector).to.not.eql(helloThereEmbedding.vector);
23+
24+
const helloWorld2Embedding = await embeddingContext.getEmbeddingFor("Hello world");
25+
26+
expect(helloWorld2Embedding.vector).to.eql(helloWorldEmbedding.vector);
27+
expect(helloWorld2Embedding.vector).to.not.eql(helloThereEmbedding.vector);
28+
29+
console.log(helloWorld2Embedding.vector);
30+
}, {
31+
timeout: 1000 * 60 * 60
32+
});
33+
34+
test("deterministic between runs", async () => {
35+
const modelPath = await getModelFile("functionary-small-v2.2.q4_0.gguf");
36+
37+
const model = new LlamaModel({
38+
modelPath
39+
});
40+
const embeddingContext = new LlamaEmbeddingContext({
41+
model,
42+
contextSize: 4096
43+
});
44+
45+
const helloWorldEmbedding = await embeddingContext.getEmbeddingFor("Hello world");
46+
const helloThereEmbedding = await embeddingContext.getEmbeddingFor("Hello there");
47+
48+
expect(helloWorldEmbedding.vector).to.not.eql(helloThereEmbedding.vector);
49+
50+
embeddingContext.dispose();
51+
52+
const embeddingContext2 = new LlamaEmbeddingContext({
53+
model,
54+
contextSize: 4096
55+
});
56+
57+
const helloWorldEmbedding2 = await embeddingContext2.getEmbeddingFor("Hello world");
58+
const helloThereEmbedding2 = await embeddingContext2.getEmbeddingFor("Hello there");
59+
60+
expect(helloWorldEmbedding2.vector).to.eql(helloWorldEmbedding.vector);
61+
expect(helloThereEmbedding2.vector).to.eql(helloThereEmbedding.vector);
62+
}, {
63+
timeout: 1000 * 60 * 60
64+
});
65+
});
66+
});

test/standalone/parseModelFileName.test.ts

Lines changed: 11 additions & 0 deletions
@@ -70,4 +70,15 @@ describe("parseModelFileName", () => {
                 parameters: "13B"
             });
     });
+
+    test("dolphin-2.1-mistral-7b.Q4_K_M.gguf", () => {
+        expect(parseModelFileName("dolphin-2.1-mistral-7b.Q4_K_M.gguf"))
+            .toEqual({
+                name: "dolphin",
+                subType: "2.1-mistral",
+                quantization: "Q4_K_M",
+                fileType: "gguf",
+                parameters: "7B"
+            });
+    });
 });
