
Commit 4e1c676

feat: reranking (LlamaRankingContext)

1 parent bb33a5d

12 files changed: +543 −4 lines changed


.vitepress/config/apiReferenceSidebar.ts

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,6 @@
 import {DefaultTheme} from "vitepress";
 /* eslint import/no-unresolved: "off" */
-import typedocSidebar from "../../docs/api/typedoc-sidebar.json"; // if this import fails, run `npm run docs:generateTypedoc`
+import typedocSidebar from "../../docs/api/typedoc-sidebar.json";
 
 const categoryOrder = [
     "Functions",
@@ -28,6 +28,7 @@ const classesOrder = [
     "LlamaCompletion",
     "LlamaEmbeddingContext",
     "LlamaEmbedding",
+    "LlamaRankingContext",
     "LlamaGrammar",
     "LlamaJsonSchemaGrammar",
     "LlamaText",

docs/guide/embedding.md

Lines changed: 53 additions & 0 deletions
@@ -138,6 +138,59 @@ const embedding = await context.getEmbeddingFor(text);
 console.log("Embedding vector:", embedding.vector);
 ```
 
+## Reranking Documents {#reranking}
+After you search for the most similar documents using embedding vectors,
+you can use inference to rerank (sort) the documents based on their relevance to the given query.
+
+Doing this allows you to combine the best of both worlds: the speed of embedding and the quality of inference.
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {getLlama} from "node-llama-cpp";
+
+const __dirname = path.dirname(
+    fileURLToPath(import.meta.url)
+);
+
+const llama = await getLlama();
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "bge-reranker-v2-m3-Q8_0.gguf")
+});
+const context = await model.createRankingContext();
+
+const documents = [
+    "The sky is clear and blue today",
+    "I love eating pizza with extra cheese",
+    "Dogs love to play fetch with their owners",
+    "The capital of France is Paris",
+    "Drinking water is important for staying hydrated",
+    "Mount Everest is the tallest mountain in the world",
+    "A warm cup of tea is perfect for a cold winter day",
+    "Painting is a form of creative expression",
+    "Not all the things that shine are made of gold",
+    "Cleaning the house is a good way to keep it tidy"
+];
+
+const query = "Tell me a geographical fact";
+const rankedDocuments = await context.rankAndSort(query, documents);
+
+const topDocument = rankedDocuments[0]!;
+const secondDocument = rankedDocuments[1]!;
+
+console.log("query:", query);
+console.log("Top document:", topDocument.document);
+console.log("Second document:", secondDocument.document);
+console.log("Ranked documents:", rankedDocuments);
+```
+> This example will produce this output:
+> ```
+> query: Tell me a geographical fact
+> Top document: Mount Everest is the tallest mountain in the world
+> Second document: The capital of France is Paris
+> ```
+> This example uses [bge-reranker-v2-m3-Q8_0.gguf](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF/blob/main/bge-reranker-v2-m3-Q8_0.gguf)
+
 ## Using External Databases
 When you have a large number of documents you want to use with embedding, it's often more efficient to store them with their embedding in an external database and search for the most similar embeddings there.
 
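The new guide section promises "the speed of embedding and the quality of inference", but its example only covers the reranking stage. A minimal sketch of the full two-stage flow it alludes to might look like the following; this is an illustration rather than part of the commit: only `createRankingContext()` and `rankAndSort()` are new here, the retrieval stage reuses the existing `createEmbeddingContext()`/`getEmbeddingFor()` API, the cosine-similarity helper is hand-rolled, and the embedding model file name is a placeholder.

```typescript
import {fileURLToPath} from "url";
import path from "path";
import {getLlama} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const llama = await getLlama();

// Stage 1 model: any embedding-capable GGUF (file name is a placeholder)
const embeddingModel = await llama.loadModel({
    modelPath: path.join(__dirname, "my-embedding-model.gguf")
});
const embeddingContext = await embeddingModel.createEmbeddingContext();

// Stage 2 model: the reranker used in the guide example
const rankerModel = await llama.loadModel({
    modelPath: path.join(__dirname, "bge-reranker-v2-m3-Q8_0.gguf")
});
const rankingContext = await rankerModel.createRankingContext();

const documents = [
    "The capital of France is Paris",
    "Mount Everest is the tallest mountain in the world",
    "I love eating pizza with extra cheese"
    // ...many more documents
];
const query = "Tell me a geographical fact";

// Hand-rolled cosine similarity between two embedding vectors
function cosineSimilarity(a: readonly number[], b: readonly number[]) {
    let dot = 0, normA = 0, normB = 0;
    for (let i = 0; i < a.length; i++) {
        dot += a[i]! * b[i]!;
        normA += a[i]! * a[i]!;
        normB += b[i]! * b[i]!;
    }
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}

// Stage 1: embed the query and all documents, keep only the closest candidates
const queryEmbedding = await embeddingContext.getEmbeddingFor(query);
const candidates: {document: string, similarity: number}[] = [];
for (const document of documents) {
    const documentEmbedding = await embeddingContext.getEmbeddingFor(document);
    candidates.push({
        document,
        similarity: cosineSimilarity(queryEmbedding.vector, documentEmbedding.vector)
    });
}
const shortlist = candidates
    .sort((a, b) => b.similarity - a.similarity)
    .slice(0, 10)
    .map(({document}) => document);

// Stage 2: rerank only the shortlisted documents with the ranking context
const rankedDocuments = await rankingContext.rankAndSort(query, shortlist);
console.log("Best match:", rankedDocuments[0]!.document);
```

Reranking only a shortlist keeps the expensive inference pass bounded, regardless of how many documents are indexed.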

docs/index.md

Lines changed: 1 addition & 0 deletions
@@ -97,6 +97,7 @@ npx -y node-llama-cpp inspect gpu
 * [Remote GGUF reader](./api/functions/readGgufFileInfo.md)
 * [User input safety](./guide/llama-text.md#input-safety-in-node-llama-cpp)
 * [Token prediction](./guide/token-prediction.md)
+* [Reranking](./guide/embedding.md#reranking)
 
 </template>
 <template v-slot:simple-code>

llama/addon/AddonContext.cpp

Lines changed: 4 additions & 0 deletions
@@ -415,6 +415,10 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<Ad
         context_params.embeddings = options.Get("embeddings").As<Napi::Boolean>().Value();
     }
 
+    if (options.Has("ranking") && options.Get("ranking").As<Napi::Boolean>().Value()) {
+        context_params.pooling_type = LLAMA_POOLING_TYPE_RANK;
+    }
+
     if (options.Has("flashAttention")) {
         context_params.flash_attn = options.Get("flashAttention").As<Napi::Boolean>().Value();
     }

src/bindings/AddonTypes.ts

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@ export type BindingModule = {
         flashAttention?: boolean,
         logitsAll?: boolean,
         embeddings?: boolean,
+        ranking?: boolean,
         threads?: number,
         performanceTracking?: boolean
     }): AddonContext

src/evaluator/LlamaContext/LlamaContext.ts

Lines changed: 3 additions & 1 deletion
@@ -80,7 +80,8 @@ export class LlamaContext {
             itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism"
         } = {},
         performanceTracking = false,
-        _embeddings
+        _embeddings,
+        _ranking
     }: LlamaContextOptions & {
         sequences: number,
         contextSize: number,
@@ -121,6 +122,7 @@ export class LlamaContext {
            flashAttention: this._flashAttention,
            threads: this._idealThreads,
            embeddings: _embeddings,
+           ranking: _ranking,
            performanceTracking: this._performanceTracking
        }));
        this._batchingOptions = {

src/evaluator/LlamaContext/types.ts

Lines changed: 7 additions & 1 deletion
@@ -171,7 +171,13 @@ export type LlamaContextOptions = {
      * embedding mode only
      * @internal
      */
-    _embeddings?: boolean
+    _embeddings?: boolean,
+
+    /**
+     * ranking mode
+     * @internal
+     */
+    _ranking?: boolean
 };
 export type LlamaContextSequenceRepeatPenalty = {
     /** Tokens to lower the predication probability of to be the next predicted token */
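Taken together, the native and TypeScript changes above thread a single flag down the stack: the internal `_ranking` option in `LlamaContextOptions` is forwarded by `LlamaContext` to the addon as `ranking: true`, which `AddonContext.cpp` maps to `context_params.pooling_type = LLAMA_POOLING_TYPE_RANK`. How the new `LlamaRankingContext` sets that flag is not visible here (its file is among the 12 changed but not shown in this excerpt), so the following is only an assumed sketch of that wiring:

```typescript
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "bge-reranker-v2-m3-Q8_0.gguf"});

// Assumed wiring inside LlamaRankingContext._create() — illustration only; the real code
// lives in src/evaluator/LlamaRankingContext.ts, which is not included in this excerpt.
const context = await model.createContext({
    // Internal flag from LlamaContextOptions. LlamaContext forwards it to the native addon
    // as `ranking: true`, and AddonContext.cpp maps that to LLAMA_POOLING_TYPE_RANK.
    _ranking: true
});
```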

src/evaluator/LlamaModel/LlamaModel.ts

Lines changed: 11 additions & 0 deletions
@@ -18,6 +18,7 @@ import {LlamaEmbeddingContext, LlamaEmbeddingContextOptions} from "../LlamaEmbed
 import {GgufArchitectureType, GgufMetadata} from "../../gguf/types/GgufMetadataTypes.js";
 import {OverridesObject} from "../../utils/OverridesObject.js";
 import {maxRecentDetokenizerTokens} from "../../consts.js";
+import {LlamaRankingContext, LlamaRankingContextOptions} from "../LlamaRankingContext.js";
 import {TokenAttribute, TokenAttributes} from "./utils/TokenAttributes.js";
 import type {Llama} from "../../bindings/Llama.js";
 import type {BuiltinSpecialTokenValue} from "../../utils/LlamaText.js";
@@ -532,6 +533,16 @@ export class LlamaModel {
         return await LlamaEmbeddingContext._create({_model: this}, options);
     }
 
+    /**
+     * @see [Reranking Documents](https://node-llama-cpp.withcat.ai/guide/embedding#reranking) tutorial
+     */
+    public async createRankingContext(options: LlamaRankingContextOptions = {}) {
+        if (this._vocabOnly)
+            throw new Error("Model is loaded in vocabOnly mode, so no context can be created");
+
+        return await LlamaRankingContext._create({_model: this}, options);
+    }
+
     /**
      * Get warnings about the model file that would affect its usage.
      *
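Like `createEmbeddingContext()`, the new method refuses to run on a model loaded in vocabulary-only mode. A short sketch of that guard in use, assuming `vocabOnly` is the public `loadModel()` option behind the internal `_vocabOnly` check:

```typescript
import {fileURLToPath} from "url";
import path from "path";
import {getLlama} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const llama = await getLlama();

// Loading only the vocabulary is enough for tokenization, but not for ranking
const vocabOnlyModel = await llama.loadModel({
    modelPath: path.join(__dirname, "bge-reranker-v2-m3-Q8_0.gguf"),
    vocabOnly: true // assumption: public option corresponding to the internal _vocabOnly flag
});

try {
    await vocabOnlyModel.createRankingContext();
} catch (error) {
    // "Model is loaded in vocabOnly mode, so no context can be created"
    console.error(error);
}
```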
