diff --git a/docs/guide/embedding.md b/docs/guide/embedding.md
index cf697e09..fa4f4167 100644
--- a/docs/guide/embedding.md
+++ b/docs/guide/embedding.md
@@ -172,7 +172,7 @@ const documents = [
     "Cleaning the house is a good way to keep it tidy"
 ];
 
-const query = "Tell me a nature geographical fact";
+const query = "Tell me a geographical fact";
 const rankedDocuments = await context.rankAndSort(query, documents);
 
 const topDocument = rankedDocuments[0]!;
@@ -185,7 +185,7 @@ console.log("Ranked documents:", rankedDocuments);
 ```
 > This example will produce this output:
 > ```
-> query: Tell me a nature geographical fact
+> query: Tell me a geographical fact
 > Top document: Mount Everest is the tallest mountain in the world
 > Second document: The capital of France is Paris
 > ```
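The documentation snippet above already shows how `rankAndSort` orders documents by relevance. As a complementary, minimal sketch (not part of the diff), this is how the sorted results might be filtered before building a prompt; the model file name and the score cutoff are placeholder assumptions.

```typescript
// Sketch only — builds on the documented rankAndSort() API shown above.
// The model file name and the 0.001 score cutoff are arbitrary assumptions.
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "bge-reranker-v2-m3-Q8_0.gguf"});
const context = await model.createRankingContext();

const documents = [
    "The capital of France is Paris",
    "Mount Everest is the tallest mountain in the world",
    "I love eating pizza with extra cheese"
];

const rankedDocuments = await context.rankAndSort("Tell me a geographical fact", documents);

// keep only the documents that scored above the (arbitrary) cutoff
const relevantDocuments = rankedDocuments
    .filter(({score}) => score > 0.001)
    .map(({document}) => document);

console.log("Relevant documents:", relevantDocuments);
```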
diff --git a/src/bindings/Llama.ts b/src/bindings/Llama.ts
index 243ad4ff..c4ceb32b 100644
--- a/src/bindings/Llama.ts
+++ b/src/bindings/Llama.ts
@@ -42,6 +42,7 @@ export class Llama {
     /** @internal */ public readonly _debug: boolean;
     /** @internal */ public readonly _threadsSplitter: ThreadsSplitter;
     /** @internal */ private readonly _gpu: LlamaGpuType;
+    /** @internal */ private readonly _numa: LlamaNuma;
     /** @internal */ private readonly _buildType: "localBuild" | "prebuilt";
     /** @internal */ private readonly _cmakeOptions: Readonly>;
     /** @internal */ private readonly _supportsGpuOffloading: boolean;
@@ -95,6 +96,7 @@ export class Llama {
 
         this._bindings = bindings;
         this._debug = debug;
+        this._numa = numa ?? false;
         this._logLevel = this._debug
             ? LlamaLogLevel.debug
             : (logLevel ?? LlamaLogLevel.debug);
@@ -111,7 +113,7 @@ export class Llama {
             bindings.ensureGpuDeviceIsSupported();
 
-        if (numa != null && numa !== false)
+        if (this._numa !== false)
             bindings.setNuma(numa);
 
         this._gpu = bindings.getGpuType() ?? false;
@@ -211,6 +213,13 @@ export class Llama {
         this._threadsSplitter.maxThreads = Math.floor(Math.max(0, value));
     }
 
+    /**
+     * See the `numa` option of `getLlama` for more information
+     */
+    public get numa() {
+        return this._numa;
+    }
+
     public get logLevel() {
         return this._logLevel;
     }
diff --git a/src/bindings/types.ts b/src/bindings/types.ts
index 7748772c..5c6af332 100644
--- a/src/bindings/types.ts
+++ b/src/bindings/types.ts
@@ -22,6 +22,7 @@ export type BuildOptions = {
         release: string
     }
 };
+export const llamaNumaOptions = ["distribute", "isolate", "numactl", "mirror", false] as const satisfies LlamaNuma[];
 export type LlamaNuma = false | "distribute" | "isolate" | "numactl" | "mirror";
 
 export type BuildOptionsJSON = Omit & {
@@ -44,6 +45,20 @@ export function parseNodeLlamaCppGpuOption(option: (typeof nodeLlamaCppGpuOption
     return "auto";
 }
 
+export function parseNumaOption(option: (typeof llamaNumaOptions)[number] | (typeof nodeLlamaCppGpuOffStringOptions)[number]): LlamaNuma {
+    function optionIsGpuOff(opt: typeof option): opt is (typeof nodeLlamaCppGpuOffStringOptions)[number] {
+        return nodeLlamaCppGpuOffStringOptions.includes(opt as (typeof nodeLlamaCppGpuOffStringOptions)[number]);
+    }
+
+    if (optionIsGpuOff(option))
+        return false;
+
+    if (llamaNumaOptions.includes(option))
+        return option;
+
+    return false;
+}
+
 export function convertBuildOptionsJSONToBuildOptions(buildOptionsJSON: BuildOptionsJSON): BuildOptions {
     return {
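A minimal usage sketch for the additions above (not part of the diff): the `numa` option is forwarded from `getLlama` into the `Llama` instance, and the new `numa` getter exposes the policy in effect. `"distribute"` is just one of the values listed in `llamaNumaOptions`.

```typescript
// Sketch only — exercises the `numa` option and the new `llama.numa` getter added above.
import {getLlama} from "node-llama-cpp";

const llama = await getLlama({
    numa: "distribute" // NUMA allocation policy; omitting it (or passing `false`) keeps NUMA disabled
});

console.log(llama.numa); // "distribute"
```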
diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts
index 39f0bd59..486f3f82 100644
--- a/src/cli/commands/ChatCommand.ts
+++ b/src/cli/commands/ChatCommand.ts
@@ -13,7 +13,8 @@ import {getLlama} from "../../bindings/getLlama.js";
 import {LlamaGrammar} from "../../evaluator/LlamaGrammar.js";
 import {LlamaChatSession} from "../../evaluator/LlamaChatSession/LlamaChatSession.js";
 import {
-    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption
+    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaNuma, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption,
+    parseNumaOption
 } from "../../bindings/types.js";
 import withOra from "../../utils/withOra.js";
 import {TokenMeter} from "../../evaluator/TokenMeter.js";
@@ -67,6 +68,7 @@ type ChatCommand = {
     tokenPredictionDraftModel?: string,
     tokenPredictionModelContextSize?: number,
     debug: boolean,
+    numa?: LlamaNuma,
     meter: boolean,
     timing: boolean,
     noMmap: boolean,
@@ -298,6 +300,20 @@ export const ChatCommand: CommandModule = {
                 default: false,
                 description: "Print llama.cpp info and debug logs"
             })
+            .option("numa", {
+                type: "string",
+
+                // yargs types don't support passing `false` as a choice, although it is supported by yargs
+                choices: llamaNumaOptions as any as Exclude[],
+                coerce: (value) => {
+                    if (value == null || value == "")
+                        return false;
+
+                    return parseNumaOption(value);
+                },
+                defaultDescription: "false",
+                description: "NUMA allocation policy. See the `numa` option on the `getLlama` method for more information"
+            })
             .option("meter", {
                 type: "boolean",
                 default: false,
@@ -326,7 +342,7 @@ export const ChatCommand: CommandModule = {
         noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, gpuLayers,
         repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
         maxTokens, reasoningBudget, noHistory,
-        environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
+        environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
     }) {
         try {
             await RunChat({
@@ -335,7 +351,7 @@
                 temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
                 repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions,
                 tokenPredictionDraftModel, tokenPredictionModelContextSize,
-                debug, meter, timing, noMmap, printTimings
+                debug, numa, meter, timing, noMmap, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -352,7 +368,7 @@ async function RunChat({
     jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, minP, topK, topP, seed, gpuLayers,
     lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
     maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel,
-    tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
+    tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
 }: ChatCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -369,11 +385,13 @@ async function RunChat({
         : LlamaLogLevel.warn;
     const llama = gpu == null
         ? await getLlama("lastBuild", {
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
         })
         : await getLlama({
             gpu,
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
         });
     const logBatchSize = batchSize != null;
     const useMmap = !noMmap && llama.supportsMmap;
diff --git a/src/cli/commands/CompleteCommand.ts b/src/cli/commands/CompleteCommand.ts
index 1aae93fd..55cc01b1 100644
--- a/src/cli/commands/CompleteCommand.ts
+++ b/src/cli/commands/CompleteCommand.ts
@@ -7,7 +7,8 @@ import fs from "fs-extra";
 import prettyMilliseconds from "pretty-ms";
 import {getLlama} from "../../bindings/getLlama.js";
 import {
-    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption
+    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaNuma, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption,
+    parseNumaOption
 } from "../../bindings/types.js";
 import {LlamaCompletion} from "../../evaluator/LlamaCompletion.js";
 import withOra from "../../utils/withOra.js";
@@ -49,6 +50,7 @@ type CompleteCommand = {
     tokenPredictionDraftModel?: string,
     tokenPredictionModelContextSize?: number,
     debug: boolean,
+    numa?: LlamaNuma,
     meter: boolean,
     timing: boolean,
     noMmap: boolean,
@@ -218,6 +220,20 @@ export const CompleteCommand: CommandModule = {
                 default: false,
                 description: "Print llama.cpp info and debug logs"
             })
+            .option("numa", {
+                type: "string",
+
+                // yargs types don't support passing `false` as a choice, although it is supported by yargs
+                choices: llamaNumaOptions as any as Exclude[],
+                coerce: (value) => {
+                    if (value == null || value == "")
+                        return false;
+
+                    return parseNumaOption(value);
+                },
+                defaultDescription: "false",
+                description: "NUMA allocation policy. See the `numa` option on the `getLlama` method for more information"
+            })
             .option("meter", {
                 type: "boolean",
                 default: false,
@@ -245,14 +261,14 @@ export const CompleteCommand: CommandModule = {
         flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty,
         lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
         tokenPredictionDraftModel, tokenPredictionModelContextSize,
-        debug, meter, timing, noMmap, printTimings
+        debug, numa, meter, timing, noMmap, printTimings
     }) {
         try {
             await RunCompletion({
                 modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
                 threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty,
                 penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
-                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
+                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -267,7 +283,7 @@ async function RunCompletion({
     modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention,
     swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty,
     penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings
+    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings
 }: CompleteCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -282,11 +298,13 @@ async function RunCompletion({
         : LlamaLogLevel.warn;
     const llama = gpu == null
         ? await getLlama("lastBuild", {
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
         })
         : await getLlama({
             gpu,
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
         });
     const logBatchSize = batchSize != null;
     const useMmap = !noMmap && llama.supportsMmap;
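The `--numa` flag added to the two commands above funnels its raw string value through `parseNumaOption`. A small sketch of that mapping (not part of the diff; the import path is the in-repo one used by the command files, and the sample source of the value is hypothetical):

```typescript
// Sketch only — mirrors what the `coerce` callback of the `--numa` option does with raw CLI input.
import {llamaNumaOptions, parseNumaOption} from "../../bindings/types.js";

console.log(llamaNumaOptions); // ["distribute", "isolate", "numactl", "mirror", false]

// the coerce callback treats a missing or empty value as "NUMA disabled" before parsing
const rawValue: string | undefined = process.env.NUMA_POLICY; // hypothetical source of the flag value
const numa = (rawValue == null || rawValue === "")
    ? false
    : parseNumaOption(rawValue as (typeof llamaNumaOptions)[number]);

console.log(numa); // e.g. "distribute", or `false` when unset or unrecognized
```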
diff --git a/src/cli/commands/InfillCommand.ts b/src/cli/commands/InfillCommand.ts
index a47df068..b07f1e59 100644
--- a/src/cli/commands/InfillCommand.ts
+++ b/src/cli/commands/InfillCommand.ts
@@ -7,7 +7,8 @@ import fs from "fs-extra";
 import prettyMilliseconds from "pretty-ms";
 import {getLlama} from "../../bindings/getLlama.js";
 import {
-    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption
+    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaNuma, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption,
+    parseNumaOption
 } from "../../bindings/types.js";
 import {LlamaCompletion} from "../../evaluator/LlamaCompletion.js";
 import withOra from "../../utils/withOra.js";
@@ -51,6 +52,7 @@ type InfillCommand = {
     tokenPredictionDraftModel?: string,
     tokenPredictionModelContextSize?: number,
     debug: boolean,
+    numa?: LlamaNuma,
     meter: boolean,
     timing: boolean,
     noMmap: boolean,
@@ -228,6 +230,20 @@ export const InfillCommand: CommandModule = {
                 default: false,
                 description: "Print llama.cpp info and debug logs"
             })
+            .option("numa", {
+                type: "string",
+
+                // yargs types don't support passing `false` as a choice, although it is supported by yargs
+                choices: llamaNumaOptions as any as Exclude[],
+                coerce: (value) => {
+                    if (value == null || value == "")
+                        return false;
+
+                    return parseNumaOption(value);
+                },
+                defaultDescription: "false",
+                description: "NUMA allocation policy. See the `numa` option on the `getLlama` method for more information"
+            })
             .option("meter", {
                 type: "boolean",
                 default: false,
@@ -255,14 +271,14 @@ export const InfillCommand: CommandModule = {
         flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty,
         lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
         tokenPredictionDraftModel, tokenPredictionModelContextSize,
-        debug, meter, timing, noMmap, printTimings
+        debug, numa, meter, timing, noMmap, printTimings
     }) {
         try {
             await RunInfill({
                 modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize,
                 flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
                 repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
-                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
+                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -277,7 +293,7 @@ async function RunInfill({
     modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize,
     flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
     repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings
+    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings
 }: InfillCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -292,11 +308,13 @@ async function RunInfill({
         : LlamaLogLevel.warn;
     const llama = gpu == null
         ? await getLlama("lastBuild", {
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
         })
         : await getLlama({
             gpu,
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
         });
     const logBatchSize = batchSize != null;
     const useMmap = !noMmap && llama.supportsMmap;
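All three commands resolve their `Llama` instance the same way, now threading `numa` through both branches. Extracted for clarity as a sketch (not part of the diff; it assumes `BuildGpu` and `LlamaNuma` are re-exported from the package root):

```typescript
// Sketch only — the shared pattern used by RunChat, RunCompletion and RunInfill above.
import {getLlama, LlamaLogLevel, type BuildGpu, type LlamaNuma} from "node-llama-cpp";

async function resolveLlama(gpu: BuildGpu | undefined, logLevel: LlamaLogLevel, numa?: LlamaNuma) {
    return gpu == null
        // no explicit GPU requested: reuse the last local build
        ? await getLlama("lastBuild", {logLevel, numa})
        // otherwise resolve a binary for the requested GPU type
        : await getLlama({gpu, logLevel, numa});
}
```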
diff --git a/test/modelDependent/bgeReranker/rank.test.ts b/test/modelDependent/bgeReranker/rank.test.ts
index b82db45e..37a4bd4e 100644
--- a/test/modelDependent/bgeReranker/rank.test.ts
+++ b/test/modelDependent/bgeReranker/rank.test.ts
@@ -4,7 +4,10 @@ import {getTestLlama} from "../../utils/getTestLlama.js";
 
 describe("bgeReranker", () => {
     describe("rank", () => {
-        test("simple ranking", {timeout: 1000 * 60 * 60 * 2}, async () => {
+        test("simple ranking", {timeout: 1000 * 60 * 60 * 2}, async (test) => {
+            if (process.platform !== "darwin")
+                test.skip(); // the scores are a bit different on different platforms, so skipping on other platforms due to flakiness
+
             const modelPath = await getModelFile("bge-reranker-v2-m3-Q8_0.gguf");
             const llama = await getTestLlama();
 
@@ -28,7 +31,7 @@ describe("bgeReranker", () => {
                 "Cleaning the house is a good way to keep it tidy"
             ];
 
-            const query = "Tell me a nature geographical fact";
+            const query = "Tell me a geographical fact";
 
             const ranks = await Promise.all(
                 documents.map((doc) => rankingContext.rank(query, doc))
             );
@@ -40,24 +43,27 @@ describe("bgeReranker", () => {
             const highestRankDocument = documents[highestRankIndex];
 
             expect(highestRankDocument).to.eql("Mount Everest is the tallest mountain in the world");
 
-            expect(simplifyRanks([highestRank])[0]).toMatchInlineSnapshot("0.0024726231566347743");
+            expect(simplifyRanks([highestRank])[0]).toMatchInlineSnapshot("0.026596993576865856");
             expect(simplifyRanks(ranks)).toMatchInlineSnapshot(`
               [
                 0.00002039908727992137,
                 0.00006772414961977023,
                 0.00003716893710288947,
+                0.004496273160941178,
                 0.00003716893710288947,
+                0.026596993576865856,
                 0.00003716893710288947,
-                0.0024726231566347743,
-                0.00003716893710288947,
-                0.00003716893710288947,
+                0.00002039908727992137,
                 0.00002039908727992137,
                 0.00003716893710288947,
               ]
             `);
         });
 
-        test("rank all", {timeout: 1000 * 60 * 60 * 2}, async () => {
+        test("rank all", {timeout: 1000 * 60 * 60 * 2}, async (test) => {
+            if (process.platform !== "darwin")
+                test.skip(); // the scores are a bit different on different platforms, so skipping on other platforms due to flakiness
+
             const modelPath = await getModelFile("bge-reranker-v2-m3-Q8_0.gguf");
             const llama = await getTestLlama();
 
@@ -81,7 +87,7 @@ describe("bgeReranker", () => {
                 "Cleaning the house is a good way to keep it tidy"
             ];
 
-            const query = "Tell me a nature geographical fact";
+            const query = "Tell me a geographical fact";
 
             const ranks = await rankingContext.rankAll(query, documents);
 
@@ -91,24 +97,27 @@ describe("bgeReranker", () => {
             const highestRankDocument = documents[highestRankIndex];
 
             expect(highestRankDocument).to.eql("Mount Everest is the tallest mountain in the world");
 
-            expect(simplifyRanks([highestRank])[0]).toMatchInlineSnapshot("0.0024726231566347743");
+            expect(simplifyRanks([highestRank])[0]).toMatchInlineSnapshot("0.026596993576865856");
             expect(simplifyRanks(ranks)).toMatchInlineSnapshot(`
               [
                 0.00002039908727992137,
                 0.00006772414961977023,
                 0.00003716893710288947,
+                0.004496273160941178,
                 0.00003716893710288947,
+                0.026596993576865856,
                 0.00003716893710288947,
-                0.0024726231566347743,
-                0.00003716893710288947,
-                0.00003716893710288947,
+                0.00002039908727992137,
                 0.00002039908727992137,
                 0.00003716893710288947,
               ]
             `);
         });
 
-        test("rank and sort", {timeout: 1000 * 60 * 60 * 2}, async () => {
+        test("rank and sort", {timeout: 1000 * 60 * 60 * 2}, async (test) => {
+            if (process.platform !== "darwin")
+                test.skip(); // the scores are a bit different on different platforms, so skipping on other platforms due to flakiness
+
             const modelPath = await getModelFile("bge-reranker-v2-m3-Q8_0.gguf");
             const llama = await getTestLlama();
 
@@ -130,7 +139,7 @@
                 "Cleaning the house is a good way to keep it tidy"
             ];
 
-            const query = "Tell me a nature geographical fact";
+            const query = "Tell me a geographical fact";
 
             const rankedDocuments = await rankingContext.rankAndSort(query, documents);
 
@@ -141,21 +150,25 @@
             expect(simplifySortedRanks([topDocument])[0]).toMatchInlineSnapshot(`
               {
                 "document": "Mount Everest is the tallest mountain in the world",
-                "score": 0.0024726231566347743,
+                "score": 0.026596993576865856,
               }
             `);
             expect(simplifySortedRanks(rankedDocuments)).toMatchInlineSnapshot(`
               [
                 {
                   "document": "Mount Everest is the tallest mountain in the world",
-                  "score": 0.0024726231566347743,
+                  "score": 0.026596993576865856,
+                },
+                {
+                  "document": "The capital of France is Paris",
+                  "score": 0.004496273160941178,
                 },
                 {
                   "document": "I love eating pizza with extra cheese",
                   "score": 0.00006772414961977023,
                 },
                 {
-                  "document": "The capital of France is Paris",
+                  "document": "A warm cup of tea is perfect for a cold winter day",
                   "score": 0.00003716893710288947,
                 },
                 {
@@ -166,10 +179,6 @@
                   "document": "Cleaning the house is a good way to keep it tidy",
                   "score": 0.00003716893710288947,
                 },
-                {
-                  "document": "A warm cup of tea is perfect for a cold winter day",
-                  "score": 0.00003716893710288947,
-                },
                 {
                   "document": "Not all the things that shine are made of gold",
                   "score": 0.00002039908727992137,
@@ -181,6 +190,51 @@
               ]
             `);
         });
+
+        test("rank and sort without scores", {timeout: 1000 * 60 * 60 * 2}, async () => {
+            const modelPath = await getModelFile("bge-reranker-v2-m3-Q8_0.gguf");
+            const llama = await getTestLlama();
+
+            const model = await llama.loadModel({
+                modelPath
+            });
+            const rankingContext = await model.createRankingContext({
+                contextSize: 512
+            });
+
+            const documents = [
+                "The sky is clear and blue today",
+                "I love eating pizza with extra cheese",
+                "Dogs love to play fetch with their owners",
+                "The capital of France is Paris",
+                "Mount Everest is the tallest mountain in the world",
+                "A warm cup of tea is perfect for a cold winter day",
+                "Not all the things that shine are made of gold",
+                "Cleaning the house is a good way to keep it tidy"
+            ];
+
+            const query = "Tell me a geographical fact";
+
+            const rankedDocuments = await rankingContext.rankAndSort(query, documents);
+
+            const topDocument = rankedDocuments[0]!;
+
+            expect(topDocument.document).to.eql("Mount Everest is the tallest mountain in the world");
+
+            expect(onlyDocuments([topDocument])[0]).toMatchInlineSnapshot('"Mount Everest is the tallest mountain in the world"');
+            expect(onlyDocuments(rankedDocuments)).toMatchInlineSnapshot(`
+              [
+                "Mount Everest is the tallest mountain in the world",
+                "The capital of France is Paris",
+                "I love eating pizza with extra cheese",
+                "A warm cup of tea is perfect for a cold winter day",
+                "Dogs love to play fetch with their owners",
+                "Cleaning the house is a good way to keep it tidy",
+                "Not all the things that shine are made of gold",
+                "The sky is clear and blue today",
+              ]
+            `);
+        });
     });
 });
@@ -195,6 +249,10 @@ function simplifySortedRanks
 
+function onlyDocuments(rankedDocuments: readonly {document: string}[]) {
+    return rankedDocuments.map((item) => item.document);
+}
+
 function simplifyScore(score: number) {
     return toSigmoid(parseFloat(roundToPrecision(toLogit(score), 0.6).toFixed(1)));
 }
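The snapshots above are compared through `simplifyScore`, which coarsens scores so small numeric differences between builds don't break the inline snapshots. The helper implementations it relies on are not part of this diff; here is a minimal sketch under the assumption that `toLogit`/`toSigmoid` are the standard logit/sigmoid pair and `roundToPrecision` rounds to the nearest multiple of a step:

```typescript
// Sketch only — assumed implementations of the helpers `simplifyScore` relies on.
function toLogit(probability: number) {
    return Math.log(probability / (1 - probability));
}

function toSigmoid(logit: number) {
    return 1 / (1 + Math.exp(-logit));
}

function roundToPrecision(value: number, precision: number) {
    return Math.round(value / precision) * precision;
}

// copied from the diff above: round in logit space, then map back to a probability-like score
function simplifyScore(score: number) {
    return toSigmoid(parseFloat(roundToPrecision(toLogit(score), 0.6).toFixed(1)));
}

console.log(simplifyScore(0.0265)); // nearby scores collapse to the same simplified value
```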