4 changes: 2 additions & 2 deletions docs/guide/embedding.md
@@ -172,7 +172,7 @@ const documents = [
"Cleaning the house is a good way to keep it tidy"
];

const query = "Tell me a nature geographical fact";
const query = "Tell me a geographical fact";
const rankedDocuments = await context.rankAndSort(query, documents);

const topDocument = rankedDocuments[0]!;
@@ -185,7 +185,7 @@ console.log("Ranked documents:", rankedDocuments);
```
> This example will produce this output:
> ```
> query: Tell me a nature geographical fact
> query: Tell me a geographical fact
> Top document: Mount Everest is the tallest mountain in the world
> Second document: The capital of France is Paris
> ```
11 changes: 10 additions & 1 deletion src/bindings/Llama.ts
@@ -42,6 +42,7 @@ export class Llama {
/** @internal */ public readonly _debug: boolean;
/** @internal */ public readonly _threadsSplitter: ThreadsSplitter;
/** @internal */ private readonly _gpu: LlamaGpuType;
/** @internal */ private readonly _numa: LlamaNuma;
/** @internal */ private readonly _buildType: "localBuild" | "prebuilt";
/** @internal */ private readonly _cmakeOptions: Readonly<Record<string, string>>;
/** @internal */ private readonly _supportsGpuOffloading: boolean;
@@ -95,6 +96,7 @@ export class Llama {

this._bindings = bindings;
this._debug = debug;
this._numa = numa ?? false;
this._logLevel = this._debug
? LlamaLogLevel.debug
: (logLevel ?? LlamaLogLevel.debug);
@@ -111,7 +113,7 @@

bindings.ensureGpuDeviceIsSupported();

if (numa != null && numa !== false)
if (this._numa !== false)
bindings.setNuma(numa);

this._gpu = bindings.getGpuType() ?? false;
@@ -211,6 +213,13 @@ this._threadsSplitter.maxThreads = Math.floor(Math.max(0, value));
this._threadsSplitter.maxThreads = Math.floor(Math.max(0, value));
}

/**
* See the `numa` option of `getLlama` for more information
*/
public get numa() {
return this._numa;
}

public get logLevel() {
return this._logLevel;
}
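For orientation, here is a minimal sketch of how the new `numa` plumbing surfaces to library users, assuming the package's public `getLlama` export; the `"distribute"` value and the logging are illustrative only:

```ts
import {getLlama} from "node-llama-cpp";

// Request a NUMA allocation policy when loading the bindings.
// Per the diff above, the value is stored on `_numa` and, when it
// isn't `false`, forwarded to `bindings.setNuma(...)`.
const llama = await getLlama({
    numa: "distribute"
});

// The new public getter exposes the resolved policy.
console.log(llama.numa); // "distribute"
```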
15 changes: 15 additions & 0 deletions src/bindings/types.ts
@@ -22,6 +22,7 @@ export type BuildOptions = {
release: string
}
};
export const llamaNumaOptions = ["distribute", "isolate", "numactl", "mirror", false] as const satisfies LlamaNuma[];
export type LlamaNuma = false | "distribute" | "isolate" | "numactl" | "mirror";

export type BuildOptionsJSON = Omit<BuildOptions, "customCmakeOptions"> & {
@@ -44,6 +45,20 @@ export function parseNodeLlamaCppGpuOption(option: (typeof nodeLlamaCppGpuOption
return "auto";
}

export function parseNumaOption(option: (typeof llamaNumaOptions)[number] | (typeof nodeLlamaCppGpuOffStringOptions)[number]): LlamaNuma {
function optionIsGpuOff(opt: typeof option): opt is (typeof nodeLlamaCppGpuOffStringOptions)[number] {
return nodeLlamaCppGpuOffStringOptions.includes(opt as (typeof nodeLlamaCppGpuOffStringOptions)[number]);
}

if (optionIsGpuOff(option))
return false;

if (llamaNumaOptions.includes(option))
return option;

return false;
}


export function convertBuildOptionsJSONToBuildOptions(buildOptionsJSON: BuildOptionsJSON): BuildOptions {
return {
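To make the parsing rules above concrete, here is a standalone sketch that mirrors `parseNumaOption` rather than importing it; the `gpuOffStrings` list stands in for `nodeLlamaCppGpuOffStringOptions` and its values are assumed for illustration only:

```ts
type LlamaNuma = false | "distribute" | "isolate" | "numactl" | "mirror";

const llamaNumaOptions = ["distribute", "isolate", "numactl", "mirror", false] as const;

// Stand-in for `nodeLlamaCppGpuOffStringOptions` (assumed values)
const gpuOffStrings = ["false", "off"] as const;

function parseNumaOptionSketch(option: string | false): LlamaNuma {
    // "off"-style strings resolve to `false`
    if (typeof option === "string" && (gpuOffStrings as readonly string[]).includes(option))
        return false;

    // valid NUMA policies pass through unchanged
    if ((llamaNumaOptions as readonly (string | false)[]).includes(option))
        return option as LlamaNuma;

    // anything else falls back to `false`
    return false;
}

console.log(parseNumaOptionSketch("distribute")); // "distribute"
console.log(parseNumaOptionSketch("off"));        // false
console.log(parseNumaOptionSketch("mmap"));       // false
```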
30 changes: 24 additions & 6 deletions src/cli/commands/ChatCommand.ts
@@ -13,7 +13,8 @@ import {getLlama} from "../../bindings/getLlama.js";
import {LlamaGrammar} from "../../evaluator/LlamaGrammar.js";
import {LlamaChatSession} from "../../evaluator/LlamaChatSession/LlamaChatSession.js";
import {
BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption
BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaNuma, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption,
parseNumaOption
} from "../../bindings/types.js";
import withOra from "../../utils/withOra.js";
import {TokenMeter} from "../../evaluator/TokenMeter.js";
@@ -67,6 +68,7 @@ type ChatCommand = {
tokenPredictionDraftModel?: string,
tokenPredictionModelContextSize?: number,
debug: boolean,
numa?: LlamaNuma,
meter: boolean,
timing: boolean,
noMmap: boolean,
@@ -298,6 +300,20 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
default: false,
description: "Print llama.cpp info and debug logs"
})
.option("numa", {
type: "string",

// yargs types don't support passing `false` as a choice, although it is supported by yargs
choices: llamaNumaOptions as any as Exclude<typeof llamaNumaOptions[number], false>[],
coerce: (value) => {
if (value == null || value == "")
return false;

return parseNumaOption(value);
},
defaultDescription: "false",
description: "NUMA allocation policy. See the `numa` option on the `getLlama` method for more information"
})
.option("meter", {
type: "boolean",
default: false,
@@ -326,7 +342,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory,
environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
}) {
try {
await RunChat({
Expand All @@ -335,7 +351,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
temperature, minP, topK, topP, seed,
gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize,
debug, meter, timing, noMmap, printTimings
debug, numa, meter, timing, noMmap, printTimings
});
} catch (err) {
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -352,7 +368,7 @@ async function RunChat({
jsonSchemaGrammarFile: jsonSchemaGrammarFilePath,
threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel,
tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
}: ChatCommand) {
if (contextSize === -1) contextSize = undefined;
if (gpuLayers === -1) gpuLayers = undefined;
@@ -369,11 +385,13 @@ async function RunChat({
: LlamaLogLevel.warn;
const llama = gpu == null
? await getLlama("lastBuild", {
logLevel: llamaLogLevel
logLevel: llamaLogLevel,
numa
})
: await getLlama({
gpu,
logLevel: llamaLogLevel
logLevel: llamaLogLevel,
numa
});
const logBatchSize = batchSize != null;
const useMmap = !noMmap && llama.supportsMmap;
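The new `--numa` flag (here and in the two commands below) relies on the workaround noted in the inline comment: `false` cannot be listed in the typed `choices`, so `coerce` restores it at runtime. A minimal, standalone sketch of that pattern under those assumptions (not the actual command definition; option values as above):

```ts
import yargs from "yargs";
import {hideBin} from "yargs/helpers";

const numaChoices = ["distribute", "isolate", "numactl", "mirror"] as const;

const argv = await yargs(hideBin(process.argv))
    .option("numa", {
        type: "string",
        // `false` is left out of `choices` to satisfy the yargs typings...
        choices: numaChoices as unknown as string[],
        // ...and an empty value is coerced back to `false` at runtime
        coerce: (value?: string) => (value == null || value === "")
            ? false
            : value,
        defaultDescription: "false",
        description: "NUMA allocation policy"
    })
    .parseAsync();

// e.g. `--numa distribute` → "distribute"; `--numa ""` → false
console.log(argv.numa);
```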
30 changes: 24 additions & 6 deletions src/cli/commands/CompleteCommand.ts
@@ -7,7 +7,8 @@ import fs from "fs-extra";
import prettyMilliseconds from "pretty-ms";
import {getLlama} from "../../bindings/getLlama.js";
import {
BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption
BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaNuma, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption,
parseNumaOption
} from "../../bindings/types.js";
import {LlamaCompletion} from "../../evaluator/LlamaCompletion.js";
import withOra from "../../utils/withOra.js";
@@ -49,6 +50,7 @@ type CompleteCommand = {
tokenPredictionDraftModel?: string,
tokenPredictionModelContextSize?: number,
debug: boolean,
numa?: LlamaNuma,
meter: boolean,
timing: boolean,
noMmap: boolean,
@@ -218,6 +220,20 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
default: false,
description: "Print llama.cpp info and debug logs"
})
.option("numa", {
type: "string",

// yargs types don't support passing `false` as a choice, although it is supported by yargs
choices: llamaNumaOptions as any as Exclude<typeof llamaNumaOptions[number], false>[],
coerce: (value) => {
if (value == null || value == "")
return false;

return parseNumaOption(value);
},
defaultDescription: "false",
description: "NUMA allocation policy. See the `numa` option on the `getLlama` method for more information"
})
.option("meter", {
type: "boolean",
default: false,
@@ -245,14 +261,14 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
flashAttention, swaFullCache, threads, temperature, minP, topK,
topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
debug, meter, timing, noMmap, printTimings
debug, numa, meter, timing, noMmap, printTimings
}) {
try {
await RunCompletion({
modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
});
} catch (err) {
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -267,7 +283,7 @@ async function RunCompletion({
modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
threads, temperature, minP, topK, topP, seed, gpuLayers,
lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings
tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings
}: CompleteCommand) {
if (contextSize === -1) contextSize = undefined;
if (gpuLayers === -1) gpuLayers = undefined;
@@ -282,11 +298,13 @@ async function RunCompletion({
: LlamaLogLevel.warn;
const llama = gpu == null
? await getLlama("lastBuild", {
logLevel: llamaLogLevel
logLevel: llamaLogLevel,
numa
})
: await getLlama({
gpu,
logLevel: llamaLogLevel
logLevel: llamaLogLevel,
numa
});
const logBatchSize = batchSize != null;
const useMmap = !noMmap && llama.supportsMmap;
30 changes: 24 additions & 6 deletions src/cli/commands/InfillCommand.ts
@@ -7,7 +7,8 @@ import fs from "fs-extra";
import prettyMilliseconds from "pretty-ms";
import {getLlama} from "../../bindings/getLlama.js";
import {
BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption
BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaNuma, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption,
parseNumaOption
} from "../../bindings/types.js";
import {LlamaCompletion} from "../../evaluator/LlamaCompletion.js";
import withOra from "../../utils/withOra.js";
@@ -51,6 +52,7 @@ type InfillCommand = {
tokenPredictionDraftModel?: string,
tokenPredictionModelContextSize?: number,
debug: boolean,
numa?: LlamaNuma,
meter: boolean,
timing: boolean,
noMmap: boolean,
@@ -228,6 +230,20 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
default: false,
description: "Print llama.cpp info and debug logs"
})
.option("numa", {
type: "string",

// yargs types don't support passing `false` as a choice, although it is supported by yargs
choices: llamaNumaOptions as any as Exclude<typeof llamaNumaOptions[number], false>[],
coerce: (value) => {
if (value == null || value == "")
return false;

return parseNumaOption(value);
},
defaultDescription: "false",
description: "NUMA allocation policy. See the `numa` option on the `getLlama` method for more information"
})
.option("meter", {
type: "boolean",
default: false,
@@ -255,14 +271,14 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
flashAttention, swaFullCache, threads, temperature, minP, topK,
topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
debug, meter, timing, noMmap, printTimings
debug, numa, meter, timing, noMmap, printTimings
}) {
try {
await RunInfill({
modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
});
} catch (err) {
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -277,7 +293,7 @@ async function RunInfill({
modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers,
lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings
tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings
}: InfillCommand) {
if (contextSize === -1) contextSize = undefined;
if (gpuLayers === -1) gpuLayers = undefined;
@@ -292,11 +308,13 @@ async function RunInfill({
: LlamaLogLevel.warn;
const llama = gpu == null
? await getLlama("lastBuild", {
logLevel: llamaLogLevel
logLevel: llamaLogLevel,
numa
})
: await getLlama({
gpu,
logLevel: llamaLogLevel
logLevel: llamaLogLevel,
numa
});
const logBatchSize = batchSize != null;
const useMmap = !noMmap && llama.supportsMmap;