7 changes: 6 additions & 1 deletion src/bindings/utils/compileLLamaCpp.ts
@@ -131,9 +131,14 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
if (!cmakeCustomOptions.has("GGML_CCACHE"))
cmakeCustomOptions.set("GGML_CCACHE", "OFF");

if (!cmakeCustomOptions.has("LLAMA_CURL"))
if (!cmakeCustomOptions.has("LLAMA_CURL") || isCmakeValueOff(cmakeCustomOptions.get("LLAMA_CURL"))) {
cmakeCustomOptions.set("LLAMA_CURL", "OFF");

// avoid linking to extra libraries that we don't use
if (!cmakeCustomOptions.has("LLAMA_OPENSSL"))
cmakeCustomOptions.set("LLAMA_OPENSSL", "OFF");
}

if (buildOptions.platform === "win" && buildOptions.arch === "arm64" && !cmakeCustomOptions.has("GGML_OPENMP"))
cmakeCustomOptions.set("GGML_OPENMP", "OFF");

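Note: the new `LLAMA_CURL` branch above relies on an `isCmakeValueOff` helper that is not part of this diff. A minimal sketch of what such a check might look like (hypothetical; the actual helper in the repository may differ):

```typescript
// Hypothetical sketch of an isCmakeValueOff helper (not the actual implementation).
// Treats common CMake "false" spellings, case-insensitively, as "off".
function isCmakeValueOff(value: string | undefined): boolean {
    if (value == null)
        return false;

    const normalizedValue = value.trim().toLowerCase();
    return ["off", "0", "false", "no", "n", "ignore", ""].includes(normalizedValue) ||
        normalizedValue.endsWith("-notfound");
}
```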
116 changes: 77 additions & 39 deletions src/evaluator/LlamaRankingContext.ts
@@ -1,20 +1,21 @@
import {AsyncDisposeAggregator, EventRelay, withLock} from "lifecycle-utils";
import {AsyncDisposeAggregator, EventRelay, splitText, withLock} from "lifecycle-utils";
import {Token} from "../types.js";
import {LlamaText} from "../utils/LlamaText.js";
import {tokenizeInput} from "../utils/tokenizeInput.js";
import {resolveBeginningTokenToPrepend, resolveEndTokenToAppend} from "../utils/tokenizerUtils.js";
import {isRankingTemplateValid, parseRankingTemplate} from "../gguf/insights/GgufInsights.js";
import type {LlamaModel} from "./LlamaModel/LlamaModel.js";
import type {LlamaContext, LlamaContextSequence} from "./LlamaContext/LlamaContext.js";
import type {GgufTensorInfo} from "../gguf/types/GgufTensorInfoTypes.js";

export type LlamaRankingContextOptions = {
/**
* The number of tokens the model can see at once.
* - **`"auto"`** - adapt to the current VRAM state and attemp to set the context size as high as possible up to the size
* - **`"auto"`** - adapt to the current VRAM state and attempt to set the context size as high as possible up to the size
* the model was trained on.
* - **`number`** - set the context size to a specific number of tokens.
* If there's not enough VRAM, an error will be thrown.
* Use with caution.
* - **`{min?: number, max?: number}`** - adapt to the current VRAM state and attemp to set the context size as high as possible
* - **`{min?: number, max?: number}`** - adapt to the current VRAM state and attempt to set the context size as high as possible
* up to the size the model was trained on, but at least `min` and at most `max`.
*
* Defaults to `"auto"`.
@@ -36,6 +37,22 @@ export type LlamaRankingContextOptions = {
/** An abort signal to abort the context creation */
createSignal?: AbortSignal,

/**
* The template to use for the ranking evaluation.
* If not provided, the model's template will be used by default.
*
* The template is tokenized with special tokens enabled, but the provided query and document are not.
*
* **<span v-pre>`{{query}}`</span>** is replaced with the query content.
*
* **<span v-pre>`{{document}}`</span>** is replaced with the document content.
*
* It's recommended to not set this option unless you know what you're doing.
*
* Defaults to the model's template.
*/
template?: `${string}{{query}}${string}{{document}}${string}` | `${string}{{document}}${string}{{query}}${string}`,

/**
* Ignore insufficient memory errors and continue with the context creation.
* Can cause the process to crash if there's not enough VRAM for the new context.
@@ -50,17 +67,21 @@
*/
export class LlamaRankingContext {
/** @internal */ private readonly _llamaContext: LlamaContext;
/** @internal */ private readonly _template: string | undefined;
/** @internal */ private readonly _sequence: LlamaContextSequence;
/** @internal */ private readonly _disposeAggregator = new AsyncDisposeAggregator();

public readonly onDispose = new EventRelay<void>();

private constructor({
_llamaContext
_llamaContext,
_template
}: {
_llamaContext: LlamaContext
_llamaContext: LlamaContext,
_template: string | undefined
}) {
this._llamaContext = _llamaContext;
this._template = _template;
this._sequence = this._llamaContext.getSequence();

this._disposeAggregator.add(
@@ -81,9 +102,6 @@ export class LlamaRankingContext {
* @returns a ranking score between 0 and 1 representing the probability that the document is relevant to the query.
*/
public async rank(query: Token[] | string | LlamaText, document: Token[] | string | LlamaText) {
if (this.model.tokens.bos == null || this.model.tokens.eos == null || this.model.tokens.sep == null)
throw new Error("Computing rankings is not supported for this model.");

const resolvedInput = this._getEvaluationInput(query, document);

if (resolvedInput.length > this._llamaContext.contextSize)
@@ -159,7 +177,35 @@

/** @internal */
private _getEvaluationInput(query: Token[] | string | LlamaText, document: Token[] | string | LlamaText) {
if (this.model.tokens.bos == null || this.model.tokens.eos == null || this.model.tokens.sep == null)
if (this._template != null) {
const resolvedInput = splitText(this._template, ["{{query}}", "{{document}}"])
.flatMap((item) => {
if (typeof item === "string")
return this._llamaContext.model.tokenize(item, true, "trimLeadingSpace");
else if (item.separator === "{{query}}")
return tokenizeInput(query, this._llamaContext.model.tokenizer, "trimLeadingSpace", false);
else if (item.separator === "{{document}}")
return tokenizeInput(document, this._llamaContext.model.tokenizer, "trimLeadingSpace", false);
else
void (item satisfies never);

void (item satisfies never);
return [];
});

const beginningTokens = resolveBeginningTokenToPrepend(this.model.vocabularyType, this.model.tokens);
const endToken = resolveEndTokenToAppend(this.model.vocabularyType, this.model.tokens);

if (beginningTokens != null && resolvedInput.at(0) !== beginningTokens)
resolvedInput.unshift(beginningTokens);

if (endToken != null && resolvedInput.at(-1) !== endToken)
resolvedInput.push(endToken);

return resolvedInput;
}

if (this.model.tokens.eos == null && this.model.tokens.sep == null)
throw new Error("Computing rankings is not supported for this model.");

const resolvedQuery = tokenizeInput(query, this._llamaContext.model.tokenizer, "trimLeadingSpace", false);
@@ -169,12 +215,12 @@ export class LlamaRankingContext {
return [];

const resolvedInput = [
this.model.tokens.bos,
...(this.model.tokens.bos == null ? [] : [this.model.tokens.bos]),
...resolvedQuery,
this.model.tokens.eos,
this.model.tokens.sep,
...(this.model.tokens.eos == null ? [] : [this.model.tokens.eos]),
...(this.model.tokens.sep == null ? [] : [this.model.tokens.sep]),
...resolvedDocument,
this.model.tokens.eos
...(this.model.tokens.eos == null ? [] : [this.model.tokens.eos])
];

return resolvedInput;
@@ -218,24 +264,27 @@
batchSize,
threads = 6,
createSignal,
template,
ignoreMemorySafetyChecks
}: LlamaRankingContextOptions) {
const tensorInfo = _model.fileInfo.tensorInfo;

if (_model.tokens.bos == null || _model.tokens.eos == null || _model.tokens.sep == null)
throw new Error("Computing rankings is not supported for this model.");

// source: `append_pooling` in `llama.cpp`
if (findLayer(tensorInfo, "cls", "weight") == null || findLayer(tensorInfo, "cls", "bias") == null)
throw new Error("Computing rankings is not supported for this model.");

// source: `append_pooling` in `llama.cpp`
if (findLayer(tensorInfo, "cls.output", "weight") != null && findLayer(tensorInfo, "cls.output", "bias") == null)
throw new Error("Computing rankings is not supported for this model.");
const resolvedTemplate = template ?? parseRankingTemplate(_model.fileInfo.metadata?.tokenizer?.["chat_template.rerank"]);

if (_model.tokens.eos == null && _model.tokens.sep == null) {
if (!isRankingTemplateValid(resolvedTemplate)) {
if (resolvedTemplate === _model.fileInfo.metadata?.tokenizer?.["chat_template.rerank"])
throw new Error("The model's builtin template is invalid. It must contain both {query} and {document} placeholders.");
else
throw new Error("The provided template is invalid. It must contain both {{query}} and {{document}} placeholders.");
} else if (resolvedTemplate == null)
throw new Error("Computing rankings is not supported for this model.");
}

if (_model.fileInsights.hasEncoder && _model.fileInsights.hasDecoder)
throw new Error("Computing rankings is not supported for encoder-decoder models.");

if (!_model.fileInsights.supportsRanking)
throw new Error("Computing rankings is not supported for this model.");

const llamaContext = await _model.createContext({
contextSize,
batchSize,
@@ -247,23 +296,12 @@
});

return new LlamaRankingContext({
_llamaContext: llamaContext
_llamaContext: llamaContext,
_template: resolvedTemplate
});
}
}

function findLayer(tensorInfo: GgufTensorInfo[] | undefined, name: string, suffix: string) {
if (tensorInfo == null)
return undefined;

for (const tensor of tensorInfo) {
if (tensor.name === name + "." + suffix)
return tensor;
}

return undefined;
}

function logitToSigmoid(logit: number) {
return 1 / (1 + Math.exp(-logit));
}
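For context, here is a usage sketch of the new `template` option (the model path and the template string below are placeholders, not values from this PR; the option is only needed when the model's built-in rerank template is missing or unsuitable):

```typescript
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "path/to/reranker-model.gguf"});

// The template itself is tokenized with special tokens enabled;
// {{query}} and {{document}} are replaced with the plain query/document tokens.
const rankingContext = await model.createRankingContext({
    template: "Query: {{query}}\nDocument: {{document}}"
});

const score = await rankingContext.rank(
    "What is the capital of France?",
    "Paris is the capital and most populous city of France."
);
console.log(score); // a value between 0 and 1
```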
46 changes: 45 additions & 1 deletion src/gguf/insights/GgufInsights.ts
@@ -6,6 +6,7 @@ import {GgufTensorInfo} from "../types/GgufTensorInfoTypes.js";
import {GgufArchitectureType} from "../types/GgufMetadataTypes.js";
import {getReadablePath} from "../../cli/utils/getReadablePath.js";
import {GgufInsightsConfigurationResolver} from "./GgufInsightsConfigurationResolver.js";
import {GgufInsightsTokens} from "./GgufInsightsTokens.js";

export type GgufInsightsResourceRequirements = {
cpuRam: number,
@@ -16,15 +17,18 @@ export class GgufInsights {
/** @internal */ public readonly _llama: Llama;
/** @internal */ private readonly _modelSize: number;
/** @internal */ private _totalFileLayers: number | null = null;
/** @internal */ private readonly _ggufFileInfo: GgufFileInfo;
/** @internal */ private _supportsRanking?: boolean;
/** @internal */ public readonly _ggufFileInfo: GgufFileInfo;
/** @internal */ private readonly _configurationResolver: GgufInsightsConfigurationResolver;
/** @internal */ private readonly _tokens: GgufInsightsTokens;

private constructor(ggufFileInfo: GgufFileInfo, llama: Llama) {
this._llama = llama;
this._ggufFileInfo = ggufFileInfo;

this._modelSize = calculateTensorsSize(ggufFileInfo.fullTensorInfo ?? [], llama, true, true);
this._configurationResolver = GgufInsightsConfigurationResolver._create(this);
this._tokens = GgufInsightsTokens._create(this);
}

/**
@@ -60,6 +64,10 @@ export class GgufInsights {
return this._configurationResolver;
}

public get tokens() {
return this._tokens;
}

/** The context size the model was trained on */
public get trainContextSize() {
return this._ggufFileInfo.architectureMetadata.context_length;
@@ -132,6 +140,29 @@
return false;
}

public get supportsRanking() {
if (this._supportsRanking != null)
return this._supportsRanking;

const layers = this._ggufFileInfo.fullTensorInfo ?? [];
for (let i = layers.length - 1; i >= 0; i--) {
const tensor = layers[i];
if (tensor == null)
continue;

if (tensor.name === "cls.weight" || tensor.name === "cls.output.weight") {
this._supportsRanking = this.tokens.sepToken != null || this.tokens.eosToken != null ||
isRankingTemplateValid(parseRankingTemplate(this._ggufFileInfo.metadata?.tokenizer?.["chat_template.rerank"]));
this._supportsRanking &&= !(this.hasEncoder && this.hasDecoder); // encoder-decoder models are not supported

return this._supportsRanking;
}
}

this._supportsRanking = false;
return this._supportsRanking;
}

/**
* The size of the SWA (Sliding Window Attention).
*
@@ -787,3 +818,16 @@ function getSwaPatternForArchitecture(architecture?: GgufArchitectureType): numb

return 1;
}

export function parseRankingTemplate(template: string | undefined | null): string | undefined {
if (template == null)
return undefined;

return template
.replaceAll("{query}", "{{query}}")
.replaceAll("{document}", "{{document}}");
}

export function isRankingTemplateValid(template: string | undefined | null): boolean {
return template != null && template.includes("{{query}}") && template.includes("{{document}}");
}
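As a quick illustration of the two new helpers (the template strings below are illustrative, not from any real model): `parseRankingTemplate` rewrites the single-brace placeholders used by the GGUF `tokenizer.chat_template.rerank` metadata into the double-brace form used by `LlamaRankingContext`, and `isRankingTemplateValid` verifies that both placeholders are present:

```typescript
import {parseRankingTemplate, isRankingTemplateValid} from "./GgufInsights.js";

// Rewrites {query}/{document} into {{query}}/{{document}}
const parsed = parseRankingTemplate("<query>{query}</query><document>{document}</document>");
// parsed === "<query>{{query}}</query><document>{{document}}</document>"

console.log(isRankingTemplateValid(parsed));            // true
console.log(isRankingTemplateValid("{{query}} only"));  // false - no {{document}} placeholder
console.log(isRankingTemplateValid(undefined));         // false
```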
51 changes: 51 additions & 0 deletions src/gguf/insights/GgufInsightsTokens.ts
@@ -0,0 +1,51 @@
/* eslint @stylistic/max-statements-per-line: ["warn", {"ignoredNodes": ["BreakStatement"]}] */
import type {GgufInsights} from "./GgufInsights.js";

export class GgufInsightsTokens {
/** @internal */ private readonly _ggufInsights: GgufInsights;

private constructor(ggufInsights: GgufInsights) {
this._ggufInsights = ggufInsights;
}

public get sepToken(): number | null {
const tokenizerModel = this._ggufInsights._ggufFileInfo?.metadata?.tokenizer?.ggml?.model;
const totalTokens = this._ggufInsights._ggufFileInfo?.metadata?.tokenizer?.ggml?.tokens?.length;

let sepTokenId = this._ggufInsights._ggufFileInfo?.metadata?.tokenizer?.ggml?.["seperator_token_id"];
if (sepTokenId == null && tokenizerModel === "bert") {
sepTokenId = 102; // source: `llama_vocab::impl::load` in `llama-vocab.cpp`
}

if (totalTokens != null && sepTokenId != null && sepTokenId >= totalTokens)
return null;

return sepTokenId ?? null;
}

public get eosToken(): number | null {
const tokenizerModel = this._ggufInsights._ggufFileInfo?.metadata?.tokenizer?.ggml?.model;
const totalTokens = this._ggufInsights._ggufFileInfo?.metadata?.tokenizer?.ggml?.tokens?.length;

const eosTokenId = this._ggufInsights._ggufFileInfo?.metadata?.tokenizer?.ggml?.["eos_token_id"];
if (eosTokenId != null && totalTokens != null && eosTokenId < totalTokens)
return eosTokenId;

switch (tokenizerModel) {
case "no_vocab": return null;
case "none": return null;
case "bert": return null;
case "rwkv": return null;
case "llama": return 2;
case "gpt2": return 11;
case "t5": return 1;
case "plamo2": return 2;
}
return 2; // source: `llama_vocab::impl::load` in `llama-vocab.cpp`
}

/** @internal */
public static _create(ggufInsights: GgufInsights) {
return new GgufInsightsTokens(ggufInsights);
}
}
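These token getters feed the new `GgufInsights.supportsRanking` check shown earlier. A consumer-side sketch (assuming a model loaded as in the earlier example) could guard ranking-context creation with it:

```typescript
// Sketch: check ranking support before creating a ranking context
if (!model.fileInsights.supportsRanking)
    throw new Error("This model cannot be used for reranking.");

const rankingContext = await model.createRankingContext();
const score = await rankingContext.rank("query text", "document text");
```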
7 changes: 4 additions & 3 deletions src/gguf/types/GgufMetadataTypes.ts
@@ -263,7 +263,7 @@ export const enum GgufMetadataTokenizerTokenType {

export type GgufMetadataTokenizer = {
readonly ggml: {
readonly model: "no_vocab" | "llama" | "gpt2" | "bert" | string,
readonly model: "no_vocab" | "none" | "llama" | "gpt2" | "bert" | "rwkv" | "t5" | "plamo2" | string,
readonly pre?: "default" | "llama3" | "llama-v3" | "llama-bpe" | "deepseek-llm" | "deepseek-coder" | "falcon" | "falcon3" |
"pixtral" | "mpt" | "starcoder" | "gpt-2" | "phi-2" | "jina-es" | "jina-de" | "jina-v1-en" | "jina-v2-es" | "jina-v2-de" |
"jina-v2-code" | "refact" | "command-r" | "qwen2" | "stablelm2" | "olmo" | "dbrx" | "smaug-bpe" | "poro-chat" | "chatglm-bpe" |
@@ -279,7 +279,7 @@ export type GgufMetadataTokenizer = {
readonly eot_token_id?: number,
readonly eom_token_id?: number,
readonly unknown_token_id?: number,
readonly separator_token_id?: number,
readonly seperator_token_id?: number,
readonly padding_token_id?: number,
readonly cls_token_id?: number,
readonly mask_token_id?: number,
@@ -304,7 +304,8 @@
readonly huggingface?: {
readonly json?: string
},
readonly chat_template?: string
readonly chat_template?: string,
readonly "chat_template.rerank"?: string
};

export const enum GgufMetadataArchitecturePoolingType {