3 changes: 3 additions & 0 deletions docs/guide/awesome.md
@@ -15,6 +15,9 @@ import DataBadge from "../../.vitepress/components/DataBadge/DataBadge.vue";
 * [Manzoni](https://manzoni.app/) ([GitHub](https://github.com/gems-platforms/manzoni-app)) - a text editor running local LLMs
   <br /><DataBadge title="License" content="AGPL-3.0"/>

+* [Clippy](https://felixrieseberg.github.io/clippy/) ([GitHub](https://github.com/felixrieseberg/clippy)) - Clippy, resurrected from the 1990s, now with some AI
+  <br /><DataBadge title="License" content="MIT"/>
+

 ## Proprietary
 * [BashBuddy](https://bashbuddy.run) ([GitHub](https://github.com/wosherco/bashbuddy)) - write bash commands with natural language
12 changes: 9 additions & 3 deletions src/bindings/utils/getLlamaGpuTypes.ts
@@ -15,12 +15,18 @@ import {getPlatform} from "./getPlatform.js";
  * as some of them are inadvisable for the current machine (like CUDA on an x64 Mac machine).
  */
 export async function getLlamaGpuTypes(include: "supported" | "allValid"): Promise<LlamaGpuType[]> {
-    if (include === "supported")
-        return await getGpuTypesToUseForOption("auto");
-
     const platform = getPlatform();
     const arch = process.arch;

+    if (include === "supported") {
+        const gpuTypes = new Set(await getGpuTypesToUseForOption("auto"));
+
+        if (platform === "win" && arch !== "x64")
+            gpuTypes.delete("vulkan"); // no Vulkan prebuilt binary yet due to incomplete support for arm64
+
+        return [...gpuTypes];
+    }
+
     const res: LlamaGpuType[] = [];

     // Metal is not properly supported by llama.cpp on x64 Mac machines
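For reference, a minimal sketch of how the updated function behaves from a caller's perspective inside the codebase. The relative import path (as seen from `src/`) and the commented results are assumptions for illustration; this module is internal, so it may not be reachable from the package's public exports:

```ts
// Hypothetical in-repo import of the module changed above
import {getLlamaGpuTypes} from "./bindings/utils/getLlamaGpuTypes.js";

// "supported" resolves the GPU types the "auto" option would consider for this machine.
// After this change, on a Windows machine that isn't x64 (i.e. arm64),
// "vulkan" is removed from the result, since no Vulkan prebuilt binary ships for arm64 yet.
const supported = await getLlamaGpuTypes("supported");
console.log(supported);

// "allValid" still lists every GPU type that is valid for this platform/arch
// combination, regardless of whether a prebuilt binary exists for it.
const allValid = await getLlamaGpuTypes("allValid");
console.log(allValid);
```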
85 changes: 85 additions & 0 deletions src/cli/recommendedModels.ts
@@ -1,6 +1,74 @@
 import {ModelRecommendation} from "./utils/resolveModelRecommendationFileOptions.js";

 export const recommendedModels: ModelRecommendation[] = [{
name: "Qwen 3 32B",
abilities: ["chat", "complete", "functionCalling", "reasoning"],
description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" +
"It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
"This model is censored, but its responses quality on many topics is extremely high.\n" +
"This is the 32 billion parameters version of the model.\n" +
"Its performance is comparable and even surpasses DeepSeek R1 and GPT-o1.",

fileOptions: [
"hf:Qwen/Qwen3-32B-GGUF:Q8_0",
"hf:Qwen/Qwen3-32B-GGUF:Q6_K",
"hf:Qwen/Qwen3-32B-GGUF:Q5_K_M",
"hf:Qwen/Qwen3-32B-GGUF:Q4_K_M"
]
}, {
name: "Qwen 3 14B",
abilities: ["chat", "complete", "functionCalling", "reasoning"],
description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" +
"It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
"This model is censored, but its responses quality on many topics is extremely high compared to its size.\n" +
"This is the 14 billion parameters version of the model.",

fileOptions: [
"hf:Qwen/Qwen3-14B-GGUF:Q8_0",
"hf:Qwen/Qwen3-14B-GGUF:Q6_K",
"hf:Qwen/Qwen3-14B-GGUF:Q5_K_M",
"hf:Qwen/Qwen3-14B-GGUF:Q4_K_M"
]
}, {
name: "Qwen 3 8B",
abilities: ["chat", "complete", "functionCalling", "reasoning"],
description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" +
"It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
"This model is censored, but its responses quality on many topics is extremely high compared to its size.\n" +
"This is the 8 billion parameters version of the model.",

fileOptions: [
"hf:Qwen/Qwen3-8B-GGUF:Q8_0",
"hf:Qwen/Qwen3-8B-GGUF:Q6_K",
"hf:Qwen/Qwen3-8B-GGUF:Q5_K_M",
"hf:Qwen/Qwen3-8B-GGUF:Q4_K_M"
]
}, {
name: "Qwen 3 4B",
abilities: ["chat", "complete", "functionCalling", "reasoning"],
description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" +
"It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
"This model is censored, but its responses quality on many topics is extremely high compared to its size.\n" +
"This is the 4 billion parameters version of the model, and is suitable for simpler tasks and can run on lower-end hardware, as well as be very fast on higher-end hardware.",

fileOptions: [
"hf:Qwen/Qwen3-4B-GGUF:Q8_0",
"hf:Qwen/Qwen3-4B-GGUF:Q6_K",
"hf:Qwen/Qwen3-4B-GGUF:Q5_K_M",
"hf:Qwen/Qwen3-4B-GGUF:Q4_K_M"
]
}, {
name: "Qwen 3 0.6B",
abilities: ["chat", "complete", "functionCalling", "reasoning"],
description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" +
"It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
"This model is censored, but its responses quality on many topics is very high compared to its small size.\n" +
"This is the 0.6B billion parameters version of the model and is suitable for very simple tasks and can run on very resource-constraint hardware.\n",

fileOptions: [
"hf:Qwen/Qwen3-0.6B-GGUF:Q8_0"
]
}, {
name: "DeepSeek R1 Distill Qwen 7B",
abilities: ["chat", "complete", "functionCalling", "reasoning"],
description: "DeepSeek R1 model was created by DeepSeek and is using chain of though (CoT) to reason across a wide variety of topics.\n" +
@@ -75,6 +143,23 @@ export const recommendedModels: ModelRecommendation[] = [{
"hf:mradermacher/DeepSeek-R1-Distill-Llama-70B-GGUF:Q5_K_S",
"hf:mradermacher/DeepSeek-R1-Distill-Llama-70B-GGUF:Q4_K_M"
]
+}, {
+    name: "Qwen 3 30B A3B MoE",
+    abilities: ["chat", "complete", "functionCalling", "reasoning"],
+    description: "The Qwen model was created by Alibaba and uses chain of thought (CoT) to reason across a wide variety of topics.\n" +
+        "It's optimized for assistant-like chat use cases, with native support for function calling.\n" +
+        "This version of the model uses a Mixture of Experts architecture with only 3B active parameters, making it very fast.\n" +
+        "Mixture of Experts (MoE) is a technique where different sub-models, each skilled at a particular kind of problem, work together to improve the overall performance on complex tasks.\n" +
+        "This model is censored, but its response quality on many topics is high given its fast generation speed.\n" +
+        "This is the 30 billion parameter Mixture of Experts (MoE) version of the model.\n" +
+        "Its performance is comparable to, and even surpasses, DeepSeek V3 and GPT-4o.",
+
+    fileOptions: [
+        "hf:Qwen/Qwen3-30B-A3B-GGUF:Q8_0",
+        "hf:Qwen/Qwen3-30B-A3B-GGUF:Q6_K",
+        "hf:Qwen/Qwen3-30B-A3B-GGUF:Q5_K_M",
+        "hf:Qwen/Qwen3-30B-A3B-GGUF:Q4_K_M"
+    ]
 }, {
     name: "QwQ 32B",
     abilities: ["chat", "complete", "functionCalling", "reasoning"],
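One more note on the new entries: every `fileOptions` value follows the same `hf:<owner>/<repo>:<quantization>` URI shape, listed from the largest quantization to the smallest. A minimal sketch of parsing that shape, assuming only the layout visible above (this is not the library's actual resolver):

```ts
interface ParsedModelUri {
    owner: string;
    repo: string;
    quantization: string;
}

// Parse a "hf:<owner>/<repo>:<quantization>" model URI, e.g.
// "hf:Qwen/Qwen3-32B-GGUF:Q8_0" -> {owner: "Qwen", repo: "Qwen3-32B-GGUF", quantization: "Q8_0"}
function parseHfUri(uri: string): ParsedModelUri {
    const match = uri.match(/^hf:([^/]+)\/([^:]+):(.+)$/);
    if (match == null)
        throw new Error(`Not a valid "hf:" model URI: ${uri}`);

    const [, owner, repo, quantization] = match;
    return {owner, repo, quantization};
}

console.log(parseHfUri("hf:Qwen/Qwen3-30B-A3B-GGUF:Q4_K_M"));
```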
10 changes: 8 additions & 2 deletions src/evaluator/LlamaChatSession/LlamaChatSession.ts
@@ -343,6 +343,7 @@ export class LlamaChatSession {
     /** @internal */ private readonly _chatLock = {};
     /** @internal */ private _chatHistory: ChatHistoryItem[];
     /** @internal */ private _lastEvaluation?: LlamaChatResponse["lastEvaluation"];
+    /** @internal */ private _canUseContextWindowForCompletion: boolean = true;
     /** @internal */ private _chat: LlamaChat | null;
     /** @internal */ public _chatHistoryStateRef = {};
     /** @internal */ public readonly _preloadAndCompleteAbortControllers = new Set<AbortController>();
@@ -519,7 +520,9 @@

         const supportsParallelFunctionCalling = this._chat.chatWrapper.settings.functions.parallelism != null;
         const [abortController, disposeAbortController] = wrapAbortSignal(signal);
-        let lastEvaluation = this._lastEvaluation;
+        let lastEvaluation = this._canUseContextWindowForCompletion
+            ? this._lastEvaluation
+            : undefined;
         let newChatHistory = appendUserMessageToChatHistory(this._chatHistory, prompt);
         let newContextWindowChatHistory = lastEvaluation?.contextWindow == null
             ? undefined
@@ -723,6 +726,7 @@
         }

         this._lastEvaluation = lastEvaluation;
+        this._canUseContextWindowForCompletion = true;
         this._chatHistory = newChatHistory;
         this._chatHistoryStateRef = {};

@@ -876,9 +880,10 @@

         this._lastEvaluation = {
             cleanHistory: this._chatHistory,
-            contextWindow: lastEvaluation.contextWindow,
+            contextWindow: asWithLastUserMessageRemoved(lastEvaluation.contextWindow),
             contextShiftMetadata: lastEvaluation.contextShiftMetadata
         };
+        this._canUseContextWindowForCompletion = this._chatHistory.at(-1)?.type === "user";

         if (!stopOnAbortSignal && metadata.stopReason === "abort" && abortController.signal?.aborted)
             throw abortController.signal.reason;
@@ -918,6 +923,7 @@
         this._chatHistory = structuredClone(chatHistory);
         this._chatHistoryStateRef = {};
         this._lastEvaluation = undefined;
+        this._canUseContextWindowForCompletion = false;
     }

     /** Clear the chat history and reset it to the initial state. */
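Taken together, the LlamaChatSession changes appear to gate reuse of the cached evaluation state: the stored context window now has the last user message removed, it may only be resumed when the chat history actually ends with a user message, and replacing the history clears the flag so the next prompt re-evaluates from a clean state. A hedged usage sketch of the public API under that reading (the model path and messages are placeholders):

```ts
import {getLlama, LlamaChatSession} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "path/to/model.gguf"}); // placeholder path
const context = await model.createContext();
const session = new LlamaChatSession({contextSequence: context.getSequence()});

// A normal prompt caches the evaluation state for fast follow-ups
console.log(await session.prompt("Summarize quicksort in one sentence."));

// Replacing the history invalidates that cache
// (_canUseContextWindowForCompletion is set to false internally),
// so the next prompt re-evaluates instead of resuming the old context window
session.setChatHistory([
    {type: "system", text: "You are a terse assistant."},
    {type: "user", text: "Summarize quicksort in one sentence."},
    {type: "model", response: ["It partitions around a pivot and recursively sorts both sides."]}
]);
console.log(await session.prompt("Now mergesort."));
```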