3 changes: 3 additions & 0 deletions docs/guide/awesome.md
@@ -15,6 +15,9 @@ import DataBadge from "../../.vitepress/components/DataBadge/DataBadge.vue";
 * [Manzoni](https://manzoni.app/) ([GitHub](https://github.com/gems-platforms/manzoni-app)) - a text editor running local LLMs
   <br /><DataBadge title="License" content="AGPL-3.0"/>

+* [Clippy](https://felixrieseberg.github.io/clippy/) ([GitHub](https://github.com/felixrieseberg/clippy)) - Clippy, resurrected from the 1990s, now with some AI
+  <br /><DataBadge title="License" content="MIT"/>
+

 ## Proprietary
 * [BashBuddy](https://bashbuddy.run) ([GitHub](https://github.com/wosherco/bashbuddy)) - write bash commands with natural language
12 changes: 9 additions & 3 deletions src/bindings/utils/getLlamaGpuTypes.ts
@@ -15,12 +15,18 @@ import {getPlatform} from "./getPlatform.js";
  * as some of them are inadvisable for the current machine (like CUDA on an x64 Mac machine).
  */
 export async function getLlamaGpuTypes(include: "supported" | "allValid"): Promise<LlamaGpuType[]> {
-    if (include === "supported")
-        return await getGpuTypesToUseForOption("auto");
-
     const platform = getPlatform();
     const arch = process.arch;

+    if (include === "supported") {
+        const gpuTypes = new Set(await getGpuTypesToUseForOption("auto"));
+
+        if (platform === "win" && arch !== "x64")
+            gpuTypes.delete("vulkan"); // no Vulkan prebuilt binary yet due to incomplete support for arm64
+
+        return [...gpuTypes];
+    }
+
     const res: LlamaGpuType[] = [];

     // Metal is not properly supported by llama.cpp on x64 Mac machines
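For reference, a minimal sketch of how the updated function behaves from a caller's perspective inside the codebase. The relative import path (as seen from `src/`) and the commented results are assumptions for illustration; this module is internal, so it may not be reachable from the package's public exports:

```ts
// Hypothetical in-repo import of the module changed above
import {getLlamaGpuTypes} from "./bindings/utils/getLlamaGpuTypes.js";

// "supported" resolves the GPU types the "auto" option would consider for this machine.
// After this change, on a Windows machine that isn't x64 (i.e. arm64),
// "vulkan" is removed from the result, since no Vulkan prebuilt binary ships for arm64 yet.
const supported = await getLlamaGpuTypes("supported");
console.log(supported);

// "allValid" still lists every GPU type that is valid for this platform/arch
// combination, regardless of whether a prebuilt binary exists for it.
const allValid = await getLlamaGpuTypes("allValid");
console.log(allValid);
```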
85 changes: 85 additions & 0 deletions src/cli/recommendedModels.ts
@@ -1,6 +1,74 @@
 import {ModelRecommendation} from "./utils/resolveModelRecommendationFileOptions.js";

 export const recommendedModels: ModelRecommendation[] = [{
name: "Qwen 3 32B",
abilities: ["chat", "complete", "functionCalling", "reasoning"],
description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" +
"It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
"This model is censored, but its responses quality on many topics is extremely high.\n" +
"This is the 32 billion parameters version of the model.\n" +
"Its performance is comparable and even surpasses DeepSeek R1 and GPT-o1.",

fileOptions: [
"hf:Qwen/Qwen3-32B-GGUF:Q8_0",
"hf:Qwen/Qwen3-32B-GGUF:Q6_K",
"hf:Qwen/Qwen3-32B-GGUF:Q5_K_M",
"hf:Qwen/Qwen3-32B-GGUF:Q4_K_M"
]
}, {
name: "Qwen 3 14B",
abilities: ["chat", "complete", "functionCalling", "reasoning"],
description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" +
"It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
"This model is censored, but its responses quality on many topics is extremely high compared to its size.\n" +
"This is the 14 billion parameters version of the model.",

fileOptions: [
"hf:Qwen/Qwen3-14B-GGUF:Q8_0",
"hf:Qwen/Qwen3-14B-GGUF:Q6_K",
"hf:Qwen/Qwen3-14B-GGUF:Q5_K_M",
"hf:Qwen/Qwen3-14B-GGUF:Q4_K_M"
]
}, {
name: "Qwen 3 8B",
abilities: ["chat", "complete", "functionCalling", "reasoning"],
description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" +
"It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
"This model is censored, but its responses quality on many topics is extremely high compared to its size.\n" +
"This is the 8 billion parameters version of the model.",

fileOptions: [
"hf:Qwen/Qwen3-8B-GGUF:Q8_0",
"hf:Qwen/Qwen3-8B-GGUF:Q6_K",
"hf:Qwen/Qwen3-8B-GGUF:Q5_K_M",
"hf:Qwen/Qwen3-8B-GGUF:Q4_K_M"
]
}, {
name: "Qwen 3 4B",
abilities: ["chat", "complete", "functionCalling", "reasoning"],
description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" +
"It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
"This model is censored, but its responses quality on many topics is extremely high compared to its size.\n" +
"This is the 4 billion parameters version of the model, and is suitable for simpler tasks and can run on lower-end hardware, as well as be very fast on higher-end hardware.",

fileOptions: [
"hf:Qwen/Qwen3-4B-GGUF:Q8_0",
"hf:Qwen/Qwen3-4B-GGUF:Q6_K",
"hf:Qwen/Qwen3-4B-GGUF:Q5_K_M",
"hf:Qwen/Qwen3-4B-GGUF:Q4_K_M"
]
}, {
name: "Qwen 3 0.6B",
abilities: ["chat", "complete", "functionCalling", "reasoning"],
description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" +
"It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
"This model is censored, but its responses quality on many topics is very high compared to its small size.\n" +
"This is the 0.6B billion parameters version of the model and is suitable for very simple tasks and can run on very resource-constraint hardware.\n",

fileOptions: [
"hf:Qwen/Qwen3-0.6B-GGUF:Q8_0"
]
}, {
name: "DeepSeek R1 Distill Qwen 7B",
abilities: ["chat", "complete", "functionCalling", "reasoning"],
description: "DeepSeek R1 model was created by DeepSeek and is using chain of though (CoT) to reason across a wide variety of topics.\n" +
@@ -75,6 +143,23 @@ export const recommendedModels: ModelRecommendation[] = [{
"hf:mradermacher/DeepSeek-R1-Distill-Llama-70B-GGUF:Q5_K_S",
"hf:mradermacher/DeepSeek-R1-Distill-Llama-70B-GGUF:Q4_K_M"
]
+}, {
+    name: "Qwen 3 30B A3B MoE",
+    abilities: ["chat", "complete", "functionCalling", "reasoning"],
+    description: "The Qwen model was created by Alibaba and uses chain of thought (CoT) to reason across a wide variety of topics.\n" +
+        "It's optimized for assistant-like chat use cases, with native support for function calling.\n" +
+        "This version of the model uses a Mixture of Experts architecture with only 3B active parameters, making it very fast.\n" +
+        "Mixture of Experts (MoE) is a technique where different sub-models, each skilled at a particular kind of problem, work together to improve the overall performance on complex tasks.\n" +
+        "This model is censored, but its response quality on many topics is high given its fast generation speed.\n" +
+        "This is the 30 billion parameter Mixture of Experts (MoE) version of the model.\n" +
+        "Its performance is comparable to, and even surpasses, DeepSeek V3 and GPT-4o.",
+
+    fileOptions: [
+        "hf:Qwen/Qwen3-30B-A3B-GGUF:Q8_0",
+        "hf:Qwen/Qwen3-30B-A3B-GGUF:Q6_K",
+        "hf:Qwen/Qwen3-30B-A3B-GGUF:Q5_K_M",
+        "hf:Qwen/Qwen3-30B-A3B-GGUF:Q4_K_M"
+    ]
 }, {
     name: "QwQ 32B",
     abilities: ["chat", "complete", "functionCalling", "reasoning"],
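One more note on the new entries: every `fileOptions` value follows the same `hf:<owner>/<repo>:<quantization>` URI shape, listed from the largest quantization to the smallest. A minimal sketch of parsing that shape, assuming only the layout visible above (this is not the library's actual resolver):

```ts
interface ParsedModelUri {
    owner: string;
    repo: string;
    quantization: string;
}

// Parse a "hf:<owner>/<repo>:<quantization>" model URI, e.g.
// "hf:Qwen/Qwen3-32B-GGUF:Q8_0" -> {owner: "Qwen", repo: "Qwen3-32B-GGUF", quantization: "Q8_0"}
function parseHfUri(uri: string): ParsedModelUri {
    const match = uri.match(/^hf:([^/]+)\/([^:]+):(.+)$/);
    if (match == null)
        throw new Error(`Not a valid "hf:" model URI: ${uri}`);

    const [, owner, repo, quantization] = match;
    return {owner, repo, quantization};
}

console.log(parseHfUri("hf:Qwen/Qwen3-30B-A3B-GGUF:Q4_K_M"));
```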
10 changes: 8 additions & 2 deletions src/evaluator/LlamaChatSession/LlamaChatSession.ts
@@ -343,6 +343,7 @@ export class LlamaChatSession {
     /** @internal */ private readonly _chatLock = {};
     /** @internal */ private _chatHistory: ChatHistoryItem[];
     /** @internal */ private _lastEvaluation?: LlamaChatResponse["lastEvaluation"];
+    /** @internal */ private _canUseContextWindowForCompletion: boolean = true;
     /** @internal */ private _chat: LlamaChat | null;
     /** @internal */ public _chatHistoryStateRef = {};
     /** @internal */ public readonly _preloadAndCompleteAbortControllers = new Set<AbortController>();
@@ -519,7 +520,9 @@

         const supportsParallelFunctionCalling = this._chat.chatWrapper.settings.functions.parallelism != null;
         const [abortController, disposeAbortController] = wrapAbortSignal(signal);
-        let lastEvaluation = this._lastEvaluation;
+        let lastEvaluation = this._canUseContextWindowForCompletion
+            ? this._lastEvaluation
+            : undefined;
         let newChatHistory = appendUserMessageToChatHistory(this._chatHistory, prompt);
         let newContextWindowChatHistory = lastEvaluation?.contextWindow == null
             ? undefined
@@ -723,6 +726,7 @@
         }

         this._lastEvaluation = lastEvaluation;
+        this._canUseContextWindowForCompletion = true;
         this._chatHistory = newChatHistory;
         this._chatHistoryStateRef = {};

@@ -876,9 +880,10 @@

         this._lastEvaluation = {
             cleanHistory: this._chatHistory,
-            contextWindow: lastEvaluation.contextWindow,
+            contextWindow: asWithLastUserMessageRemoved(lastEvaluation.contextWindow),
             contextShiftMetadata: lastEvaluation.contextShiftMetadata
         };
+        this._canUseContextWindowForCompletion = this._chatHistory.at(-1)?.type === "user";

         if (!stopOnAbortSignal && metadata.stopReason === "abort" && abortController.signal?.aborted)
             throw abortController.signal.reason;
@@ -918,6 +923,7 @@
         this._chatHistory = structuredClone(chatHistory);
         this._chatHistoryStateRef = {};
         this._lastEvaluation = undefined;
+        this._canUseContextWindowForCompletion = false;
     }

     /** Clear the chat history and reset it to the initial state. */
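Taken together, the LlamaChatSession changes appear to gate reuse of the cached evaluation state: the stored context window now has the last user message removed, it may only be resumed when the chat history actually ends with a user message, and replacing the history clears the flag so the next prompt re-evaluates from a clean state. A hedged usage sketch of the public API under that reading (the model path and messages are placeholders):

```ts
import {getLlama, LlamaChatSession} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "path/to/model.gguf"}); // placeholder path
const context = await model.createContext();
const session = new LlamaChatSession({contextSequence: context.getSequence()});

// A normal prompt caches the evaluation state for fast follow-ups
console.log(await session.prompt("Summarize quicksort in one sentence."));

// Replacing the history invalidates that cache
// (_canUseContextWindowForCompletion is set to false internally),
// so the next prompt re-evaluates instead of resuming the old context window
session.setChatHistory([
    {type: "system", text: "You are a terse assistant."},
    {type: "user", text: "Summarize quicksort in one sentence."},
    {type: "model", response: ["It partitions around a pivot and recursively sorts both sides."]}
]);
console.log(await session.prompt("Now mergesort."));
```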