@@ -28,6 +28,7 @@ type ChatCommand = {
     temperature: number,
     topK: number,
     topP: number,
+    gpuLayers?: number,
     repeatPenalty: number,
     lastTokensRepeatPenalty: number,
     penalizeRepeatingNewLine: boolean,
@@ -122,6 +123,12 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
                 description: "Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P, and samples the next token only from this set. A float number between `0` and `1`. Set to `1` to disable. Only relevant when `temperature` is set to a value greater than `0`.",
                 group: "Optional:"
             })
+            .option("gpuLayers", {
+                alias: "gl",
+                type: "number",
+                description: "number of layers to store in VRAM",
+                group: "Optional:"
+            })
             .option("repeatPenalty", {
                 alias: "rp",
                 type: "number",
@@ -165,12 +172,12 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
     },
     async handler({
         model, systemInfo, systemPrompt, prompt, wrapper, contextSize,
-        grammar, threads, temperature, topK, topP, repeatPenalty,
+        grammar, threads, temperature, topK, topP, gpuLayers, repeatPenalty,
         lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens
     }) {
         try {
             await RunChat({
-                model, systemInfo, systemPrompt, prompt, wrapper, contextSize, grammar, threads, temperature, topK, topP,
+                model, systemInfo, systemPrompt, prompt, wrapper, contextSize, grammar, threads, temperature, topK, topP, gpuLayers,
                 lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens
             });
         } catch (err) {
@@ -183,7 +190,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {

 async function RunChat({
     model: modelArg, systemInfo, systemPrompt, prompt, wrapper, contextSize, grammar: grammarArg, threads, temperature, topK, topP,
-    lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens
+    gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens
 }: ChatCommand) {
     const {LlamaChatSession} = await import("../../llamaEvaluator/LlamaChatSession.js");
     const {LlamaModel} = await import("../../llamaEvaluator/LlamaModel.js");
@@ -192,7 +199,8 @@ async function RunChat({

     let initialPrompt = prompt ?? null;
     const model = new LlamaModel({
-        modelPath: path.resolve(process.cwd(), modelArg)
+        modelPath: path.resolve(process.cwd(), modelArg),
+        gpuLayers: gpuLayers != null ? gpuLayers : undefined
     });
     const context = new LlamaContext({
         model,
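A minimal usage sketch of the new option, mirroring the constructors this diff touches. It assumes the package's public exports (LlamaModel, LlamaContext, LlamaChatSession) correspond to the internal modules imported above; the model path, layer count, and prompt are placeholders.

import path from "path";
import {LlamaModel, LlamaContext, LlamaChatSession} from "node-llama-cpp";

// Offload 32 layers to VRAM; roughly what `--gpuLayers 32` (alias `-gl 32`)
// now does for the chat command before the model is loaded.
const model = new LlamaModel({
    modelPath: path.resolve(process.cwd(), "models/llama-model.bin"), // placeholder path
    gpuLayers: 32
});
const context = new LlamaContext({model});
const session = new LlamaChatSession({context});

console.log(await session.prompt("Hi there"));

When the flag is omitted, the handler passes undefined (gpuLayers != null ? gpuLayers : undefined), so the library's default layer count applies.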