
Commit 47c3c5f

feat: threads count setting on a model (#33)
1 parent 9bdef11 commit 47c3c5f

File tree

5 files changed (+26 -5 lines):

README.md
llama/addon.cpp
src/cli/commands/ChatCommand.ts
src/llamaEvaluator/LlamaModel.ts
src/utils/getBin.ts


README.md

Lines changed: 1 addition & 0 deletions
@@ -287,6 +287,7 @@ Optional:
   -c, --contextSize  Context size to use for the model  [number] [default: 4096]
   -g, --grammar      Restrict the model response to a specific grammar, like JSON for example
                      [string] [choices: "text", "json", "list", "arithmetic", "japanese", "chess"] [default: "text"]
+      --threads      Number of threads to use for the evaluation of tokens  [number] [default: 6]
   -t, --temperature  Temperature is a hyperparameter that controls the randomness of the generat
                      ed text. It affects the probability distribution of the model's output toke
                      ns. A higher temperature (e.g., 1.5) makes the output more random and creat
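
Note: the new --threads flag defaults to 6, the same value that was previously hardcoded in the native addon (see llama/addon.cpp below), so default behavior is unchanged; passing --threads 8, for example, makes token evaluation use 8 threads.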

llama/addon.cpp

Lines changed: 7 additions & 1 deletion
@@ -13,6 +13,7 @@ class LLAMAModel : public Napi::ObjectWrap<LLAMAModel> {
     llama_context_params params;
     llama_model* model;
     float temperature;
+    int threads;
     int32_t top_k;
     float top_p;

@@ -21,6 +22,7 @@ class LLAMAModel : public Napi::ObjectWrap<LLAMAModel> {
         params.seed = -1;
         params.n_ctx = 4096;
         temperature = 0.0f;
+        threads = 6;
         top_k = 40;
         top_p = 0.95f;

@@ -74,6 +76,10 @@ class LLAMAModel : public Napi::ObjectWrap<LLAMAModel> {
             params.embedding = options.Get("embedding").As<Napi::Boolean>().Value();
         }

+        if (options.Has("threads")) {
+            threads = options.Get("threads").As<Napi::Number>().Int32Value();
+        }
+
         if (options.Has("temperature")) {
             temperature = options.Get("temperature").As<Napi::Number>().FloatValue();
         }

@@ -283,7 +289,7 @@ class LLAMAContextEvalWorker : Napi::AsyncWorker, Napi::Promise::Deferred {
     protected:
     void Execute() {
         // Perform the evaluation using llama_eval.
-        int r = llama_eval(ctx->ctx, tokens.data(), int(tokens.size()), llama_get_kv_cache_token_count(ctx->ctx), 6);
+        int r = llama_eval(ctx->ctx, tokens.data(), int(tokens.size()), llama_get_kv_cache_token_count(ctx->ctx), (ctx->model)->threads);
         if (r != 0) {
             SetError("Eval has failed");
             return;
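
In short, the thread count passed as the final n_threads argument of llama_eval is no longer the literal 6: it is read from the new threads member, which is initialized to 6 and overridden when the JavaScript side passes a threads option. A rough sketch of the options object the addon now accepts, with illustrative values only (the keys come from the TypeScript diffs below):

// Illustrative sketch, not taken from the commit: any of these keys may be omitted.
const nativeModelOptions = {
    contextSize: 4096,
    threads: 8,      // stored in LLAMAModel::threads and later forwarded to llama_eval
    temperature: 0
};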

src/cli/commands/ChatCommand.ts

Lines changed: 11 additions & 3 deletions
@@ -18,6 +18,7 @@ type ChatCommand = {
     wrapper: "auto" | "general" | "llamaChat" | "chatML",
     contextSize: number,
     grammar: "text" | Parameters<typeof LlamaGrammar.getFor>[0],
+    threads: number,
     temperature: number,
     topK: number,
     topP: number,

@@ -76,6 +77,12 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
             description: "Restrict the model response to a specific grammar, like JSON for example",
             group: "Optional:"
         })
+        .option("threads", {
+            type: "number",
+            default: 6,
+            description: "Number of threads to use for the evaluation of tokens",
+            group: "Optional:"
+        })
         .option("temperature", {
             alias: "t",
             type: "number",

@@ -107,10 +114,10 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
     },
     async handler({
         model, systemInfo, systemPrompt, wrapper, contextSize, grammar,
-        temperature, topK, topP, maxTokens
+        threads, temperature, topK, topP, maxTokens
     }) {
         try {
-            await RunChat({model, systemInfo, systemPrompt, wrapper, contextSize, grammar, temperature, topK, topP, maxTokens});
+            await RunChat({model, systemInfo, systemPrompt, wrapper, contextSize, grammar, threads, temperature, topK, topP, maxTokens});
         } catch (err) {
             console.error(err);
             process.exit(1);

@@ -120,7 +127,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {


 async function RunChat({
-    model: modelArg, systemInfo, systemPrompt, wrapper, contextSize, grammar: grammarArg, temperature, topK, topP, maxTokens
+    model: modelArg, systemInfo, systemPrompt, wrapper, contextSize, grammar: grammarArg, threads, temperature, topK, topP, maxTokens
 }: ChatCommand) {
     const {LlamaChatSession} = await import("../../llamaEvaluator/LlamaChatSession.js");
     const {LlamaModel} = await import("../../llamaEvaluator/LlamaModel.js");

@@ -130,6 +137,7 @@ async function RunChat({
     const model = new LlamaModel({
         modelPath: modelArg,
         contextSize,
+        threads,
         temperature,
         topK,
         topP
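
The new option is wired end to end: it is declared with .option("threads", ...), destructured in the yargs handler, forwarded through RunChat, and finally passed into the LlamaModel constructor, so the CLI's --threads flag maps directly onto the threads model option shown in src/llamaEvaluator/LlamaModel.ts below.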

src/llamaEvaluator/LlamaModel.ts

Lines changed: 6 additions & 1 deletion
@@ -21,6 +21,9 @@ export type LlamaModelOptions = {
     /** if true, reduce VRAM usage at the cost of performance */
     lowVram?: boolean,

+    /** number of threads to use to evaluate tokens */
+    threads?: number,
+
     /**
      * Temperature is a hyperparameter that controls the randomness of the generated text.
      * It affects the probability distribution of the model's output tokens.

@@ -85,6 +88,7 @@ export class LlamaModel {
      * @param {number} [options.batchSize] - prompt processing batch size
      * @param {number} [options.gpuLayers] - number of layers to store in VRAM
      * @param {boolean} [options.lowVram] - if true, reduce VRAM usage at the cost of performance
+     * @param {number} [options.threads] - number of threads to use to evaluate tokens
      * @param {number} [options.temperature] - Temperature is a hyperparameter that controls the randomness of the generated text.
      * It affects the probability distribution of the model's output tokens.
      * A higher temperature (e.g., 1.5) makes the output more random and creative,

@@ -114,14 +118,15 @@
      */
     public constructor({
         modelPath, seed = null, contextSize = 1024 * 4, batchSize, gpuLayers,
-        lowVram, temperature = 0, topK = 40, topP = 0.95, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding
+        lowVram, threads = 6, temperature = 0, topK = 40, topP = 0.95, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding
     }: LlamaModelOptions) {
         this._model = new LLAMAModel(modelPath, removeNullFields({
             seed: seed != null ? Math.max(-1, seed) : undefined,
             contextSize,
             batchSize,
             gpuLayers,
             lowVram,
+            threads,
             temperature,
             topK,
             topP,
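
For reference, a minimal usage sketch of the new option from the TypeScript side. The import path is an assumption about the package's public entry point (not part of this commit), and the model path is a placeholder:

// Minimal sketch: construct a model that evaluates tokens with 8 threads instead of the default 6.
import {LlamaModel} from "node-llama-cpp"; // assumed public export

const model = new LlamaModel({
    modelPath: "./models/model.bin", // placeholder path
    threads: 8
});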

src/utils/getBin.ts

Lines changed: 1 addition & 0 deletions
@@ -111,6 +111,7 @@ export type LLAMAModel = {
     useMmap?: boolean,
     useMlock?: boolean,
     embedding?: boolean,
+    threads?: number,
     temperature?: number,
     topK?: number,
     topP?: number
