@@ -21,6 +21,7 @@ const modelWrappers = ["auto", "general", "llamaChat", "chatML", "falconChat"] a
 type ChatCommand = {
     model: string,
     systemInfo: boolean,
+    printTimings: boolean,
     systemPrompt: string,
     prompt?: string,
     wrapper: (typeof modelWrappers)[number],
@@ -62,6 +63,12 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
             description: "Print llama.cpp system info",
             group: "Optional:"
         })
+        .option("printTimings", {
+            type: "boolean",
+            default: false,
+            description: "Print llama.cpp timings",
+            group: "Optional:"
+        })
         .option("systemPrompt", {
             alias: "s",
             type: "string",
@@ -191,13 +198,13 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
         model, systemInfo, systemPrompt, prompt, wrapper, contextSize,
         grammar, jsonSchemaGrammarFile, threads, temperature, topK, topP,
         gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
-        repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory
+        repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, printTimings
     }) {
         try {
             await RunChat({
                 model, systemInfo, systemPrompt, prompt, wrapper, contextSize, grammar, jsonSchemaGrammarFile, threads, temperature, topK,
                 topP, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty,
-                repeatPresencePenalty, maxTokens, noHistory
+                repeatPresencePenalty, maxTokens, noHistory, printTimings
             });
         } catch (err) {
             console.error(err);
@@ -210,7 +217,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
 async function RunChat({
     model: modelArg, systemInfo, systemPrompt, prompt, wrapper, contextSize, grammar: grammarArg,
     jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, topK, topP, gpuLayers, lastTokensRepeatPenalty, repeatPenalty,
-    penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory
+    penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, printTimings
 }: ChatCommand) {
     const {LlamaChatSession} = await import("../../llamaEvaluator/LlamaChatSession.js");
     const {LlamaModel} = await import("../../llamaEvaluator/LlamaModel.js");
@@ -340,6 +347,9 @@ async function RunChat({
         });
         process.stdout.write(endColor);
         console.log();
+
+        if (printTimings)
+            context.printTimings();
     }
 }
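For reference, a rough sketch of how the behavior behind the new `--printTimings` CLI flag could be reproduced when using the library directly, assuming the `printTimings()` method called on the context above is also exposed on the package's public `LlamaContext` class (the model path below is a placeholder):

import {LlamaModel, LlamaContext, LlamaChatSession} from "node-llama-cpp";

// Load a model and create a context and chat session (node-llama-cpp v2-style API).
const model = new LlamaModel({modelPath: "./models/model.gguf"});
const context = new LlamaContext({model});
const session = new LlamaChatSession({context});

const answer = await session.prompt("Hi there, how are you?");
console.log(answer);

// Mirrors what `--printTimings` does in the chat command: print llama.cpp's
// timing statistics for the evaluation that just ran.
context.printTimings();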