Skip to content

Commit 2204e7a

Browse files
authored
fix: no thread limit when using a GPU (#322)
* fix: no thread limit when using a GPU
* fix: improve `defineChatSessionFunction` types and docs
* fix: format numbers printed in the CLI
* fix: disable the browser's autocomplete in the docs search
1 parent d0795c1 commit 2204e7a

File tree

14 files changed

+217
-144
lines changed

14 files changed

+217
-144
lines changed

.github/workflows/build.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -698,6 +698,7 @@ jobs:
698698
export DOCS_PACKAGE_VERSION="$(cat ./docsVersion.txt)"
699699
echo "Package version: $DOCS_PACKAGE_VERSION"
700700
701+
git apply --ignore-whitespace ./scripts/patches/vitepress+1.3.4.patch
701702
npm run docs:build
702703
- name: Upload docs
703704
uses: actions/upload-artifact@v4

.github/workflows/test.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,4 +42,6 @@ jobs:
4242
CI: true
4343
run: node ./dist/cli/cli.js source download --release latest --skipBuild --noBundle --noUsageExample --updateBinariesReleaseMetadataAndSaveGitBundle
4444
- name: Compile docs
45-
run: npm run docs:build
45+
run: |
46+
git apply --ignore-whitespace ./scripts/patches/vitepress+1.3.4.patch
47+
npm run docs:build

.releaserc.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ function getDryRunResult() {
8383
}
8484

8585
const dryRunResult = getDryRunResult();
86+
console.info("Next release type", dryRunResult?.nextRelease?.type);
8687
if (dryRunResult == null || !(dryRunResult.nextRelease.type === "major" || dryRunResult.nextRelease.type === "minor"))
8788
githubPluginConfig.discussionCategoryName = false;
8889

package-lock.json

Lines changed: 115 additions & 113 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@
132132
"@resvg/resvg-js": "^2.6.2",
133133
"@semantic-release/exec": "^6.0.3",
134134
"@semantic-release/npm": "12.0.1",
135-
"@shikijs/vitepress-twoslash": "^1.17.7",
135+
"@shikijs/vitepress-twoslash": "^1.18.0",
136136
"@types/async-retry": "^1.4.8",
137137
"@types/bytes": "^3.1.4",
138138
"@types/cross-spawn": "^6.0.2",
@@ -160,13 +160,13 @@
160160
"tslib": "^2.7.0",
161161
"typedoc": "^0.26.7",
162162
"typedoc-plugin-markdown": "^4.2.7",
163-
"typedoc-plugin-mdn-links": "^3.2.12",
163+
"typedoc-plugin-mdn-links": "^3.3.0",
164164
"typedoc-vitepress-theme": "^1.0.1",
165165
"typescript": "^5.6.2",
166166
"vite-node": "^2.1.1",
167-
"vitepress": "^1.3.4",
167+
"vitepress": "1.3.4",
168168
"vitest": "^2.1.1",
169-
"zx": "^8.1.7"
169+
"zx": "^8.1.8"
170170
},
171171
"dependencies": {
172172
"@huggingface/jinja": "^0.3.1",
@@ -192,7 +192,7 @@
192192
"pretty-ms": "^9.1.0",
193193
"proper-lockfile": "^4.1.2",
194194
"semver": "^7.6.3",
195-
"simple-git": "^3.26.0",
195+
"simple-git": "^3.27.0",
196196
"slice-ansi": "^7.1.0",
197197
"stdout-update": "^4.0.1",
198198
"strip-ansi": "^7.1.0",
scripts/patches/vitepress+1.3.4.patch

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
diff --git a/node_modules/vitepress/dist/client/theme-default/components/VPLocalSearchBox.vue b/node_modules/vitepress/dist/client/theme-default/components/VPLocalSearchBox.vue
2+
index c8aded4..ccd5eff 100644
3+
--- a/node_modules/vitepress/dist/client/theme-default/components/VPLocalSearchBox.vue
4+
+++ b/node_modules/vitepress/dist/client/theme-default/components/VPLocalSearchBox.vue
5+
@@ -443,6 +443,7 @@ function formMarkRegex(terms: Set<string>) {
6+
:placeholder="buttonText"
7+
id="localsearch-input"
8+
aria-labelledby="localsearch-label"
9+
+ autocomplete="off"
10+
class="search-input"
11+
/>
12+
<div class="search-actions">

src/bindings/Llama.ts

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ const addonLogLevelToLlamaLogLevel: ReadonlyMap<number, LlamaLogLevel> = new Map
2424
[...LlamaLogLevelToAddonLogLevel.entries()].map(([key, value]) => [value, key])
2525
);
2626
const defaultLogLevel = 5;
27-
const defaultMinThreadSplitterThreads = 4;
27+
const defaultCPUMinThreadSplitterThreads = 4;
2828

2929
export class Llama {
3030
/** @internal */ public readonly _bindings: BindingModule;
@@ -86,7 +86,13 @@ export class Llama {
8686
this._debug = debug;
8787
this._vramOrchestrator = vramOrchestrator;
8888
this._vramPadding = vramPadding;
89-
this._threadsSplitter = new ThreadsSplitter(maxThreads ?? Math.max(defaultMinThreadSplitterThreads, this._mathCores));
89+
this._threadsSplitter = new ThreadsSplitter(
90+
maxThreads ?? (
91+
this._gpu === false
92+
? Math.max(defaultCPUMinThreadSplitterThreads, this._mathCores)
93+
: 0
94+
)
95+
);
9096

9197
this._logLevel = this._debug
9298
? LlamaLogLevel.debug
@@ -155,14 +161,16 @@ export class Llama {
155161
/**
156162
* The maximum number of threads that can be used by the Llama instance.
157163
*
158-
* Default to `cpuMathCores`.
164+
* If set to `0`, the Llama instance will have no limit on the number of threads.
165+
*
166+
* See the `maxThreads` option of `getLlama` for more information.
159167
*/
160168
public get maxThreads() {
161169
return this._threadsSplitter.maxThreads;
162170
}
163171

164172
public set maxThreads(value: number) {
165-
this._threadsSplitter.maxThreads = Math.floor(Math.max(1, value));
173+
this._threadsSplitter.maxThreads = Math.floor(Math.max(0, value));
166174
}
167175

168176
public get logLevel() {

src/bindings/getLlama.ts

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,11 @@ export type LlamaOptions = {
114114
/**
115115
* The maximum number of threads to use for the Llama instance.
116116
*
117-
* Defaults to the number of CPU cores that are useful for math (`.cpuMathCores`), or `4`, whichever is higher.
117+
* Set to `0` to have no thread limit.
118+
*
119+
* When not using a GPU, defaults to the number of CPU cores that are useful for math (`.cpuMathCores`), or `4`, whichever is higher.
120+
*
121+
* When using a GPU, there's no limit by default.
118122
*/
119123
maxThreads?: number,
120124

@@ -174,7 +178,11 @@ export type LastBuildOptions = {
174178
/**
175179
* The maximum number of threads to use for the Llama instance.
176180
*
177-
* Defaults to the number of CPU cores that are useful for math (`.cpuMathCores`), or `4`, whichever is higher.
181+
* Set to `0` to have no thread limit.
182+
*
183+
* When not using a GPU, defaults to the number of CPU cores that are useful for math (`.cpuMathCores`), or `4`, whichever is higher.
184+
*
185+
* When using a GPU, there's no limit by default.
178186
*/
179187
maxThreads?: number,
180188

src/cli/utils/printCommonInfoLines.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,22 +72,22 @@ export async function printCommonInfoLines({
7272
value: () => toOneLine(String(model.tokens.eosString))
7373
}, {
7474
title: "Train context size",
75-
value: String(model.trainContextSize)
75+
value: model.trainContextSize.toLocaleString("en-US")
7676
}]
7777
});
7878
printInfoLine({
7979
title: "Context",
8080
padTitle: padTitle,
8181
info: [{
8282
title: "Size",
83-
value: String(context.contextSize)
83+
value: context.contextSize.toLocaleString("en-US")
8484
}, {
8585
title: "Threads",
86-
value: String(context.currentThreads)
86+
value: context.currentThreads.toLocaleString("en-US")
8787
}, {
8888
show: logBatchSize,
8989
title: "Batch size",
90-
value: bytes(context.batchSize)
90+
value: context.batchSize.toLocaleString("en-US")
9191
}, {
9292
show: context.flashAttention,
9393
title: "Flash attention",

src/evaluator/LlamaChatSession/utils/defineChatSessionFunction.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ import {ChatSessionModelFunction} from "../../../types.js";
55
* Define a function that can be used by the model in a chat session, and return it.
66
*
77
* This is a helper function to facilitate defining functions with full TypeScript type information.
8+
*
9+
* The handler function can return a Promise, and the return value will be awaited before being returned to the model.
810
* @param functionDefinition
911
*/
1012
export function defineChatSessionFunction<const Params extends GbnfJsonSchema | undefined>({
@@ -13,8 +15,8 @@ export function defineChatSessionFunction<const Params extends GbnfJsonSchema |
1315
handler
1416
}: {
1517
description?: string,
16-
params?: Params,
17-
handler: (params: GbnfJsonSchemaToType<Params>) => any
18+
params?: Params & GbnfJsonSchema,
19+
handler: (params: GbnfJsonSchemaToType<Params>) => Promise<any> | any
1820
}): ChatSessionModelFunction<Params> {
1921
return {
2022
description,

0 commit comments

Comments (0)