Commit ba98442

Add support of fp16 shaders in canary (#116)
1 parent ace6160 commit ba98442

File tree

11 files changed: +115 additions, −67 deletions

examples/simple-chat/package.json

Lines changed: 4 additions & 2 deletions
@@ -8,9 +8,11 @@
     "build": "cp src/gh-config.js src/app-config.js && parcel build src/llm_chat.html --dist-dir lib --no-content-hash --public-url /web-llm"
   },
   "devDependencies": {
+    "buffer": "^5.7.1",
     "parcel": "^2.8.3",
-    "typescript": "^4.9.5",
-    "tslib": "^2.3.1"
+    "process": "^0.11.10",
+    "tslib": "^2.3.1",
+    "typescript": "^4.9.5"
   },
   "dependencies": {
     "@mlc-ai/web-llm": "file:../.."

examples/simple-chat/src/gh-config.js

Lines changed: 7 additions & 1 deletion
@@ -7,10 +7,16 @@ export default {
     {
       "model_url": "https://huggingface.co/mlc-ai/mlc-chat-vicuna-v1-7b-q4f32_0/resolve/main/",
       "local_id": "vicuna-v1-7b-q4f32_0"
+    },
+    {
+      "model_url": "https://huggingface.co/mlc-ai/mlc-chat-RedPajama-INCITE-Chat-3B-v1-q4f16_0/resolve/main/",
+      "local_id": "RedPajama-INCITE-Chat-3B-v1-q4f16_0",
+      "required_features": ["shader-f16"],
     }
   ],
   "model_lib_map": {
     "vicuna-v1-7b-q4f32_0": "https://raw.githubusercontent.com/mlc-ai/binary-mlc-llm-libs/main/vicuna-v1-7b-q4f32_0-webgpu.wasm",
-    "RedPajama-INCITE-Chat-3B-v1-q4f32_0": "https://raw.githubusercontent.com/mlc-ai/binary-mlc-llm-libs/main/RedPajama-INCITE-Chat-3B-v1-q4f32_0-webgpu.wasm"
+    "RedPajama-INCITE-Chat-3B-v1-q4f32_0": "https://raw.githubusercontent.com/mlc-ai/binary-mlc-llm-libs/main/RedPajama-INCITE-Chat-3B-v1-q4f32_0-webgpu.wasm",
+    "RedPajama-INCITE-Chat-3B-v1-q4f16_0": "https://raw.githubusercontent.com/mlc-ai/binary-mlc-llm-libs/main/RedPajama-INCITE-Chat-3B-v1-q4f16_0-webgpu.wasm"
   }
 }
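For reference, the sketch below (not part of this commit) shows how an application could pass such a config, including the new `required_features` field, to `ChatModule.reload`. The URLs and the `reload(localId, chatOpts, appConfig)` call shape follow the files in this diff; the variable names and the assumption that the exported `AppConfig` mirrors the shape of gh-config.js are illustrative.

```ts
import { ChatModule, AppConfig } from "@mlc-ai/web-llm";

// Illustrative app config: the fp16 model declares the WebGPU feature it
// needs, so reload() can fail with a clear error when it is unavailable.
const appConfig: AppConfig = {
  model_list: [
    {
      model_url: "https://huggingface.co/mlc-ai/mlc-chat-RedPajama-INCITE-Chat-3B-v1-q4f16_0/resolve/main/",
      local_id: "RedPajama-INCITE-Chat-3B-v1-q4f16_0",
      required_features: ["shader-f16"],
    },
  ],
  model_lib_map: {
    "RedPajama-INCITE-Chat-3B-v1-q4f16_0":
      "https://raw.githubusercontent.com/mlc-ai/binary-mlc-llm-libs/main/RedPajama-INCITE-Chat-3B-v1-q4f16_0-webgpu.wasm",
  },
};

async function main() {
  const chat = new ChatModule();
  // Same call shape as simple_chat.ts uses below.
  await chat.reload("RedPajama-INCITE-Chat-3B-v1-q4f16_0", undefined, appConfig);
}
```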
Lines changed: 22 additions & 14 deletions
@@ -1,18 +1,26 @@
 // config used when serving from local mlc-llm/dist
 // use web-llm/script/serve_mlc_llm_dist.sh to start the artifact server
 export default {
-  "model_list": [
-    {
-      "model_url": "http://localhost:8000/RedPajama-INCITE-Chat-3B-v1-q4f32_0/params/",
-      "local_id": "RedPajama-INCITE-Chat-3B-v1-q4f32_0"
-    },
-    {
-      "model_url": "http://localhost:8000/vicuna-v1-7b-q4f32_0/params/",
-      "local_id": "vicuna-v1-7b-q4f32_0"
-    }
-  ],
-  "model_lib_map": {
-    "vicuna-v1-7b-q4f32_0": "http://localhost:8000/vicuna-v1-7b-q4f32_0/vicuna-v1-7b-q4f32_0-webgpu.wasm",
-    "RedPajama-INCITE-Chat-3B-v1-q4f32_0": "http://localhost:8000/RedPajama-INCITE-Chat-3B-v1-q4f32_0/RedPajama-INCITE-Chat-3B-v1-q4f32_0-webgpu.wasm"
-  }
+  "model_list": [
+    {
+      "model_url": "http://localhost:8000/RedPajama-INCITE-Chat-3B-v1-q4f32_0/params/",
+      "local_id": "RedPajama-INCITE-Chat-3B-v1-q4f32_0"
+    },
+    {
+      "model_url": "http://localhost:8000/vicuna-v1-7b-q4f32_0/params/",
+      "local_id": "vicuna-v1-7b-q4f32_0"
+    },
+    // fp16 options are enabled through chrome canary flags
+    // chrome --enable-dawn-features=allow_unsafe_apis
+    {
+      "model_url": "http://localhost:8000/RedPajama-INCITE-Chat-3B-v1-q4f16_0/params/",
+      "local_id": "RedPajama-INCITE-Chat-3B-v1-q4f16_0",
+      "required_features": ["shader-f16"]
+    }
+  ],
+  "model_lib_map": {
+    "vicuna-v1-7b-q4f32_0": "http://localhost:8000/vicuna-v1-7b-q4f32_0/vicuna-v1-7b-q4f32_0-webgpu.wasm",
+    "RedPajama-INCITE-Chat-3B-v1-q4f32_0": "http://localhost:8000/RedPajama-INCITE-Chat-3B-v1-q4f32_0/RedPajama-INCITE-Chat-3B-v1-q4f32_0-webgpu.wasm",
+    "RedPajama-INCITE-Chat-3B-v1-q4f16_0": "http://localhost:8000/RedPajama-INCITE-Chat-3B-v1-q4f16_0/RedPajama-INCITE-Chat-3B-v1-q4f16_0-webgpu.wasm"
+  }
 }

examples/simple-chat/src/simple_chat.ts

Lines changed: 15 additions & 18 deletions
@@ -1,5 +1,5 @@
 import appConfig from "./app-config";
-import { ChatModule } from "@mlc-ai/web-llm";
+import { ChatModule, ModelRecord } from "@mlc-ai/web-llm";
 
 function getElementAndCheck(id: string): HTMLElement {
   const element = document.getElementById(id);
@@ -9,11 +9,6 @@ function getElementAndCheck(id: string): HTMLElement {
   return element;
 }
 
-interface ModelRecord {
-  model_url: string;
-  local_id: string;
-}
-
 interface AppConfig {
   model_list: Array<ModelRecord>;
   model_lib_map?: Record<string, string>;
@@ -159,7 +154,7 @@ class ChatUI {
   }
 
   private resetChatHistory() {
-    const clearTags = ["left", "right", "init"];
+    const clearTags = ["left", "right", "init", "error"];
     for (const tag of clearTags) {
       const matches = this.uiChat.getElementsByClassName(`msg ${tag}-msg`);
       for (const item of matches) {
@@ -173,13 +168,23 @@
 
   private async asyncInitChat() {
     if (this.chatLoaded) return;
-
+    this.requestInProgress = true;
     this.appendMessage("init", "");
     const initProgressCallback = (report) => {
       this.updateLastMessage("init", report.text);
     }
     this.chat.setInitProgressCallback(initProgressCallback);
-    await this.chat.reload(this.selectedModel, undefined, this.config);
+
+    try {
+      await this.chat.reload(this.selectedModel, undefined, this.config);
+    } catch (err) {
+      this.appendMessage("error", "Init error, " + err.toString());
+      console.log(err.stack);
+      this.unloadChat();
+      this.requestInProgress = false;
+      return;
+    }
+    this.requestInProgress = false;
     this.chatLoaded = true;
   }
 
@@ -192,16 +197,8 @@
    * Run generate
    */
   private async asyncGenerate() {
+    await this.asyncInitChat();
     this.requestInProgress = true;
-    try {
-      await this.asyncInitChat();
-    } catch (err) {
-      this.appendMessage("error", "Init error, " + err.toString());
-      console.log(err.stack);
-      this.unloadChat();
-      this.requestInProgress = false;
-      return;
-    }
    const prompt = this.uiChatInput.value;
    if (prompt == "") {
      this.requestInProgress = false;
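With `ModelRecord` now exported from the package, a UI could also pre-filter its model picker by the adapter's capabilities instead of waiting for `reload` to throw. The helper below is only a sketch (it is not part of this commit) and assumes WebGPU type definitions are available:

```ts
import { ModelRecord } from "@mlc-ai/web-llm";

// Hypothetical helper: keep only models whose required_features
// (e.g. "shader-f16") are all reported by the detected WebGPU adapter.
async function listRunnableModels(models: ModelRecord[]): Promise<ModelRecord[]> {
  const adapter = await navigator.gpu?.requestAdapter();
  if (!adapter) return [];
  return models.filter((m) =>
    (m.required_features ?? []).every((f) => adapter.features.has(f))
  );
}
```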

package-lock.json

Lines changed: 3 additions & 2 deletions
Generated file; diff not rendered.

package.json

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 {
   "name": "@mlc-ai/web-llm",
-  "version": "0.1.0",
+  "version": "0.1.2",
   "description": "Hardware accelerated language model chats on browsers",
   "main": "lib/index.js",
   "types": "lib/index.d.ts",

site/index.md

Lines changed: 4 additions & 2 deletions
@@ -27,15 +27,17 @@ Won’t it be even more amazing if we can simply open up a browser and directly
 
 ## Instructions
 
-WebGPU just shipped to Chrome. You can try out the latest Chrome 113. Chrome version ≤ 112 is not supported, and if you are using it, the demo will raise an error like `Find an error initializing the WebGPU device OperationError: Required limit (1073741824) is greater than the supported limit (268435456). - While validating maxBufferSize - While validating required limits.`
+WebGPU just shipped to Chrome. You can try out the latest Chrome 113. Chrome version ≤ 112 is not supported, and if you are using it,
+the demo will raise an error like `Find an error initializing the WebGPU device OperationError: Required limit (1073741824) is greater than the supported limit (268435456). - While validating maxBufferSize - While validating required limits.`
 We have tested it on Windows and Mac, you will need a GPU with about 6GB memory to run Vicuna-7B and about 3GB memory to run RedPajama-3B.
+Some of the models require fp16 support. To enable fp16 shaders, you will need to turn on the `allow_unsafe_apis` flag in Chrome Canary, as shown in the launch command below.
 
 If you have a Mac computer with Apple silicon, here are the instructions for you to run the chatbot demo on your browser locally:
 
 - Upgrade Chrome to version ≥ 113.
 - Launch Chrome. You are recommended to launch from terminal with the following command:
   ```
-  /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --enable-dawn-features=disable_robustness
+  /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --enable-dawn-features=allow_unsafe_apis,disable_robustness
   ```
 This command turns off the robustness check from Chrome that slows down chatbot reply to times. It is not necessary, but we strongly recommend you to start Chrome with this command.
 - Select the model you want to try out. Enter your inputs, click “Send” – we are ready to go! The chat bot will first fetch model parameters into local cache. The download may take a few minutes, only for the first run. The subsequent refreshes and runs will be faster.
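A quick way to confirm the flag took effect is to query the adapter's feature set from the devtools console; this snippet is illustrative only and not part of the repository:

```ts
// Logs whether the WebGPU "shader-f16" feature is exposed in this browser,
// e.g. after launching Chrome Canary with --enable-dawn-features=allow_unsafe_apis.
async function hasShaderF16(): Promise<boolean> {
  const adapter = await navigator.gpu?.requestAdapter();
  return adapter?.features.has("shader-f16") ?? false;
}

hasShaderF16().then((ok) => console.log("shader-f16 available:", ok));
```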

src/chat_module.ts

Lines changed: 44 additions & 11 deletions
@@ -18,7 +18,6 @@ export class ChatModule implements ChatInterface {
   private pipeline?: LLMChatPipeline;
   private initProgressCallback?: InitProgressCallback;
   private interruptSignal = false;
-  private artifactCache = new tvmjs.ArtifactCache();
 
   setInitProgressCallback(initProgressCallback: InitProgressCallback) {
     this.initProgressCallback = initProgressCallback;
@@ -31,22 +30,25 @@
       appConfig = prebuiltAppConfig;
     }
 
-    const findModelUrl = () => {
+    const findModelRecord = () => {
       const matchedItem = appConfig?.model_list.find(
         item => item.local_id == localId
       );
-      if (matchedItem !== undefined) return matchedItem.model_url;
+      if (matchedItem !== undefined) return matchedItem;
       throw Error("Cannot find model_url for " + localId);
     }
 
-    let modelUrl = findModelUrl();
+    const modelRecord = findModelRecord();
+    let modelUrl = modelRecord.model_url;
     if (!modelUrl.startsWith("http")) {
       modelUrl = new URL(modelUrl, document.URL).href;
     }
+    const configCache = new tvmjs.ArtifactCache("webllm/config");
+
     // load config
     const configUrl = new URL("mlc-chat-config.json", modelUrl).href;
     const config = await (
-      await this.artifactCache.fetchWithCache(configUrl)
+      await configCache.fetchWithCache(configUrl)
     ).json() as ChatConfig;
 
 
@@ -62,10 +64,22 @@
     }
 
     // load tvm wasm
+    const wasmCache = new tvmjs.ArtifactCache("webllm/wasm");
     const wasmUrl = findWasmUrl();
-    const wasmSource = await (
-      await this.artifactCache.fetchWithCache(wasmUrl)
-    ).arrayBuffer();
+    const fetchWasmSource = async () => {
+      if (wasmUrl.includes("localhost")) {
+        // do not cache wasm on local host as we might update code frequently
+        return await fetch(wasmUrl);
+      } else if (!wasmUrl.startsWith("http")) {
+        // do not cache wasm on the same server as it can also refresh
+        // rely on the normal caching strategy
+        return await fetch(new URL(wasmUrl, document.URL).href);
+      } else {
+        // use cache
+        return await wasmCache.fetchWithCache(wasmUrl);
+      }
+    };
+    const wasmSource = await(await fetchWasmSource()).arrayBuffer();
 
     const tvm = await tvmjs.instantiate(
       new Uint8Array(wasmSource),
@@ -88,9 +102,27 @@
     } else {
       gpuLabel += " - " + gpuDetectOutput.adapterInfo.vendor;
     }
+    if (modelRecord.required_features !== undefined) {
+      for (const feature of modelRecord.required_features) {
+        if (!gpuDetectOutput.device.features.has(feature)) {
+          if (feature == "shader-f16") {
+            throw Error(
+              "This model requires WebGPU extension shader-f16, " +
+              "which is not enabled in this browser. " +
+              "You can try Chrome Canary with flag --enable-dawn-features=allow_unsafe_apis"
+            );
+          }
+          throw Error(
+            "This model requires feature " + feature +
+            ", which is not yet supported by this browser. "
+          );
+        }
+      }
+    }
+
     tvm.initWebGPU(gpuDetectOutput.device);
     const tokenizer = await this.asyncLoadTokenizer(modelUrl, config);
-    await tvm.fetchNDArrayCache(modelUrl, tvm.webgpu());
+    await tvm.fetchNDArrayCache(modelUrl, tvm.webgpu(), "webllm/model");
 
     this.pipeline = new LLMChatPipeline(tvm, tokenizer, config);
     await this.pipeline?.asyncLoadWebGPUPiplines();
@@ -192,13 +224,14 @@
     baseUrl: string,
     config: ChatConfig
   ): Promise<Tokenizer> {
+    const modelCache = new tvmjs.ArtifactCache("webllm/model");
     if (config.tokenizer_files.includes("tokenizer.model")) {
       const url = new URL("tokenizer.model", baseUrl).href;
-      const model = await (await this.artifactCache.fetchWithCache(url)).arrayBuffer();
+      const model = await (await modelCache.fetchWithCache(url)).arrayBuffer();
       return Tokenizer.fromSentencePiece(model);
     } else if (config.tokenizer_files.includes("tokenizer.json")) {
       const url = new URL("tokenizer.json", baseUrl).href;
-      const model = await (await this.artifactCache.fetchWithCache(url)).arrayBuffer();
+      const model = await (await modelCache.fetchWithCache(url)).arrayBuffer();
       return Tokenizer.fromJSON(model);
     }
     throw Error("Cannot handle tokenizer files " + config.tokenizer_files)

src/config.ts

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ export interface ChatConfig {
 export interface ModelRecord {
   model_url: string;
   local_id: string;
+  required_features?: Array<string>;
 }
 /**
  * Extra configuration taht can be

src/index.ts

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 export {
-  AppConfig
+  ModelRecord, AppConfig
 } from "./config";
 
 