diff --git a/examples/README.md b/examples/README.md index 74c5f47d..3d1265b0 100644 --- a/examples/README.md +++ b/examples/README.md @@ -22,6 +22,7 @@ Note that all examples below run in-browser and use WebGPU as a backend. - [simple-chat-ts](simple-chat-ts): a mininum and complete chat bot app in TypeScript. - [get-started-web-worker](get-started-web-worker): same as get-started, but using web worker. - [next-simple-chat](next-simple-chat): a mininum and complete chat bot app with [Next.js](https://nextjs.org/). +- [wasm-gating](wasm-gating): capability-based routing between baseline and subgroup WebGPU WASM builds. - [multi-round-chat](multi-round-chat): while APIs are functional, we internally optimize so that multi round chat usage can reuse KV cache - [text-completion](text-completion): demonstrates API `engine.completions.create()`, which is pure text completion with no conversation, as opposed to `engine.chat.completions.create()` - [embeddings](embeddings): demonstrates API `engine.embeddings.create()`, integration with `EmbeddingsInterface` and `MemoryVectorStore` of [Langchain.js](https://js.langchain.com), and RAG with Langchain.js using WebLLM for both LLM and Embedding in a single engine diff --git a/examples/wasm-gating/README.md b/examples/wasm-gating/README.md new file mode 100644 index 00000000..a460a74a --- /dev/null +++ b/examples/wasm-gating/README.md @@ -0,0 +1,19 @@ +# WebLLM Wasm Gating App + +This folder provides a minimum demo to show capability-based routing between +baseline and subgroup WebGPU WASM builds in a webapp setting. +To try it out, you can do the following steps under this folder + +```bash +npm install +npm start +``` + +Edit `src/wasm_gating.ts` if you would like to point the example at your own +model path and baseline `model_lib`. The example will switch to +`-subgroups.wasm` when the adapter reports subgroup support. + +Note if you would like to hack WebLLM core package. 
+You can change the WebLLM dependency to `"file:../.."`, and follow the build +from source instructions in the project to build webllm locally. This option is only recommended +if you would like to hack WebLLM core package. diff --git a/examples/wasm-gating/package.json b/examples/wasm-gating/package.json new file mode 100644 index 00000000..1d144aa7 --- /dev/null +++ b/examples/wasm-gating/package.json @@ -0,0 +1,25 @@ +{ + "name": "wasm-gating", + "version": "0.1.0", + "private": true, + "scripts": { + "start": "parcel src/wasm_gating.html --port 8888", + "build": "parcel build src/wasm_gating.html --dist-dir lib" + }, + "devDependencies": { + "buffer": "^5.7.1", + "crypto-browserify": "^3.12.1", + "events": "^3.3.0", + "parcel": "^2.8.3", + "process": "^0.11.10", + "stream-browserify": "^3.0.0", + "string_decoder": "^1.3.0", + "tslib": "^2.3.1", + "typescript": "^4.9.5", + "url": "^0.11.3", + "vm-browserify": "^1.1.2" + }, + "dependencies": { + "@mlc-ai/web-llm": "^0.2.82" + } +} diff --git a/examples/wasm-gating/src/wasm_gating.html b/examples/wasm-gating/src/wasm_gating.html new file mode 100644 index 00000000..388bfae8 --- /dev/null +++ b/examples/wasm-gating/src/wasm_gating.html @@ -0,0 +1,26 @@ + + + + +

WebLLM Test Page

+ Open console to see output +
+
+ +
+
+

Prompt

+ +
+
+

Response

+ +
+
+ + + + + diff --git a/examples/wasm-gating/src/wasm_gating.ts b/examples/wasm-gating/src/wasm_gating.ts new file mode 100644 index 00000000..d3dce1a5 --- /dev/null +++ b/examples/wasm-gating/src/wasm_gating.ts @@ -0,0 +1,111 @@ +import * as webllm from "@mlc-ai/web-llm"; + +function setLabel(id: string, text: string) { + const label = document.getElementById(id); + if (label == null) { + throw Error("Cannot find label " + id); + } + label.innerText = text; +} + +async function main() { + const initProgressCallback = (report: webllm.InitProgressReport) => { + setLabel("init-label", report.text); + }; + + const selectedModel = "Llama-3.2-1B-Instruct-q4f16_1-MLC"; + const adapter = await (navigator as any).gpu?.requestAdapter({ + powerPreference: "high-performance", + }); + if (adapter == null) { + throw Error("Unable to request a WebGPU adapter."); + } + const supportsSubgroups = adapter.features.has("subgroups"); + // Option 1: If we do not specify appConfig, we use `prebuiltAppConfig` defined in `config.ts` + const modelRecord = webllm.prebuiltAppConfig.model_list.find( + (entry) => entry.model_id === selectedModel, + ); + const appConfig = + supportsSubgroups && modelRecord !== undefined + ? 
{ + model_list: [ + { + ...modelRecord, + model_lib: modelRecord.model_lib.replace( + /\.wasm$/, + "-subgroups.wasm", + ), + }, + ], + } + : undefined; + const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine( + selectedModel, + { + appConfig: appConfig, + initProgressCallback: initProgressCallback, + logLevel: "INFO", // specify the log level + }, + // customize kv cache, use either context_window_size or sliding_window_size (with attention sink) + { + context_window_size: 2048, + // sliding_window_size: 1024, + // attention_sink_size: 4, + }, + ); + + // Option 2: Specify your own model other than the prebuilt ones + // const appConfig: webllm.AppConfig = { + // model_list: [ + // { + // model: "http://127.0.0.1:8000/models/Llama-3.2-1B-Instruct-q4f16_1-MLC/", + // model_id: "Llama-3.2-1B-Instruct-q4f16_1-MLC", + // model_lib: "http://127.0.0.1:8000/libs/Llama-3.2-1B-Instruct-q4f16_1-webgpu.wasm", + // overrides: { + // context_window_size: 2048, + // }, + // }, + // ], + // }; + // if (supportsSubgroups) { + // appConfig.model_list[0].model_lib = appConfig.model_list[0].model_lib.replace( + // /\.wasm$/, + // "-subgroups.wasm", + // ); + // } + // const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine( + // selectedModel, + // { appConfig: appConfig, initProgressCallback: initProgressCallback }, + // ); + + // Option 3: Instantiate MLCEngine() and call reload() separately + // const engine: webllm.MLCEngineInterface = new webllm.MLCEngine({ + // appConfig: appConfig, // if do not specify, we use webllm.prebuiltAppConfig + // initProgressCallback: initProgressCallback, + // }); + // await engine.reload(selectedModel); + + const reply0 = await engine.chat.completions.create({ + messages: [{ role: "user", content: "List three US states." 
}], + // below configurations are all optional + n: 3, + temperature: 1.5, + max_tokens: 256, + // 46510 and 7188 are "California", and 8421 and 51325 are "Texas" in the Llama-3 tokenizer + // (shared by the Llama-3.2-1B-Instruct model used above). So we would have a higher chance + // of seeing the latter two, but never the first in the answer + logit_bias: { + "46510": -100, + "7188": -100, + "8421": 5, + "51325": 5, + }, + logprobs: true, + top_logprobs: 2, + }); + console.log(reply0); + console.log(reply0.usage); + + // To change model, either create a new engine via `CreateMLCEngine()`, or call `engine.reload(modelId)` +} + +main();