diff --git a/examples/README.md b/examples/README.md
index 74c5f47d..3d1265b0 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -22,6 +22,7 @@ Note that all examples below run in-browser and use WebGPU as a backend.
- [simple-chat-ts](simple-chat-ts): a mininum and complete chat bot app in TypeScript.
- [get-started-web-worker](get-started-web-worker): same as get-started, but using web worker.
- [next-simple-chat](next-simple-chat): a mininum and complete chat bot app with [Next.js](https://nextjs.org/).
+- [wasm-gating](wasm-gating): capability-based routing between baseline and subgroup WebGPU WASM builds.
- [multi-round-chat](multi-round-chat): while APIs are functional, we internally optimize so that multi round chat usage can reuse KV cache
- [text-completion](text-completion): demonstrates API `engine.completions.create()`, which is pure text completion with no conversation, as opposed to `engine.chat.completions.create()`
- [embeddings](embeddings): demonstrates API `engine.embeddings.create()`, integration with `EmbeddingsInterface` and `MemoryVectorStore` of [Langchain.js](https://js.langchain.com), and RAG with Langchain.js using WebLLM for both LLM and Embedding in a single engine
diff --git a/examples/wasm-gating/README.md b/examples/wasm-gating/README.md
new file mode 100644
index 00000000..a460a74a
--- /dev/null
+++ b/examples/wasm-gating/README.md
@@ -0,0 +1,19 @@
+# WebLLM Wasm Gating App
+
+This folder provides a minimum demo to show capability-based routing between
+baseline and subgroup WebGPU WASM builds in a webapp setting.
+To try it out, you can do the following steps under this folder
+
+```bash
+npm install
+npm start
+```
+
+Edit `src/wasm_gating.ts` if you would like to point the example at your own
+model path and baseline `model_lib`. The example will switch to
+`-subgroups.wasm` when the adapter reports subgroup support.
+
+Note: if you would like to hack on the WebLLM core package, you can change the
+WebLLM dependency to `"file:../.."` and follow the build-from-source
+instructions in the project to build WebLLM locally. This option is only
+recommended if you need changes to the WebLLM core package itself.
diff --git a/examples/wasm-gating/package.json b/examples/wasm-gating/package.json
new file mode 100644
index 00000000..1d144aa7
--- /dev/null
+++ b/examples/wasm-gating/package.json
@@ -0,0 +1,25 @@
+{
+ "name": "wasm-gating",
+ "version": "0.1.0",
+ "private": true,
+ "scripts": {
+ "start": "parcel src/wasm_gating.html --port 8888",
+ "build": "parcel build src/wasm_gating.html --dist-dir lib"
+ },
+ "devDependencies": {
+ "buffer": "^5.7.1",
+ "crypto-browserify": "^3.12.1",
+ "events": "^3.3.0",
+ "parcel": "^2.8.3",
+ "process": "^0.11.10",
+ "stream-browserify": "^3.0.0",
+ "string_decoder": "^1.3.0",
+ "tslib": "^2.3.1",
+ "typescript": "^4.9.5",
+ "url": "^0.11.3",
+ "vm-browserify": "^1.1.2"
+ },
+ "dependencies": {
+ "@mlc-ai/web-llm": "^0.2.82"
+ }
+}
diff --git a/examples/wasm-gating/src/wasm_gating.html b/examples/wasm-gating/src/wasm_gating.html
new file mode 100644
index 00000000..388bfae8
--- /dev/null
+++ b/examples/wasm-gating/src/wasm_gating.html
@@ -0,0 +1,26 @@
+<!DOCTYPE html>
+<html>
+  <script>
+    webLLMGlobal = {};
+  </script>
+  <body>
+    <h2>WebLLM Test Page</h2>
+    Open console to see output
+    <br />
+    <br />
+    <label id="init-label"> </label>
+    <br />
+    <br />
+    <h3>Prompt</h3>
+    <label id="prompt-label"> </label>
+    <br />
+    <h3>Response</h3>
+    <label id="generate-label"> </label>
+    <br />
+    <label id="stats-label"> </label>
+    <br />
+    <br />
+
+    <script type="module" src="./wasm_gating.ts"></script>
+  </body>
+</html>
diff --git a/examples/wasm-gating/src/wasm_gating.ts b/examples/wasm-gating/src/wasm_gating.ts
new file mode 100644
index 00000000..d3dce1a5
--- /dev/null
+++ b/examples/wasm-gating/src/wasm_gating.ts
@@ -0,0 +1,123 @@
+import * as webllm from "@mlc-ai/web-llm";
+
+/**
+ * Set the innerText of the DOM element with the given id.
+ * Throws if no element with that id exists.
+ */
+function setLabel(id: string, text: string) {
+  const label = document.getElementById(id);
+  if (label == null) {
+    throw Error("Cannot find label " + id);
+  }
+  label.innerText = text;
+}
+
+/**
+ * Pick a baseline or subgroup WASM build based on adapter capabilities,
+ * load the model, and run one chat completion request.
+ */
+async function main() {
+  const initProgressCallback = (report: webllm.InitProgressReport) => {
+    setLabel("init-label", report.text);
+  };
+
+  const selectedModel = "Llama-3.2-1B-Instruct-q4f16_1-MLC";
+  // Probe the WebGPU adapter so the model_lib can be gated on its features.
+  // The `as any` cast covers DOM lib versions without WebGPU typings.
+  const adapter = await (navigator as any).gpu?.requestAdapter({
+    powerPreference: "high-performance",
+  });
+  if (adapter == null) {
+    throw Error("Unable to request a WebGPU adapter.");
+  }
+  const supportsSubgroups = adapter.features.has("subgroups");
+  // Option 1: If we do not specify appConfig, we use `prebuiltAppConfig` defined in `config.ts`
+  const modelRecord = webllm.prebuiltAppConfig.model_list.find(
+    (entry) => entry.model_id === selectedModel,
+  );
+  const appConfig =
+    supportsSubgroups && modelRecord !== undefined
+      ? {
+          model_list: [
+            {
+              ...modelRecord,
+              model_lib: modelRecord.model_lib.replace(
+                /\.wasm$/,
+                "-subgroups.wasm",
+              ),
+            },
+          ],
+        }
+      : undefined;
+  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
+    selectedModel,
+    {
+      appConfig: appConfig,
+      initProgressCallback: initProgressCallback,
+      logLevel: "INFO", // specify the log level
+    },
+    // customize kv cache, use either context_window_size or sliding_window_size (with attention sink)
+    {
+      context_window_size: 2048,
+      // sliding_window_size: 1024,
+      // attention_sink_size: 4,
+    },
+  );
+
+  // Option 2: Specify your own model other than the prebuilt ones
+  // const appConfig: webllm.AppConfig = {
+  //   model_list: [
+  //     {
+  //       model: "http://127.0.0.1:8000/models/Llama-3.2-1B-Instruct-q4f16_1-MLC/",
+  //       model_id: "Llama-3.2-1B-Instruct-q4f16_1-MLC",
+  //       model_lib: "http://127.0.0.1:8000/libs/Llama-3.2-1B-Instruct-q4f16_1-webgpu.wasm",
+  //       overrides: {
+  //         context_window_size: 2048,
+  //       },
+  //     },
+  //   ],
+  // };
+  // if (supportsSubgroups) {
+  //   appConfig.model_list[0].model_lib = appConfig.model_list[0].model_lib.replace(
+  //     /\.wasm$/,
+  //     "-subgroups.wasm",
+  //   );
+  // }
+  // const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
+  //   selectedModel,
+  //   { appConfig: appConfig, initProgressCallback: initProgressCallback },
+  // );
+
+  // Option 3: Instantiate MLCEngine() and call reload() separately
+  // const engine: webllm.MLCEngineInterface = new webllm.MLCEngine({
+  //   appConfig: appConfig, // if do not specify, we use webllm.prebuiltAppConfig
+  //   initProgressCallback: initProgressCallback,
+  // });
+  // await engine.reload(selectedModel);
+
+  const reply0 = await engine.chat.completions.create({
+    messages: [{ role: "user", content: "List three US states." }],
+    // below configurations are all optional
+    n: 3,
+    temperature: 1.5,
+    max_tokens: 256,
+    // 46510 and 7188 are "California", and 8421 and 51325 are "Texas" in the
+    // Llama 3 tokenizer (shared by Llama-3.2-1B-Instruct), so the latter two
+    // are more likely and the first never appears in the answer.
+    logit_bias: {
+      "46510": -100,
+      "7188": -100,
+      "8421": 5,
+      "51325": 5,
+    },
+    logprobs: true,
+    top_logprobs: 2,
+  });
+  console.log(reply0);
+  console.log(reply0.usage);
+
+  // To change model, either create a new engine via `CreateMLCEngine()`, or call `engine.reload(modelId)`
+}
+
+// Report errors instead of leaving an unhandled promise rejection.
+main().catch(console.error);