Skip to content

Commit 9c0aec4

Browse files
authored
[Vision] Support Phi-3.5-vision, the first VLM in WebLLM (#563)
This PR supports the first Vision Language Model, Phi-3.5-vision. For a full example, see `examples/vision-model`. Overall usage follows the OpenAI API. We add `Phi-3.5-vision-instruct-q4f16_1-MLC` and `Phi-3.5-vision-instruct-q4f32_1-MLC` to the prebuilt model list.
1 parent cf59d7a commit 9c0aec4

18 files changed

+1690
-271
lines changed

examples/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ These examples demonstrate various capabilities via WebLLM's OpenAI-like API.
3636
- [json-schema](json-schema): besides guaranteeing output to be in JSON, ensure output to adhere to a specific JSON schema specified the user
3737
- [seed-to-reproduce](seed-to-reproduce): use seeding to ensure reproducible output with fields `seed`.
3838
- [function-calling](function-calling) (WIP): function calling with fields `tools` and `tool_choice` (with preliminary support).
39+
- [vision-model](vision-model): process request with image input using Vision Language Model (e.g. Phi3.5-vision)
3940

4041
#### Chrome Extension
4142

examples/vision-model/README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# WebLLM Vision Model Example
2+
3+
This folder provides a minimal demo to show the WebLLM vision-model API in a webapp setting.
4+
To try it out, run the following steps under this folder:
5+
6+
```bash
7+
npm install
8+
npm start
9+
```
10+
11+
Note: if you would like to hack the WebLLM core package, you can change the web-llm
dependency to `"file:../.."` and follow the build-from-source instructions in the
project to build WebLLM locally.

examples/vision-model/package.json

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"name": "get-started",
3+
"version": "0.1.0",
4+
"private": true,
5+
"scripts": {
6+
"start": "parcel src/vision_model.html --port 8888",
7+
"build": "parcel build src/vision_model.html --dist-dir lib"
8+
},
9+
"devDependencies": {
10+
"buffer": "^5.7.1",
11+
"parcel": "^2.8.3",
12+
"process": "^0.11.10",
13+
"tslib": "^2.3.1",
14+
"typescript": "^4.9.5",
15+
"url": "^0.11.3"
16+
},
17+
"dependencies": {
18+
"@mlc-ai/web-llm": "file:../.."
19+
}
20+
}

examples/vision-model/src/utils.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
export function getImageDataFromURL(url: string): Promise<ImageData> {
2+
return new Promise((resolve, reject) => {
3+
// Converts img to any, and later `as CanvasImageSource`, otherwise build complains
4+
const img: any = new Image();
5+
img.crossOrigin = "anonymous"; // Important for CORS
6+
img.onload = () => {
7+
const canvas: HTMLCanvasElement = document.createElement("canvas");
8+
const ctx: CanvasRenderingContext2D = canvas.getContext("2d")!;
9+
canvas.width = img.width;
10+
canvas.height = img.height;
11+
ctx.drawImage(img as CanvasImageSource, 0, 0);
12+
13+
const imageData = ctx.getImageData(0, 0, img.width, img.height);
14+
resolve(imageData);
15+
};
16+
img.onerror = () => reject(new Error("Failed to load image"));
17+
img.src = url;
18+
});
19+
}
20+
21+
export async function imageURLToBase64(url: string): Promise<string> {
22+
const imageData: ImageData = await getImageDataFromURL(url);
23+
const canvas = document.createElement("canvas");
24+
const ctx = canvas.getContext("2d");
25+
26+
canvas.width = imageData.width;
27+
canvas.height = imageData.height;
28+
29+
ctx!.putImageData(imageData, 0, 0);
30+
31+
return canvas.toDataURL();
32+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<!doctype html>
2+
<html>
3+
<script>
4+
webLLMGlobal = {};
5+
</script>
6+
<body>
7+
<h2>WebLLM Test Page</h2>
8+
Open console to see output
9+
<br />
10+
<br />
11+
<label id="init-label"> </label>
12+
13+
<h3>Prompt</h3>
14+
<label id="prompt-label"> </label>
15+
16+
<h3>Response</h3>
17+
<label id="generate-label"> </label>
18+
<br />
19+
<label id="stats-label"> </label>
20+
21+
<script type="module" src="./vision_model.ts"></script>
22+
</body>
23+
</html>
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
import * as webllm from "@mlc-ai/web-llm";
2+
import { imageURLToBase64 } from "./utils";
3+
4+
function setLabel(id: string, text: string) {
5+
const label = document.getElementById(id);
6+
if (label == null) {
7+
throw Error("Cannot find label " + id);
8+
}
9+
label.innerText = text;
10+
}
11+
12+
const proxyUrl = "https://cors-anywhere.herokuapp.com/";
13+
const url_https_street = "https://www.ilankelman.org/stopsigns/australia.jpg";
14+
const url_https_tree = "https://www.ilankelman.org/sunset.jpg";
15+
const url_https_sea =
16+
"https://www.islandvulnerability.org/index/silhouette.jpg";
17+
18+
async function main() {
19+
// can feed request with either base64 or http url
20+
const url_base64_street = await imageURLToBase64(proxyUrl + url_https_street);
21+
22+
const initProgressCallback = (report: webllm.InitProgressReport) => {
23+
setLabel("init-label", report.text);
24+
};
25+
const selectedModel = "Phi-3.5-vision-instruct-q4f16_1-MLC";
26+
const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
27+
selectedModel,
28+
{
29+
initProgressCallback: initProgressCallback,
30+
logLevel: "INFO", // specify the log level
31+
},
32+
{
33+
context_window_size: 6144,
34+
},
35+
);
36+
37+
// 1. Single image input (with choices)
38+
const messages: webllm.ChatCompletionMessageParam[] = [
39+
{
40+
role: "user",
41+
content: [
42+
{ type: "text", text: "List the items in each image concisely." },
43+
{
44+
type: "image_url",
45+
image_url: {
46+
url: url_base64_street,
47+
},
48+
},
49+
{
50+
type: "image_url",
51+
image_url: {
52+
url: proxyUrl + url_https_sea,
53+
},
54+
},
55+
],
56+
},
57+
];
58+
const request0: webllm.ChatCompletionRequest = {
59+
stream: false, // can be streaming, same behavior
60+
messages: messages,
61+
};
62+
const reply0 = await engine.chat.completions.create(request0);
63+
const replyMessage0 = await engine.getMessage();
64+
console.log(reply0);
65+
console.log(replyMessage0);
66+
console.log(reply0.usage);
67+
68+
// 2. A follow up text-only question
69+
messages.push({ role: "assistant", content: replyMessage0 });
70+
messages.push({ role: "user", content: "What is special about each image?" });
71+
const request1: webllm.ChatCompletionRequest = {
72+
stream: false, // can be streaming, same behavior
73+
messages: messages,
74+
};
75+
const reply1 = await engine.chat.completions.create(request1);
76+
const replyMessage1 = await engine.getMessage();
77+
console.log(reply1);
78+
console.log(replyMessage1);
79+
console.log(reply1.usage);
80+
81+
// 3. A follow up multi-image question
82+
messages.push({ role: "assistant", content: replyMessage1 });
83+
messages.push({
84+
role: "user",
85+
content: [
86+
{ type: "text", text: "What about this image? Answer concisely." },
87+
{
88+
type: "image_url",
89+
image_url: { url: proxyUrl + url_https_tree },
90+
},
91+
],
92+
});
93+
const request2: webllm.ChatCompletionRequest = {
94+
stream: false, // can be streaming, same behavior
95+
messages: messages,
96+
};
97+
const reply2 = await engine.chat.completions.create(request2);
98+
const replyMessage2 = await engine.getMessage();
99+
console.log(reply2);
100+
console.log(replyMessage2);
101+
console.log(reply2.usage);
102+
}
103+
104+
main();

src/config.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ export function postInitAndCheckGenerationConfigValues(
229229
export enum ModelType {
230230
"LLM",
231231
"embedding",
232+
"VLM", // vision-language model
232233
}
233234

234235
/**
@@ -512,6 +513,37 @@ export const prebuiltAppConfig: AppConfig = {
512513
context_window_size: 1024,
513514
},
514515
},
516+
// Phi-3.5-vision-instruct
517+
{
518+
model:
519+
"https://huggingface.co/mlc-ai/Phi-3.5-vision-instruct-q4f16_1-MLC",
520+
model_id: "Phi-3.5-vision-instruct-q4f16_1-MLC",
521+
model_lib:
522+
modelLibURLPrefix +
523+
modelVersion +
524+
"/Phi-3.5-vision-instruct-q4f16_1-ctx4k_cs2k-webgpu.wasm",
525+
vram_required_MB: 3952.18,
526+
low_resource_required: true,
527+
overrides: {
528+
context_window_size: 4096,
529+
},
530+
model_type: ModelType.VLM,
531+
},
532+
{
533+
model:
534+
"https://huggingface.co/mlc-ai/Phi-3.5-vision-instruct-q4f32_1-MLC",
535+
model_id: "Phi-3.5-vision-instruct-q4f32_1-MLC",
536+
model_lib:
537+
modelLibURLPrefix +
538+
modelVersion +
539+
"/Phi-3.5-vision-instruct-q4f32_1-ctx4k_cs2k-webgpu.wasm",
540+
vram_required_MB: 5879.84,
541+
low_resource_required: true,
542+
overrides: {
543+
context_window_size: 4096,
544+
},
545+
model_type: ModelType.VLM,
546+
},
515547
// Mistral variants
516548
{
517549
model:

0 commit comments

Comments
 (0)