Skip to content

Commit 0d36c6b

Browse files
committed
add audio support on webui
1 parent e55a682 commit 0d36c6b

File tree

7 files changed

+172
-88
lines changed

7 files changed

+172
-88
lines changed

tools/server/public/index.html.gz

528 Bytes
Binary file not shown.

tools/server/tests/unit/test_vision_api.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ def create_server():
3030
("What is this:\n", "malformed", False, None),
3131
("What is this:\n", "https://google.com/404", False, None), # non-existent image
3232
("What is this:\n", "https://ggml.ai", False, None), # non-image data
33+
# TODO @ngxson : test with multiple images, no images and with audio
3334
]
3435
)
3536
def test_vision_chat_completion(prompt, image_url, success, re_content):

tools/server/webui/src/components/ChatInputExtraContextItem.tsx

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
import { DocumentTextIcon, XMarkIcon } from '@heroicons/react/24/outline';
1+
import {
2+
DocumentTextIcon,
3+
SpeakerWaveIcon,
4+
XMarkIcon,
5+
} from '@heroicons/react/24/outline';
26
import { MessageExtra } from '../utils/types';
37
import { useState } from 'react';
48
import { classNames } from '../utils/misc';
@@ -66,7 +70,11 @@ export default function ChatInputExtraContextItem({
6670
className="w-14 h-14 flex items-center justify-center"
6771
aria-description="Document icon"
6872
>
69-
<DocumentTextIcon className="h-8 w-14 text-base-content/50" />
73+
{item.type === 'audioFile' ? (
74+
<SpeakerWaveIcon className="h-8 w-8 text-gray-500" />
75+
) : (
76+
<DocumentTextIcon className="h-8 w-8 text-gray-500" />
77+
)}
7078
</div>
7179

7280
<div className="text-xs pr-4">
@@ -98,6 +106,19 @@ export default function ChatInputExtraContextItem({
98106
src={showingItem.base64Url}
99107
alt={`Preview image for ${showingItem.name}`}
100108
/>
109+
) : showingItem.type === 'audioFile' ? (
110+
<audio
111+
controls
112+
className="w-full"
113+
aria-description={`Audio file ${showingItem.name}`}
114+
>
115+
<source
116+
src={`data:${showingItem.mimeType};base64,${showingItem.base64Data}`}
117+
type={showingItem.mimeType}
118+
aria-description={`Audio file ${showingItem.name}`}
119+
/>
120+
Your browser does not support the audio element.
121+
</audio>
101122
) : (
102123
<div className="overflow-x-auto">
103124
<pre className="whitespace-pre-wrap break-words text-sm">

tools/server/webui/src/components/ChatScreen.tsx

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,13 @@ export default function ChatScreen() {
278278

279279
function ServerInfo() {
280280
const { serverProps } = useAppContext();
281+
const modalities = [];
282+
if (serverProps?.modalities?.audio) {
283+
modalities.push('audio');
284+
}
285+
if (serverProps?.modalities?.vision) {
286+
modalities.push('vision');
287+
}
281288
return (
282289
<div
283290
className="card card-sm shadow-sm border-1 border-base-content/20 text-base-content/70 mb-6"
@@ -291,6 +298,13 @@ function ServerInfo() {
291298
<br />
292299
<b>Build</b>: {serverProps?.build_info}
293300
<br />
301+
{modalities.length > 0 ? (
302+
<>
303+
<b>Supported modalities:</b> {modalities.join(', ')}
304+
</>
305+
) : (
306+
''
307+
)}
294308
</p>
295309
</div>
296310
</div>

tools/server/webui/src/components/useChatExtraContext.tsx

Lines changed: 113 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorkerSrc;
1111
// This file handles uploading extra context items (a.k.a files)
1212
// It allows processing these kinds of files:
1313
// - image files (converted to base64)
14+
// - audio files (converted to base64)
1415
// - text files (including code files)
1516
// - pdf (converted to text)
1617

@@ -41,107 +42,114 @@ export function useChatExtraContext(): ChatExtraContextApi {
4142

4243
const isSupportVision = serverProps?.modalities?.vision;
4344

44-
const onFileAdded = (files: File[]) => {
45-
for (const file of files) {
46-
const mimeType = file.type;
47-
console.debug({ mimeType, file });
48-
if (file.size > 10 * 1024 * 1024) {
49-
toast.error('File is too large. Maximum size is 10MB.');
50-
break;
51-
}
52-
53-
if (mimeType.startsWith('image/')) {
54-
if (!isSupportVision) {
55-
toast.error('Multimodal is not supported by this server or model.');
45+
const onFileAdded = async (files: File[]) => {
46+
try {
47+
for (const file of files) {
48+
const mimeType = file.type;
49+
if (file.size > 10 * 1024 * 1024) {
50+
toast.error('File is too large. Maximum size is 10MB.');
5651
break;
5752
}
58-
const reader = new FileReader();
59-
reader.onload = async (event) => {
60-
if (event.target?.result) {
61-
let base64Url = event.target.result as string;
62-
63-
if (mimeType === 'image/svg+xml') {
64-
// Convert SVG to PNG
65-
base64Url = await svgBase64UrlToPngDataURL(base64Url);
66-
}
6753

68-
addItems([
69-
{
70-
type: 'imageFile',
71-
name: file.name,
72-
base64Url,
73-
},
74-
]);
54+
if (mimeType.startsWith('image/')) {
55+
if (!isSupportVision) {
56+
toast.error('Multimodal is not supported by this server or model.');
57+
break;
7558
}
76-
};
77-
reader.readAsDataURL(file);
78-
} else if (
79-
mimeType.startsWith('video/') ||
80-
mimeType.startsWith('audio/')
81-
) {
82-
toast.error('Video and audio files are not supported yet.');
83-
break;
84-
} else if (mimeType.startsWith('application/pdf')) {
85-
if (config.pdfAsImage && !isSupportVision) {
86-
toast(
87-
'Multimodal is not supported, PDF will be converted to text instead of image.'
88-
);
59+
60+
let base64Url = await getFileAsBase64(file);
61+
if (mimeType === 'image/svg+xml') {
62+
// Convert SVG to PNG
63+
base64Url = await svgBase64UrlToPngDataURL(base64Url);
64+
}
65+
addItems([
66+
{
67+
type: 'imageFile',
68+
name: file.name,
69+
base64Url,
70+
},
71+
]);
72+
} else if (mimeType.startsWith('video/')) {
73+
toast.error('Video files are not supported yet.');
8974
break;
90-
}
75+
} else if (mimeType.startsWith('audio/')) {
76+
if (!/mpeg|wav/.test(mimeType)) {
77+
toast.error('Only mp3 and wav audio files are supported.');
78+
break;
79+
}
9180

92-
const promise =
93-
config.pdfAsImage && isSupportVision
94-
? convertPDFToImage(file).then((base64Urls) => {
95-
addItems(
96-
base64Urls.map((base64Url) => ({
97-
type: 'imageFile',
98-
name: file.name,
99-
base64Url,
100-
}))
101-
);
102-
})
103-
: convertPDFToText(file).then((content) => {
104-
if (isSupportVision) {
105-
toast.success(
106-
'PDF file converted to text. You can also convert it to image, see in Settings.'
107-
);
108-
}
109-
addItems([
110-
{
111-
type: 'textFile',
112-
name: file.name,
113-
content,
114-
},
115-
]);
116-
});
117-
118-
promise.catch((error) => {
119-
console.error(error);
120-
toast.error('Failed to parse PDF file.');
121-
});
122-
break;
123-
} else {
124-
// Because there can be many text file types (like code file), we will not check the mime type
125-
// and will just check if the file is not binary.
126-
const reader = new FileReader();
127-
reader.onload = (event) => {
128-
if (event.target?.result) {
129-
const content = event.target.result as string;
130-
if (!isLikelyNotBinary(content)) {
131-
toast.error('File is binary. Please upload a text file.');
132-
return;
133-
}
81+
// plain base64, not a data URL
82+
const base64Data = await getFileAsBase64(file, false);
83+
addItems([
84+
{
85+
type: 'audioFile',
86+
name: file.name,
87+
mimeType,
88+
base64Data,
89+
},
90+
]);
91+
} else if (mimeType.startsWith('application/pdf')) {
92+
if (config.pdfAsImage && !isSupportVision) {
93+
toast(
94+
'Multimodal is not supported, PDF will be converted to text instead of image.'
95+
);
96+
break;
97+
}
98+
99+
if (config.pdfAsImage && isSupportVision) {
100+
// Convert PDF to images
101+
const base64Urls = await convertPDFToImage(file);
102+
addItems(
103+
base64Urls.map((base64Url) => ({
104+
type: 'imageFile',
105+
name: file.name,
106+
base64Url,
107+
}))
108+
);
109+
} else {
110+
// Convert PDF to text
111+
const content = await convertPDFToText(file);
134112
addItems([
135113
{
136114
type: 'textFile',
137115
name: file.name,
138116
content,
139117
},
140118
]);
119+
if (isSupportVision) {
120+
toast.success(
121+
'PDF file converted to text. You can also convert it to image, see in Settings.'
122+
);
123+
}
141124
}
142-
};
143-
reader.readAsText(file);
125+
break;
126+
} else {
127+
// Because there can be many text file types (like code file), we will not check the mime type
128+
// and will just check if the file is not binary.
129+
const reader = new FileReader();
130+
reader.onload = (event) => {
131+
if (event.target?.result) {
132+
const content = event.target.result as string;
133+
if (!isLikelyNotBinary(content)) {
134+
toast.error('File is binary. Please upload a text file.');
135+
return;
136+
}
137+
addItems([
138+
{
139+
type: 'textFile',
140+
name: file.name,
141+
content,
142+
},
143+
]);
144+
}
145+
};
146+
reader.readAsText(file);
147+
}
144148
}
149+
} catch (error) {
150+
const message = error instanceof Error ? error.message : String(error);
151+
const errorMessage = `Error processing file: ${message}`;
152+
toast.error(errorMessage);
145153
}
146154
};
147155

@@ -154,6 +162,25 @@ export function useChatExtraContext(): ChatExtraContextApi {
154162
};
155163
}
156164

165+
/**
 * Reads a file with `FileReader.readAsDataURL` and resolves with its contents
 * encoded as base64.
 *
 * @param file - The file to read.
 * @param outputUrl - When `true` (the default), resolve with the complete data
 *   URL produced by the reader. When `false`, resolve with only the payload
 *   that follows the first comma of that data URL.
 * @returns A promise for the data URL, or for the bare base64 payload.
 * @throws Rejects with an `Error` when the reader reports an error, the read
 *   is aborted, or the reader produces no (or an empty) string result.
 */
async function getFileAsBase64(file: File, outputUrl = true): Promise<string> {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = () => {
      const result = reader.result;
      if (typeof result !== 'string' || result.length === 0) {
        reject(new Error('Failed to read file.'));
        return;
      }
      if (outputUrl) {
        resolve(result);
        return;
      }
      // Strip the "data:<mime>;base64," prefix. A result without a comma
      // carries no payload at all, so return an empty string rather than
      // leaking the prefix as if it were base64 data.
      const commaIndex = result.indexOf(',');
      resolve(commaIndex === -1 ? '' : result.substring(commaIndex + 1));
    };

    // Without these handlers a failed or aborted read would leave the promise
    // pending forever, and callers awaiting it would never see an error.
    reader.onerror = () => {
      reject(reader.error ?? new Error('Failed to read file.'));
    };
    reader.onabort = () => {
      reject(new Error('File reading was aborted.'));
    };

    reader.readAsDataURL(file);
  });
}
183+
157184
async function getFileAsBuffer(file: File): Promise<ArrayBuffer> {
158185
return new Promise((resolve, reject) => {
159186
const reader = new FileReader();

tools/server/webui/src/utils/misc.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,14 @@ export function normalizeMsgsForAPI(messages: Readonly<Message[]>) {
8989
type: 'image_url',
9090
image_url: { url: extra.base64Url },
9191
});
92+
} else if (extra.type === 'audioFile') {
93+
contentArr.push({
94+
type: 'input_audio',
95+
input_audio: {
96+
data: extra.base64Data,
97+
format: /wav/.test(extra.mimeType) ? 'wav' : 'mp3',
98+
},
99+
});
92100
} else {
93101
throw new Error('Unknown extra type');
94102
}

tools/server/webui/src/utils/types.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ export interface Message {
5151
export type MessageExtra =
5252
| MessageExtraTextFile
5353
| MessageExtraImageFile
54+
| MessageExtraAudioFile
5455
| MessageExtraContext;
5556

5657
export interface MessageExtraTextFile {
@@ -65,6 +66,13 @@ export interface MessageExtraImageFile {
6566
base64Url: string;
6667
}
6768

69+
export interface MessageExtraAudioFile {
70+
type: 'audioFile';
71+
name: string;
72+
base64Data: string;
73+
mimeType: string;
74+
}
75+
6876
export interface MessageExtraContext {
6977
type: 'context';
7078
name: string;
@@ -79,6 +87,10 @@ export type APIMessageContentPart =
7987
| {
8088
type: 'image_url';
8189
image_url: { url: string };
90+
}
91+
| {
92+
type: 'input_audio';
93+
input_audio: { data: string; format: 'wav' | 'mp3' };
8294
};
8395

8496
export type APIMessage = {
@@ -120,6 +132,7 @@ export interface LlamaCppServerProps {
120132
n_ctx: number;
121133
modalities?: {
122134
vision: boolean;
135+
audio: boolean;
123136
};
124137
// TODO: support params
125138
}

0 commit comments

Comments
 (0)