Commit 910673b

stamp: adding paper references for model magic numbers
Documenting the "magic numbers" of the `text-to-audio` example; see the [original paper](https://arxiv.org/pdf/2306.07691).
1 parent: 0c67095

1 file changed: examples/text-to-audio/index.ts (78 additions, 70 deletions)
Apart from the added reference comments and the relocated `SAMPLE_RATE` declaration, the paired removals and re-additions in this hunk differ only in whitespace, so those lines appear once below as unchanged context.

@@ -7,97 +7,105 @@ import { phonemize } from './phonemizer.js';
 
 const { Tensor, RawSession } = Supabase.ai;
 
+/* NOTE: Reference [original paper](https://arxiv.org/pdf/2306.07691#Model%20Training):
+  > All datasets were resampled to 24 kHz to match LibriTTS, and the texts
+  > were converted into phonemes using phonemizer
+*/
+const SAMPLE_RATE = 24000; // 24 kHz
+
+/* NOTE: Reference [original paper](https://arxiv.org/pdf/2306.07691#Detailed%20Model%20Architectures):
+  > The size of s and c is 256 × 1
+*/
 const STYLE_DIM = 256;
-const SAMPLE_RATE = 24000;
 const MODEL_ID = 'onnx-community/Kokoro-82M-ONNX';
 
 // https://huggingface.co/onnx-community/Kokoro-82M-ONNX#samples
 const ALLOWED_VOICES = [
   'af_bella',
   'af_nicole',
   'af_sarah',
   'af_sky',
   'am_adam',
   'am_michael',
   'bf_emma',
   'bf_isabella',
   'bm_george',
   'bm_lewis',
 ];
 
 const session = await RawSession.fromHuggingFace(MODEL_ID);
 
 Deno.serve(async (req) => {
   const params = new URL(req.url).searchParams;
   const text = params.get('text') ?? 'Hello from Supabase!';
   const voice = params.get('voice') ?? 'af_bella';
 
   if (!ALLOWED_VOICES.includes(voice)) {
     return Response.json({
       error: `invalid voice '${voice}'`,
       must_be_one_of: ALLOWED_VOICES,
     }, { status: 400 });
   }
 
   const tokenizer = await loadTokenizer();
   const language = voice.at(0); // 'a'merican | 'b'ritish
   const phonemes = await phonemize(text, language);
   const { input_ids } = tokenizer(phonemes, {
     truncation: true,
   });
 
   // Select voice style based on number of input tokens
   const num_tokens = Math.max(
     input_ids.dims.at(-1) - 2, // without padding
     0,
   );
 
   const voiceStyle = await loadVoiceStyle(voice, num_tokens);
 
   const { waveform } = await session.run({
     input_ids,
     style: voiceStyle,
     speed: new Tensor('float32', [1], [1]),
   });
 
   // Do `wave` encoding from the Rust backend
   const audio = await waveform.tryEncodeAudio(SAMPLE_RATE);
 
   return new Response(audio, {
     headers: {
       'Content-Type': 'audio/wav',
     },
   });
 });
 
 async function loadVoiceStyle(voice: string, num_tokens: number) {
   const voice_url =
     `https://huggingface.co/onnx-community/Kokoro-82M-ONNX/resolve/main/voices/${voice}.bin?download=true`;
 
   console.log('loading voice:', voice_url);
 
   const voiceBuffer = await fetch(voice_url).then(async (res) => await res.arrayBuffer());
 
   const offset = num_tokens * STYLE_DIM;
   const voiceData = new Float32Array(voiceBuffer).slice(
     offset,
     offset + STYLE_DIM,
   );
 
   return new Tensor('float32', voiceData, [1, STYLE_DIM]);
 }
 
 async function loadTokenizer() {
   // BUG: invalid 'h', not JSON. That's why we need to fetch the assets manually
   // const tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID);
 
   const tokenizerData = await fetch(
     'https://huggingface.co/onnx-community/Kokoro-82M-ONNX/resolve/main/tokenizer.json?download=true',
   ).then(async (res) => await res.json());
 
   const tokenizerConfig = await fetch(
     'https://huggingface.co/onnx-community/Kokoro-82M-ONNX/resolve/main/tokenizer_config.json?download=true',
   ).then(async (res) => await res.json());
 
   return new PreTrainedTokenizer(tokenizerData, tokenizerConfig);
 }
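A note on the slice inside `loadVoiceStyle`: the Kokoro voice `.bin` files appear to be packed float32 buffers holding one 256-dimensional style vector per token count, which is why the code jumps `num_tokens * STYLE_DIM` elements in and reads the next `STYLE_DIM` values. A minimal standalone sketch of that indexing, under that layout assumption (the buffer and token count below are illustrative only):

```ts
// Sketch of the voice-style lookup, assuming the packed
// [num_entries, STYLE_DIM] float32 layout described above.
const STYLE_DIM = 256;

function styleVectorAt(voiceBuffer: ArrayBuffer, numTokens: number): Float32Array {
  const all = new Float32Array(voiceBuffer); // reinterpret the whole file as float32
  const offset = numTokens * STYLE_DIM;      // element offset, not bytes
  return all.slice(offset, offset + STYLE_DIM); // one 256-float style vector
}

// Illustrative: a 12-token prompt reads elements [3072, 3328), since 12 * 256 = 3072.
const fakeVoice = new Float32Array(32 * STYLE_DIM).buffer;
console.log(styleVectorAt(fakeVoice, 12).length); // 256
```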

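For completeness, a hedged sketch of calling the deployed function from a Deno client; the function URL and Authorization header are assumptions about a typical Supabase Edge Functions deployment, not something defined in this commit:

```ts
// Hypothetical client call; replace the placeholder project URL with your own.
const FUNCTION_URL = 'https://<project-ref>.supabase.co/functions/v1/text-to-audio';

const query = new URLSearchParams({
  text: 'Hello from Supabase!',
  voice: 'af_bella', // must be one of ALLOWED_VOICES
});

const res = await fetch(`${FUNCTION_URL}?${query}`, {
  headers: { Authorization: `Bearer ${Deno.env.get('SUPABASE_ANON_KEY')}` },
});

if (!res.ok) {
  // e.g. { error: "invalid voice '...'", must_be_one_of: [...] } for a bad voice
  console.error(await res.json());
} else {
  // The function responds with 24 kHz WAV audio (Content-Type: audio/wav).
  await Deno.writeFile('hello.wav', new Uint8Array(await res.arrayBuffer()));
}
```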