@@ -7,97 +7,105 @@ import { phonemize } from './phonemizer.js';
 
 const { Tensor, RawSession } = Supabase.ai;
 
+/* NOTE: Reference [original paper](https://arxiv.org/pdf/2306.07691#Model%20Training):
+  > All datasets were resampled to 24 kHz to match LibriTTS, and the texts
+  > were converted into phonemes using phonemizer'
+*/
+const SAMPLE_RATE = 24000; // 24 kHz
+
+/* NOTE: Reference [original paper](https://arxiv.org/pdf/2306.07691#Detailed%20Model%20Architectures):
+  > The size of s and c is 256 × 1
+*/
 const STYLE_DIM = 256;
-const SAMPLE_RATE = 24000;
 
 const MODEL_ID = 'onnx-community/Kokoro-82M-ONNX';
 
 // https://huggingface.co/onnx-community/Kokoro-82M-ONNX#samples
 const ALLOWED_VOICES = [
-  'af_bella',
-  'af_nicole',
-  'af_sarah',
-  'af_sky',
-  'am_adam',
-  'am_michael',
-  'bf_emma',
-  'bf_isabella',
-  'bm_george',
-  'bm_lewis',
+  'af_bella',
+  'af_nicole',
+  'af_sarah',
+  'af_sky',
+  'am_adam',
+  'am_michael',
+  'bf_emma',
+  'bf_isabella',
+  'bm_george',
+  'bm_lewis',
 ];
 
 const session = await RawSession.fromHuggingFace(MODEL_ID);
 
 Deno.serve(async (req) => {
-  const params = new URL(req.url).searchParams;
-  const text = params.get('text') ?? 'Hello from Supabase!';
-  const voice = params.get('voice') ?? 'af_bella';
-
-  if (!ALLOWED_VOICES.includes(voice)) {
-    return Response.json({
-      error: `invalid voice '${voice}'`,
-      must_be_one_of: ALLOWED_VOICES,
-    }, { status: 400 });
-  }
-
-  const tokenizer = await loadTokenizer();
-  const language = voice.at(0); // 'a'merican | 'b'ritish
-  const phonemes = await phonemize(text, language);
-  const { input_ids } = tokenizer(phonemes, {
-    truncation: true,
-  });
-
-  // Select voice style based on number of input tokens
-  const num_tokens = Math.max(
-    input_ids.dims.at(-1) - 2, // Without padding;
-    0,
-  );
-
-  const voiceStyle = await loadVoiceStyle(voice, num_tokens);
-
-  const { waveform } = await session.run({
-    input_ids,
-    style: voiceStyle,
-    speed: new Tensor('float32', [1], [1]),
-  });
-
-  // Do `wave` encoding from rust backend
-  const audio = await waveform.tryEncodeAudio(SAMPLE_RATE);
-
-  return new Response(audio, {
-    headers: {
-      'Content-Type': 'audio/wav',
-    },
-  });
+  const params = new URL(req.url).searchParams;
+  const text = params.get('text') ?? 'Hello from Supabase!';
+  const voice = params.get('voice') ?? 'af_bella';
+
+  if (!ALLOWED_VOICES.includes(voice)) {
+    return Response.json({
+      error: `invalid voice '${voice}'`,
+      must_be_one_of: ALLOWED_VOICES,
+    }, { status: 400 });
+  }
+
+  const tokenizer = await loadTokenizer();
+  const language = voice.at(0); // 'a'merican | 'b'ritish
+  const phonemes = await phonemize(text, language);
+  const { input_ids } = tokenizer(phonemes, {
+    truncation: true,
+  });
+
+  // Select voice style based on number of input tokens
+  const num_tokens = Math.max(
+    input_ids.dims.at(-1) - 2, // Without padding;
+    0,
+  );
+
+  const voiceStyle = await loadVoiceStyle(voice, num_tokens);
+
+  const { waveform } = await session.run({
+    input_ids,
+    style: voiceStyle,
+    speed: new Tensor('float32', [1], [1]),
+  });
+
+  // Do `wave` encoding from rust backend
+  const audio = await waveform.tryEncodeAudio(SAMPLE_RATE);
+
+  return new Response(audio, {
+    headers: {
+      'Content-Type': 'audio/wav',
+    },
+  });
 });
 
 async function loadVoiceStyle(voice: string, num_tokens: number) {
-  const voice_url =
-    `https://huggingface.co/onnx-community/Kokoro-82M-ONNX/resolve/main/voices/${voice}.bin?download=true`;
+  const voice_url =
+    `https://huggingface.co/onnx-community/Kokoro-82M-ONNX/resolve/main/voices/${voice}.bin?download=true`;
 
-  console.log('loading voice:', voice_url);
+  console.log('loading voice:', voice_url);
 
-  const voiceBuffer = await fetch(voice_url).then(async (res) => await res.arrayBuffer());
+  const voiceBuffer = await fetch(voice_url).then(async (res) => await res.arrayBuffer());
 
-  const offset = num_tokens * STYLE_DIM;
-  const voiceData = new Float32Array(voiceBuffer).slice(
-    offset,
-    offset + STYLE_DIM,
-  );
+  const offset = num_tokens * STYLE_DIM;
+  const voiceData = new Float32Array(voiceBuffer).slice(
+    offset,
+    offset + STYLE_DIM,
+  );
 
-  return new Tensor('float32', voiceData, [1, STYLE_DIM]);
+  return new Tensor('float32', voiceData, [1, STYLE_DIM]);
 }
 
 async function loadTokenizer() {
-  // BUG: invalid 'h' not JSON. That's why we need to manually fetch the assets
-  // const tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID);
+  // BUG: invalid 'h' not JSON. That's why we need to manually fetch the assets
+  // const tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID);
 
-  const tokenizerData = await fetch(
-    'https://huggingface.co/onnx-community/Kokoro-82M-ONNX/resolve/main/tokenizer.json?download=true',
-  ).then(async (res) => await res.json());
+  const tokenizerData = await fetch(
+    'https://huggingface.co/onnx-community/Kokoro-82M-ONNX/resolve/main/tokenizer.json?download=true',
+  ).then(async (res) => await res.json());
 
-  const tokenizerConfig = await fetch(
-    'https://huggingface.co/onnx-community/Kokoro-82M-ONNX/resolve/main/tokenizer_config.json?download=true',
-  ).then(async (res) => await res.json());
+  const tokenizerConfig = await fetch(
+    'https://huggingface.co/onnx-community/Kokoro-82M-ONNX/resolve/main/tokenizer_config.json?download=true',
+  ).then(async (res) => await res.json());
 
-  return new PreTrainedTokenizer(tokenizerData, tokenizerConfig);
+  return new PreTrainedTokenizer(tokenizerData, tokenizerConfig);
 }
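
For context, the handler in this diff reads `text` and `voice` from the query string and responds with 24 kHz WAV audio, or a 400 JSON error for a voice outside `ALLOWED_VOICES`. Below is a minimal client sketch of how the deployed function could be called; the endpoint URL is a placeholder (not part of this change) and should point at wherever the Edge Function is actually deployed.

```ts
// Hypothetical endpoint; replace with your deployed Edge Function URL.
const endpoint = 'https://<project-ref>.supabase.co/functions/v1/text-to-speech';

const url = new URL(endpoint);
url.searchParams.set('text', 'Hello from Supabase!');
url.searchParams.set('voice', 'af_bella'); // must be one of ALLOWED_VOICES

const res = await fetch(url);
if (!res.ok) {
  // Unknown voices come back as a 400 with a JSON body listing valid options.
  console.error(await res.json());
} else {
  // The body is WAV-encoded audio sampled at 24 kHz.
  const wavBytes = new Uint8Array(await res.arrayBuffer());
  await Deno.writeFile('speech.wav', wavBytes);
}
```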