Skip to content

Commit 44e9e02

Browse files
julien-c and SBrandeis
authored
OuteAI/OuteTTS-0.3-1B is a text-to-speech (#1117)
Co-authored-by: Simon Brandeis <[email protected]>
1 parent c83cc3e commit 44e9e02

File tree

6 files changed

+173
-12
lines changed

6 files changed

+173
-12
lines changed

packages/inference/src/lib/makeRequestOptions.ts

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ export async function makeRequestOptions(
3232
chatCompletion?: boolean;
3333
}
3434
): Promise<{ url: string; info: RequestInit }> {
35-
const { accessToken, endpointUrl, provider: maybeProvider, model: maybeModel, ...otherArgs } = args;
35+
const { accessToken, endpointUrl, provider: maybeProvider, model: maybeModel, ...remainingArgs } = args;
36+
let otherArgs = remainingArgs;
3637
const provider = maybeProvider ?? "hf-inference";
3738

3839
const { forceTask, includeCredentials, taskHint, wait_for_model, use_cache, dont_load_model, chatCompletion } =
@@ -120,12 +121,13 @@ export async function makeRequestOptions(
120121
credentials = "include";
121122
}
122123

123-
/*
124+
/**
125+
* Replicate models wrap all inputs inside { input: ... }
124126
* Versioned Replicate models in the format `owner/model:version` expect the version in the body
125127
*/
126-
if (provider === "replicate" && model.includes(":")) {
127-
const version = model.split(":")[1];
128-
(otherArgs as typeof otherArgs & { version: string }).version = version;
128+
if (provider === "replicate") {
129+
const version = model.includes(":") ? model.split(":")[1] : undefined;
130+
(otherArgs as unknown) = { input: otherArgs, version };
129131
}
130132

131133
const info: RequestInit = {

packages/inference/src/providers/replicate.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ export const REPLICATE_SUPPORTED_MODEL_IDS: ProviderMapping<ReplicateId> = {
1010
"ByteDance/SDXL-Lightning":
1111
"bytedance/sdxl-lightning-4step:5599ed30703defd1d160a25a63321b4dec97101d98b4674bcc56e41f62f35637",
1212
},
13-
// "text-to-speech": {
14-
// "SWivid/F5-TTS": "x-lance/f5-tts:87faf6dd7a692dd82043f662e76369cab126a2cf1937e25a9d41e0b834fd230e"
15-
// },
13+
"text-to-speech": {
14+
"OuteAI/OuteTTS-0.3-500M": "jbilcke/oute-tts:39a59319327b27327fa3095149c5a746e7f2aee18c75055c3368237a6503cd26",
15+
},
1616
};

packages/inference/src/tasks/audio/textToSpeech.ts

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,31 @@ export type TextToSpeechArgs = BaseArgs & {
1010
};
1111

1212
export type TextToSpeechOutput = Blob;
13-
13+
interface OutputUrlTextToSpeechGeneration {
14+
output: string | string[];
15+
}
1416
/**
1517
* This task synthesizes audio of a voice pronouncing a given text.
1618
* Recommended model: espnet/kan-bayashi_ljspeech_vits
1719
*/
1820
export async function textToSpeech(args: TextToSpeechArgs, options?: Options): Promise<TextToSpeechOutput> {
19-
const res = await request<TextToSpeechOutput>(args, {
21+
const res = await request<TextToSpeechOutput | OutputUrlTextToSpeechGeneration>(args, {
2022
...options,
2123
taskHint: "text-to-speech",
2224
});
25+
if (res && typeof res === "object") {
26+
if ("output" in res) {
27+
if (typeof res.output === "string") {
28+
const urlResponse = await fetch(res.output);
29+
const blob = await urlResponse.blob();
30+
return blob;
31+
} else if (Array.isArray(res.output)) {
32+
const urlResponse = await fetch(res.output[0]);
33+
const blob = await urlResponse.blob();
34+
return blob;
35+
}
36+
}
37+
}
2338
const isValidOutput = res && res instanceof Blob;
2439
if (!isValidOutput) {
2540
throw new InferenceOutputError("Expected Blob");

packages/inference/src/tasks/cv/textToImage.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,10 @@ interface OutputUrlImageGeneration {
5959
export async function textToImage(args: TextToImageArgs, options?: Options): Promise<TextToImageOutput> {
6060
if (args.provider === "together" || args.provider === "fal-ai") {
6161
args.prompt = args.inputs;
62-
args.inputs = "";
62+
delete (args as unknown as { inputs: unknown }).inputs;
6363
args.response_format = "base64";
6464
} else if (args.provider === "replicate") {
65-
args.input = { prompt: args.inputs };
65+
args.prompt = args.inputs;
6666
delete (args as unknown as { inputs: unknown }).inputs;
6767
}
6868
const res = await request<TextToImageOutput | Base64ImageGeneration | OutputUrlImageGeneration>(args, {

packages/inference/test/HfInference.spec.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -829,6 +829,16 @@ describe.concurrent("HfInference", () => {
829829
});
830830
expect(res).toBeInstanceOf(Blob);
831831
});
832+
833+
it("textToSpeech OuteTTS", async () => {
834+
const res = await client.textToSpeech({
835+
model: "OuteAI/OuteTTS-0.3-500M",
836+
provider: "replicate",
837+
inputs: "OuteTTS is a frontier TTS model for its size of 1 Billion parameters",
838+
});
839+
840+
expect(res).toBeInstanceOf(Blob);
841+
});
832842
},
833843
TIMEOUT
834844
);

packages/inference/test/tapes.json

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4283,5 +4283,139 @@
42834283
"vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers"
42844284
}
42854285
}
4286+
},
4287+
"0588849bd8db5bca4bbb36916af37a03031b04788867d0db6634ff93cf19ded0": {
4288+
"url": "https://api.replicate.com/v1/predictions",
4289+
"init": {
4290+
"headers": {
4291+
"Content-Type": "application/json",
4292+
"Prefer": "wait"
4293+
},
4294+
"method": "POST",
4295+
"body": "{\"input\":{\"inputs\":\"OuteTTS is a frontier TTS model for its size of 1 Billion parameters\"},\"version\":\"39a59319327b27327fa3095149c5a746e7f2aee18c75055c3368237a6503cd26\"}"
4296+
},
4297+
"response": {
4298+
"body": "{\"id\":\"vxnyb0rbe9rm80cmgj1vs1t53w\",\"model\":\"jbilcke/oute-tts\",\"version\":\"39a59319327b27327fa3095149c5a746e7f2aee18c75055c3368237a6503cd26\",\"input\":{\"inputs\":\"OuteTTS is a frontier TTS model for its size of 1 Billion parameters\"},\"logs\":\"\",\"output\":\"https://replicate.delivery/xezq/U5zqJthcGtqOAJcfYTF6Xudm8txQmSELa9oqcxR6ZWXZThDKA/output.wav\",\"data_removed\":false,\"error\":null,\"status\":\"processing\",\"created_at\":\"2025-01-20T16:19:28.242Z\",\"urls\":{\"cancel\":\"https://api.replicate.com/v1/predictions/vxnyb0rbe9rm80cmgj1vs1t53w/cancel\",\"get\":\"https://api.replicate.com/v1/predictions/vxnyb0rbe9rm80cmgj1vs1t53w\",\"stream\":\"https://stream.replicate.com/v1/files/bcwr-4lk3m6mpf6hkv27z2a625rwft7baeha2ryx5nd6pdpnfxt3wqbja\"}}",
4299+
"status": 201,
4300+
"statusText": "Created",
4301+
"headers": {
4302+
"alt-svc": "h3=\":443\"; ma=86400",
4303+
"cf-cache-status": "DYNAMIC",
4304+
"cf-ray": "905062e3cfc2d642-CDG",
4305+
"connection": "keep-alive",
4306+
"content-type": "application/json; charset=UTF-8",
4307+
"nel": "{\"success_fraction\":0,\"report_to\":\"cf-nel\",\"max_age\":604800}",
4308+
"preference-applied": "wait=60",
4309+
"ratelimit-remaining": "599",
4310+
"ratelimit-reset": "1",
4311+
"report-to": "{\"endpoints\":[{\"url\":\"https:\\/\\/a.nel.cloudflare.com\\/report\\/v4?s=OXZ56%2FXmwQi53DKlAoxLJ9Ib85Mm0WyD8%2F6BfygHgwaealJK7sn4mztJhIlQybIWJwTUdh1m%2B2XxemHSbupiRN5lMqyLYNLeH3u6WYxkUFOK6v%2FpSsjN9D27mvwJ2JBsfmGnDGbc4AhqoTLMSpmi\"}],\"group\":\"cf-nel\",\"max_age\":604800}",
4312+
"server": "cloudflare",
4313+
"server-timing": "cfL4;desc=\"?proto=TCP&rtt=5348&min_rtt=5271&rtt_var=2131&sent=5&recv=5&lost=0&retrans=0&sent_bytes=2849&recv_bytes=979&delivery_rate=686704&cwnd=252&unsent_bytes=0&cid=08dbc93a75c5b1df&ts=34187&x=0\"",
4314+
"strict-transport-security": "max-age=15552000",
4315+
"vary": "Accept-Encoding"
4316+
}
4317+
}
4318+
},
4319+
"89c2957dc10eb8b7ac6415cdd14447d4c54437354183fe0c7d0c3ef78cf34ad2": {
4320+
"url": "https://replicate.delivery/xezq/U5zqJthcGtqOAJcfYTF6Xudm8txQmSELa9oqcxR6ZWXZThDKA/output.wav",
4321+
"init": {},
4322+
"response": {
4323+
"body": "",
4324+
"status": 200,
4325+
"statusText": "OK",
4326+
"headers": {
4327+
"accept-ranges": "bytes",
4328+
"access-control-allow-origin": "*",
4329+
"alt-svc": "h3=\":443\"; ma=2592000,h3-29=\":443\"; ma=2592000",
4330+
"cache-control": "public,max-age=3600",
4331+
"cache-id": "PAR-31976c84",
4332+
"cache-status": "miss",
4333+
"content-type": "audio/x-wav",
4334+
"etag": "\"f118ce7abd9171ff463e1319fd4c27cc\"",
4335+
"last-modified": "Mon, 20 Jan 2025 16:20:02 GMT",
4336+
"server": "UploadServer"
4337+
}
4338+
}
4339+
},
4340+
"46c9f8f46a2b7559af00877ac5eba3cf1723570f8f562de386dfd42820b61f29": {
4341+
"url": "https://fal.run/fal-ai/flux/schnell",
4342+
"init": {
4343+
"headers": {
4344+
"Content-Type": "application/json"
4345+
},
4346+
"method": "POST",
4347+
"body": "{\"prompt\":\"black forest gateau cake spelling out the words FLUX SCHNELL, tasty, food photography, dynamic shot\",\"response_format\":\"base64\"}"
4348+
},
4349+
"response": {
4350+
"body": "{\"images\":[{\"url\":\"https://fal.media/files/lion/diFpxNG0A6E45szVv6Zee.png\",\"width\":1024,\"height\":768,\"content_type\":\"image/jpeg\"}],\"timings\":{\"inference\":0.3554951280821115},\"seed\":288907632,\"has_nsfw_concepts\":[false],\"prompt\":\"black forest gateau cake spelling out the words FLUX SCHNELL, tasty, food photography, dynamic shot\"}",
4351+
"status": 200,
4352+
"statusText": "OK",
4353+
"headers": {
4354+
"connection": "keep-alive",
4355+
"content-type": "application/json",
4356+
"strict-transport-security": "max-age=31536000; includeSubDomains"
4357+
}
4358+
}
4359+
},
4360+
"7f69e94a720f6c2c4702c164c1004dd035606c8b53bf31beb27f497c60c834cd": {
4361+
"url": "https://fal.media/files/lion/diFpxNG0A6E45szVv6Zee.png",
4362+
"init": {},
4363+
"response": {
4364+
"body": "",
4365+
"status": 200,
4366+
"statusText": "OK",
4367+
"headers": {
4368+
"access-control-allow-headers": "*",
4369+
"access-control-allow-methods": "*",
4370+
"access-control-allow-origin": "*",
4371+
"access-control-max-age": "86400",
4372+
"cf-ray": "90506d1b8b2a6981-CDG",
4373+
"connection": "keep-alive",
4374+
"content-type": "image/jpeg",
4375+
"server": "cloudflare",
4376+
"vary": "Accept-Encoding"
4377+
}
4378+
}
4379+
},
4380+
"d09e45016a7c762377dd75b54fee128379098a0bf9fff7bf4f2e8341c6aba53e": {
4381+
"url": "https://api.together.xyz/v1/images/generations",
4382+
"init": {
4383+
"headers": {
4384+
"Content-Type": "application/json"
4385+
},
4386+
"method": "POST",
4387+
"body": "{\"prompt\":\"award winning high resolution photo of a giant tortoise\",\"response_format\":\"base64\",\"model\":\"stabilityai/stable-diffusion-xl-base-1.0\"}"
4388+
},
4389+
"response": {
4390+
"body": "{\"id\":\"90506d280b7b2298-CDG\",\"model\":\"stabilityai/stable-diffusion-xl-base-1.0\",\"object\":\"list\",\"data\":[{\"timings\":{\"inference\":4469},\"index\":0,\"b64_json\":\"/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAQABAADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwDlh2OKQkZ4pi5MZPcGnDOK5Ch4XceeKcBzTd22kUkmgC0hVRk09JfmOMY9Kr/Me5P409I3ZsDINKwxztzTVRnyw6euQKVouM5Jp65WMDB9qAGIdpPJJHel83PakDDkY/M0wjJ4ppgS7u9OU/8A66iBx6U7dQBOAMc05cE4qAMa\"}]}",
4391+
"status": 200,
4392+
"statusText": "OK",
4393+
"headers": {
4394+
"access-control-allow-origin": "*",
4395+
"alt-svc": "h3=\":443\"; ma=86400",
4396+
"cf-cache-status": "DYNAMIC",
4397+
"cf-ray": "90506d280b7b2298-CDG",
4398+
"connection": "keep-alive",
4399+
"content-encoding": "gzip",
4400+
"content-type": "application/json; charset=utf-8",
4401+
"etag": "W/\"1e79b-iMsh29xGKx5YyNetamSki3RBjcM\"",
4402+
"retry-after": "2",
4403+
"server": "cloudflare",
4404+
"strict-transport-security": "max-age=15552000; includeSubDomains",
4405+
"transfer-encoding": "chunked"
4406+
}
4407+
}
4408+
},
4409+
"45fbd4bfe447bc77eb34c8638a144658ed90fcd6d8163f907228a4bdff595518": {
4410+
"url": "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAQABAADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwDlh2OKQkZ4pi5MZPcGnDOK5Ch4XceeKcBzTd22kUkmgC0hVRk09JfmOMY9Kr/Me5P409I3ZsDINKwxztzTVRnyw6euQKVouM5Jp65WMDB9qAGIdpPJJHel83PakDDkY/M0wjJ4ppgS7u9OU/8A66iBx6U7dQBOAMc05cE4qAMa",
4411+
"init": {},
4412+
"response": {
4413+
"body": "",
4414+
"status": 200,
4415+
"statusText": "OK",
4416+
"headers": {
4417+
"content-type": "image/jpeg"
4418+
}
4419+
}
42864420
}
42874421
}

0 commit comments

Comments
 (0)