
Commit 46c8320

Merge text-to-audio pipeline
1 parent e47b59b commit 46c8320

File tree

1 file changed: +88 −34


src/pipelines/text-to-audio.js

Lines changed: 88 additions & 34 deletions
@@ -23,6 +23,9 @@ import { AutoModel } from '../models.js';
  *
  * @typedef {Object} TextToAudioPipelineOptions Parameters specific to text-to-audio pipelines.
  * @property {Tensor|Float32Array|string|URL} [speaker_embeddings=null] The speaker embeddings (if the model requires it).
+ * @property {number} [num_inference_steps] The number of denoising steps (if the model supports it).
+ * More denoising steps usually lead to higher quality audio but slower inference.
+ * @property {number} [speed] The speed of the generated audio (if the model supports it).
  *
  * @callback TextToAudioPipelineCallback Generates speech/audio from the inputs.
  * @param {string|string[]} texts The text(s) to generate.
@@ -36,31 +39,24 @@ import { AutoModel } from '../models.js';
  * Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`.
  * This pipeline generates an audio file from an input text and optional other conditional inputs.
  *
- * **Example:** Generate audio from text with `Xenova/speecht5_tts`.
+ * **Example:** Generate audio from text with `onnx-community/Supertonic-TTS-ONNX`.
  * ```javascript
- * const synthesizer = await pipeline('text-to-speech', 'Xenova/speecht5_tts', { quantized: false });
- * const speaker_embeddings = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin';
- * const out = await synthesizer('Hello, my dog is cute', { speaker_embeddings });
+ * const synthesizer = await pipeline('text-to-speech', 'onnx-community/Supertonic-TTS-ONNX');
+ * const speaker_embeddings = 'https://huggingface.co/onnx-community/Supertonic-TTS-ONNX/resolve/main/voices/F1.bin';
+ * const output = await synthesizer('Hello there, how are you doing?', { speaker_embeddings });
  * // RawAudio {
- * //   audio: Float32Array(26112) [-0.00005657337896991521, 0.00020583874720614403, ...],
- * //   sampling_rate: 16000
+ * //   audio: Float32Array(95232) [-0.000482565927086398, -0.0004853440332226455, ...],
+ * //   sampling_rate: 44100
  * // }
- * ```
- *
- * You can then save the audio to a .wav file with the `wavefile` package:
- * ```javascript
- * import wavefile from 'wavefile';
- * import fs from 'fs';
- *
- * const wav = new wavefile.WaveFile();
- * wav.fromScratch(1, out.sampling_rate, '32f', out.audio);
- * fs.writeFileSync('out.wav', wav.toBuffer());
+ *
+ * // Optional: Save the audio to a .wav file or Blob
+ * await output.save('output.wav'); // You can also use `output.toBlob()` to access the audio as a Blob
  * ```
  *
  * **Example:** Multilingual speech generation with `Xenova/mms-tts-fra`. See [here](https://huggingface.co/models?pipeline_tag=text-to-speech&other=vits&sort=trending) for the full list of available languages (1107).
  * ```javascript
  * const synthesizer = await pipeline('text-to-speech', 'Xenova/mms-tts-fra');
- * const out = await synthesizer('Bonjour');
+ * const output = await synthesizer('Bonjour');
  * // RawAudio {
  * //   audio: Float32Array(23808) [-0.00037693005288019776, 0.0003325853613205254, ...],
  * //   sampling_rate: 16000
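
The new `num_inference_steps` and `speed` options documented in the typedef above can be combined with the Supertonic example. A minimal sketch, not taken from the commit: the import path assumes the standard `@huggingface/transformers` entry point, and the numeric values are illustrative only.

```javascript
import { pipeline } from '@huggingface/transformers';

// Model ID and voice file are the ones used in the updated docstring example.
const synthesizer = await pipeline('text-to-speech', 'onnx-community/Supertonic-TTS-ONNX');
const speaker_embeddings = 'https://huggingface.co/onnx-community/Supertonic-TTS-ONNX/resolve/main/voices/F1.bin';

const output = await synthesizer('Hello there, how are you doing?', {
    speaker_embeddings,
    num_inference_steps: 20, // illustrative: more steps usually mean higher quality but slower inference
    speed: 1.1,              // illustrative: scales the pace of the generated speech
});
await output.save('output.wav');
```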
@@ -83,17 +79,78 @@ export class TextToAudioPipeline
         this.vocoder = options.vocoder ?? null;
     }
 
+    async _prepare_speaker_embeddings(speaker_embeddings) {
+        // Load speaker embeddings as Float32Array from path/URL
+        if (typeof speaker_embeddings === 'string' || speaker_embeddings instanceof URL) {
+            // Load from URL with fetch
+            speaker_embeddings = new Float32Array(
+                await (await fetch(speaker_embeddings)).arrayBuffer()
+            );
+        }
+
+        if (speaker_embeddings instanceof Float32Array) {
+            speaker_embeddings = new Tensor(
+                'float32',
+                speaker_embeddings,
+                [speaker_embeddings.length]
+            )
+        } else if (!(speaker_embeddings instanceof Tensor)) {
+            throw new Error("Speaker embeddings must be a `Tensor`, `Float32Array`, `string`, or `URL`.")
+        }
+
+        return speaker_embeddings;
+    }
+
     /** @type {TextToAudioPipelineCallback} */
-    async _call(text_inputs, { speaker_embeddings = null } = {}) {
+    async _call(text_inputs, {
+        speaker_embeddings = null,
+        num_inference_steps,
+        speed,
+    } = {}) {
+
         // If this.processor is not set, we are using a `AutoModelForTextToWaveform` model
         if (this.processor) {
             return this._call_text_to_spectrogram(text_inputs, { speaker_embeddings });
+        } else if (
+            this.model.config.model_type === "supertonic"
+        ) {
+            return this._call_supertonic(text_inputs, { speaker_embeddings, num_inference_steps, speed });
         } else {
             return this._call_text_to_waveform(text_inputs);
         }
     }
 
+    async _call_supertonic(text_inputs, { speaker_embeddings, num_inference_steps, speed }) {
+        if (!speaker_embeddings) {
+            throw new Error("Speaker embeddings must be provided for Supertonic models.");
+        }
+        speaker_embeddings = await this._prepare_speaker_embeddings(speaker_embeddings);
+
+        // @ts-expect-error TS2339
+        const { sampling_rate, style_dim } = this.model.config;
+
+        speaker_embeddings = (/** @type {Tensor} */ (speaker_embeddings)).view(1, -1, style_dim);
+        const inputs = this.tokenizer(text_inputs, {
+            padding: true,
+            truncation: true,
+        });
+
+        // @ts-expect-error TS2339
+        const { waveform } = await this.model.generate_speech({
+            ...inputs,
+            style: speaker_embeddings,
+            num_inference_steps,
+            speed,
+        });
+
+        return new RawAudio(
+            waveform.data,
+            sampling_rate,
+        )
+    }
+
     async _call_text_to_waveform(text_inputs) {
+
         // Run tokenization
         const inputs = this.tokenizer(text_inputs, {
             padding: true,
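
Because `_prepare_speaker_embeddings` above also accepts a `Tensor` or `Float32Array` (matching the `TextToAudioPipelineOptions` typedef), a voice file can be fetched once and reused across calls instead of being re-downloaded from the URL on every invocation. A rough sketch, not part of this commit; the voice URL is the one from the docstring example.

```javascript
import { pipeline } from '@huggingface/transformers';

const synthesizer = await pipeline('text-to-speech', 'onnx-community/Supertonic-TTS-ONNX');

// Download the voice embeddings once...
const url = 'https://huggingface.co/onnx-community/Supertonic-TTS-ONNX/resolve/main/voices/F1.bin';
const speaker_embeddings = new Float32Array(await (await fetch(url)).arrayBuffer());

// ...and pass the same Float32Array to every call.
const first = await synthesizer('First sentence.', { speaker_embeddings });
const second = await synthesizer('Second sentence.', { speaker_embeddings });
```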
@@ -105,39 +162,36 @@ export class TextToAudioPipeline
 
         // @ts-expect-error TS2339
         const sampling_rate = this.model.config.sampling_rate;
-        return new RawAudio(waveform.data, sampling_rate);
+        return new RawAudio(
+            waveform.data,
+            sampling_rate,
+        )
     }
 
     async _call_text_to_spectrogram(text_inputs, { speaker_embeddings }) {
+
         // Load vocoder, if not provided
         if (!this.vocoder) {
             console.log('No vocoder specified, using default HifiGan vocoder.');
             this.vocoder = await AutoModel.from_pretrained(this.DEFAULT_VOCODER_ID, { dtype: 'fp32' });
         }
 
-        // Load speaker embeddings as Float32Array from path/URL
-        if (typeof speaker_embeddings === 'string' || speaker_embeddings instanceof URL) {
-            // Load from URL with fetch
-            speaker_embeddings = new Float32Array(await (await fetch(speaker_embeddings)).arrayBuffer());
-        }
-
-        if (speaker_embeddings instanceof Float32Array) {
-            speaker_embeddings = new Tensor('float32', speaker_embeddings, [1, speaker_embeddings.length]);
-        } else if (!(speaker_embeddings instanceof Tensor)) {
-            throw new Error('Speaker embeddings must be a `Tensor`, `Float32Array`, `string`, or `URL`.');
-        }
-
         // Run tokenization
         const { input_ids } = this.tokenizer(text_inputs, {
             padding: true,
             truncation: true,
         });
 
-        // NOTE: At this point, we are guaranteed that `speaker_embeddings` is a `Tensor`
-        // @ts-ignore
+        speaker_embeddings = await this._prepare_speaker_embeddings(speaker_embeddings);
+        speaker_embeddings = speaker_embeddings.view(1, -1);
+
+        // @ts-expect-error TS2339
         const { waveform } = await this.model.generate_speech(input_ids, speaker_embeddings, { vocoder: this.vocoder });
 
         const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
-        return new RawAudio(waveform.data, sampling_rate);
+        return new RawAudio(
+            waveform.data,
+            sampling_rate,
+        )
     }
 }
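
The spectrogram path is unchanged in behaviour: it now simply routes through the shared `_prepare_speaker_embeddings` helper and reshapes the result with `.view(1, -1)` before calling `generate_speech`. A sketch of that path, reusing the model ID and embeddings URL from the example removed above; this snippet is not part of the commit, and dtype/device options are omitted for brevity.

```javascript
import { pipeline } from '@huggingface/transformers';

// SpeechT5 goes through _call_text_to_spectrogram: a processor plus the default HifiGan vocoder.
const synthesizer = await pipeline('text-to-speech', 'Xenova/speecht5_tts');
const speaker_embeddings = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin';

const output = await synthesizer('Hello, my dog is cute', { speaker_embeddings });
await output.save('speecht5.wav'); // same RawAudio helper as in the Supertonic example
```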
