diff --git a/README.md b/README.md
index 7a51c5aaf..b386eedb5 100644
--- a/README.md
+++ b/README.md
@@ -403,6 +403,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://huggingface.co/papers/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
 1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://huggingface.co/papers/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
 1. **[PaliGemma](https://huggingface.co/docs/transformers/main/model_doc/paligemma)** (from Google) released with the papers [PaliGemma: A versatile 3B VLM for transfer](https://huggingface.co/papers/2407.07726) and [PaliGemma 2: A Family of Versatile VLMs for Transfer](https://huggingface.co/papers/2412.03555) by the PaliGemma Google team.
+1. **[Parakeet](https://huggingface.co/docs/transformers/main/model_doc/parakeet)** (from NVIDIA) released with the blog post [Introducing the Parakeet ASR family](https://developer.nvidia.com/blog/pushing-the-boundaries-of-speech-recognition-with-nemo-parakeet-asr-models/) by the NVIDIA NeMo team.
 1. **[PatchTSMixer](https://huggingface.co/docs/transformers/main/model_doc/patchtsmixer)** (from IBM) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://huggingface.co/papers/2306.09364) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
 1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from Princeton University, IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://huggingface.co/papers/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
 1. **[Phi](https://huggingface.co/docs/transformers/main/model_doc/phi)** (from Microsoft) released with the papers - [Textbooks Are All You Need](https://huggingface.co/papers/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, [Textbooks Are All You Need II: phi-1.5 technical report](https://huggingface.co/papers/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet
index faca40aaa..c68974fdc 100644
--- a/docs/snippets/6_supported-models.snippet
+++ b/docs/snippets/6_supported-models.snippet
@@ -117,6 +117,7 @@
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://huggingface.co/papers/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
 1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://huggingface.co/papers/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
 1. **[PaliGemma](https://huggingface.co/docs/transformers/main/model_doc/paligemma)** (from Google) released with the papers [PaliGemma: A versatile 3B VLM for transfer](https://huggingface.co/papers/2407.07726) and [PaliGemma 2: A Family of Versatile VLMs for Transfer](https://huggingface.co/papers/2412.03555) by the PaliGemma Google team.
+1. **[Parakeet](https://huggingface.co/docs/transformers/main/model_doc/parakeet)** (from NVIDIA) released with the blog post [Introducing the Parakeet ASR family](https://developer.nvidia.com/blog/pushing-the-boundaries-of-speech-recognition-with-nemo-parakeet-asr-models/) by the NVIDIA NeMo team.
 1. **[PatchTSMixer](https://huggingface.co/docs/transformers/main/model_doc/patchtsmixer)** (from IBM) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://huggingface.co/papers/2306.09364) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
 1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from Princeton University, IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://huggingface.co/papers/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
 1. **[Phi](https://huggingface.co/docs/transformers/main/model_doc/phi)** (from Microsoft) released with the papers - [Textbooks Are All You Need](https://huggingface.co/papers/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, [Textbooks Are All You Need II: phi-1.5 technical report](https://huggingface.co/papers/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
diff --git a/src/models.js b/src/models.js
index 075f3a088..edb6cae72 100644
--- a/src/models.js
+++ b/src/models.js
@@ -6155,6 +6155,21 @@ export class Wav2Vec2ForAudioFrameClassification extends Wav2Vec2PreTrainedModel
 }
 //////////////////////////////////////////////////
 
+//////////////////////////////////////////////////
+// Parakeet models
+export class ParakeetPreTrainedModel extends PreTrainedModel { };
+export class ParakeetForCTC extends ParakeetPreTrainedModel {
+    /**
+     * @param {Object} model_inputs
+     * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform.
+     * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1]
+     */
+    async _call(model_inputs) {
+        return new CausalLMOutput(await super._call(model_inputs));
+    }
+}
+//////////////////////////////////////////////////
+
 //////////////////////////////////////////////////
 // PyAnnote models
 
@@ -8140,6 +8155,7 @@ const MODEL_FOR_CTC_MAPPING_NAMES = new Map([
     ['unispeech-sat', ['UniSpeechSatForCTC', UniSpeechSatForCTC]],
     ['wavlm', ['WavLMForCTC', WavLMForCTC]],
     ['hubert', ['HubertForCTC', HubertForCTC]],
+    ['parakeet_ctc', ['ParakeetForCTC', ParakeetForCTC]],
 ]);
 
 const MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = new Map([
diff --git a/src/models/feature_extractors.js b/src/models/feature_extractors.js
index bf5fdac27..ea27930f9 100644
--- a/src/models/feature_extractors.js
+++ b/src/models/feature_extractors.js
@@ -5,6 +5,7 @@ export * from './clap/feature_extraction_clap.js';
 export * from './dac/feature_extraction_dac.js';
 export * from './gemma3n/feature_extraction_gemma3n.js';
 export * from './moonshine/feature_extraction_moonshine.js';
+export * from './parakeet/feature_extraction_parakeet.js';
 export * from './pyannote/feature_extraction_pyannote.js';
 export * from './seamless_m4t/feature_extraction_seamless_m4t.js';
 export * from './snac/feature_extraction_snac.js';
diff --git a/src/models/parakeet/feature_extraction_parakeet.js b/src/models/parakeet/feature_extraction_parakeet.js
new file mode 100644
index 000000000..ac166bdfe
--- /dev/null
+++ b/src/models/parakeet/feature_extraction_parakeet.js
@@ -0,0 +1,121 @@
+import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
+import { Tensor } from '../../utils/tensor.js';
+import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
+
+const EPSILON = 1e-5;
+
+export class ParakeetFeatureExtractor extends FeatureExtractor {
+
+    constructor(config) {
+        super(config);
+
+        // Prefer given `mel_filters` from preprocessor_config.json, or calculate them if they don't exist.
+        this.config.mel_filters ??= mel_filter_bank(
+            Math.floor(1 + this.config.n_fft / 2), // num_frequency_bins
+            this.config.feature_size, // num_mel_filters
+            0.0, // min_frequency
+            this.config.sampling_rate / 2, // max_frequency
+            this.config.sampling_rate, // sampling_rate
+            "slaney", // norm
+            "slaney", // mel_scale
+        );
+
+        const window = window_function(this.config.win_length, 'hann', {
+            periodic: false,
+        });
+
+        this.window = new Float64Array(this.config.n_fft);
+        const offset = Math.floor((this.config.n_fft - this.config.win_length) / 2);
+        this.window.set(window, offset);
+    }
+
+    /**
+     * Computes the log-Mel spectrogram of the provided audio waveform.
+     * @param {Float32Array|Float64Array} waveform The audio waveform to process.
+     * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
+     */
+    async _extract_fbank_features(waveform) {
+        // Parakeet uses a custom preemphasis strategy: Apply preemphasis to entire waveform at once
+        const preemphasis = this.config.preemphasis;
+        waveform = new Float64Array(waveform); // Clone to avoid destructive changes
+        for (let j = waveform.length - 1; j >= 1; --j) {
+            waveform[j] -= preemphasis * waveform[j - 1];
+        }
+
+        const features = await spectrogram(
+            waveform,
+            this.window, // window
+            this.window.length, // frame_length
+            this.config.hop_length, // hop_length
+            {
+                fft_length: this.config.n_fft,
+                power: 2.0,
+                mel_filters: this.config.mel_filters,
+                log_mel: 'log',
+                mel_floor: -Infinity,
+                pad_mode: 'constant',
+                center: true,
+
+                // Custom
+                transpose: true,
+                mel_offset: 2 ** -24,
+            }
+        )
+
+        return features;
+    }
+
+    /**
+     * Asynchronously extracts features from a given audio using the provided configuration.
+     * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
+     * @returns {Promise<{ input_features: Tensor; attention_mask: Tensor; }>} A Promise resolving to an object containing the extracted input features as a Tensor.
+     */
+    async _call(audio) {
+        validate_audio_inputs(audio, 'ParakeetFeatureExtractor');
+
+        const features = await this._extract_fbank_features(audio);
+
+        const features_length = Math.floor(
+            (audio.length + Math.floor(this.config.n_fft / 2) * 2 - this.config.n_fft) / this.config.hop_length
+        );
+
+        const features_data = /** @type {Float32Array} */ (features.data);
+        features_data.fill(0, features_length * features.dims[1]);
+
+        // normalize mel features, ignoring padding
+        const [num_frames, num_features] = features.dims;
+        const sum = new Float64Array(num_features);
+        const sum_sq = new Float64Array(num_features);
+
+        for (let i = 0; i < features_length; ++i) {
+            const offset = i * num_features;
+            for (let j = 0; j < num_features; ++j) {
+                const val = features_data[offset + j];
+                sum[j] += val;
+                sum_sq[j] += val * val;
+            }
+        }
+
+        // Calculate mean and standard deviation, then normalize
+        const divisor = features_length > 1 ? features_length - 1 : 1;
+        for (let j = 0; j < num_features; ++j) {
+            const mean = sum[j] / features_length;
+            const variance = (sum_sq[j] - features_length * mean * mean) / divisor;
+            const std = Math.sqrt(variance) + EPSILON;
+            const inv_std = 1 / std;
+
+            for (let i = 0; i < features_length; ++i) {
+                const index = i * num_features + j;
+                features_data[index] = (features_data[index] - mean) * inv_std;
+            }
+        }
+
+        const mask_data = new BigInt64Array(num_frames);
+        mask_data.fill(1n, 0, features_length);
+
+        return {
+            input_features: features.unsqueeze_(0),
+            attention_mask: new Tensor('int64', mask_data, [1, num_frames]),
+        };
+    }
+}
\ No newline at end of file
diff --git a/src/pipelines.js b/src/pipelines.js
index 6304ce445..6c84403e2 100644
--- a/src/pipelines.js
+++ b/src/pipelines.js
@@ -1750,6 +1750,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
             case 'unispeech':
             case 'unispeech-sat':
             case 'hubert':
+            case 'parakeet_ctc':
                 return this._call_wav2vec2(audio, kwargs)
             case 'moonshine':
                 return this._call_moonshine(audio, kwargs)
@@ -1790,7 +1791,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
             for (const item of logits) {
                 predicted_ids.push(max(item.data)[1])
             }
-            const predicted_sentences = this.tokenizer.decode(predicted_ids)
+            const predicted_sentences = this.tokenizer.decode(predicted_ids, { skip_special_tokens: true }).trim();
             toReturn.push({ text: predicted_sentences })
         }
         return single ? toReturn[0] : toReturn;
diff --git a/src/utils/audio.js b/src/utils/audio.js
index 548e2b0d8..fb7745f83 100644
--- a/src/utils/audio.js
+++ b/src/utils/audio.js
@@ -470,6 +470,7 @@ function power_to_db(spectrogram, reference = 1.0, min_value = 1e-10, db_range =
  * @param {number} [options.min_num_frames=null] If provided, ensures the number of frames to compute is at least this value.
  * @param {boolean} [options.do_pad=true] If `true`, pads the output spectrogram to have `max_num_frames` frames.
  * @param {boolean} [options.transpose=false] If `true`, the returned spectrogram will have shape `(num_frames, num_frequency_bins/num_mel_filters)`. If `false`, the returned spectrogram will have shape `(num_frequency_bins/num_mel_filters, num_frames)`.
+ * @param {number} [options.mel_offset=0] Offset to add to the mel spectrogram to avoid taking the log of zero.
  * @returns {Promise<Tensor>} Spectrogram of shape `(num_frequency_bins, length)` (regular spectrogram) or shape `(num_mel_filters, length)` (mel spectrogram).
 */
 export async function spectrogram(
@@ -498,6 +499,7 @@
         max_num_frames = null,
         do_pad = true,
         transpose = false,
+        mel_offset = 0,
     } = {}
 ) {
     const window_length = window.length;
@@ -530,11 +532,23 @@
     }
 
     if (center) {
-        if (pad_mode !== 'reflect') {
-            throw new Error(`pad_mode="${pad_mode}" not implemented yet.`)
+        switch (pad_mode) {
+            case 'reflect': {
+                const half_window = Math.floor((fft_length - 1) / 2) + 1;
+                waveform = padReflect(waveform, half_window, half_window);
+                break;
+            }
+            case 'constant': {
+                const padding = Math.floor(fft_length / 2);
+                // @ts-expect-error ts(2351)
+                const padded = new waveform.constructor(waveform.length + 2 * padding);
+                padded.set(waveform, padding);
+                waveform = padded;
+                break;
+            }
+            default:
+                throw new Error(`pad_mode="${pad_mode}" not implemented yet.`);
         }
-        const half_window = Math.floor((fft_length - 1) / 2) + 1;
-        waveform = padReflect(waveform, half_window, half_window);
     }
 
     // split waveform into frames of frame_length size
@@ -641,7 +655,7 @@
     const mel_spec_data = /** @type {Float32Array} */(mel_spec.data);
     for (let i = 0; i < mel_spec_data.length; ++i) {
-        mel_spec_data[i] = Math.max(mel_floor, mel_spec_data[i]);
+        mel_spec_data[i] = mel_offset + Math.max(mel_floor, mel_spec_data[i]);
     }
 
     if (power !== null && log_mel !== null) {
diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.js b/tests/pipelines/test_pipelines_automatic_speech_recognition.js
index da9dd88b4..6f896d551 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.js
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.js
@@ -114,7 +114,7 @@ export default () => {
       "default",
       async () => {
         const output = await pipe(audios[0], { max_new_tokens });
-        const target = { text: "<unk>K" };
+        const target = { text: "K" };
         expect(output).toEqual(target);
       },
       MAX_TEST_EXECUTION_TIME,
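
Reviewer note: below is a minimal end-to-end usage sketch (JavaScript) of the new `parakeet_ctc` path, useful for manually verifying this patch. The model id is a placeholder assumption and is not shipped by this diff; any ONNX export whose config has `"model_type": "parakeet_ctc"` and a `ParakeetFeatureExtractor` preprocessor config should be routed through the `_call_wav2vec2` branch added above.

import { pipeline } from '@huggingface/transformers';

// Hypothetical model id (assumption) -- substitute a real Parakeet CTC ONNX export.
const transcriber = await pipeline(
    'automatic-speech-recognition',
    'onnx-community/parakeet-ctc-0.6b',
);

// Under the hood: ParakeetFeatureExtractor applies preemphasis, computes a
// log-mel spectrogram (using the new `mel_offset` option and 'constant' pad_mode),
// and normalizes each mel bin over the non-padded frames; the pipeline then takes
// the per-frame argmax over the CTC logits and decodes with
// `skip_special_tokens: true`, trimming surrounding whitespace.
const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
const output = await transcriber(url);
console.log(output); // { text: '...' }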