
Commit aa60302

Add support for Moonshine ASR (#1099)
* Add support for Moonshine ASR
* Add ASR pipeline API support for moonshine
* Add moonshine feature extractor unit test
* Pass moonshine pipeline generation kwargs to generate
Parent: 5334e7e
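As context, a minimal usage sketch of the feature this commit enables. It is not part of the commit itself: the call follows the library's existing ASR pipeline API, and the model id is the one used by the unit test below.

    import { pipeline } from '@huggingface/transformers';

    // Model id taken from the unit test in this commit; any Moonshine ONNX export should work.
    const transcriber = await pipeline(
        'automatic-speech-recognition',
        'onnx-community/moonshine-tiny-ONNX',
    );

    // Input may be a URL or raw audio samples; the pipeline resamples via prepareAudios.
    const output = await transcriber('https://example.com/audio.wav'); // placeholder URL
    console.log(output); // { text: '...' }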

10 files changed: 135 additions, 0 deletions

README.md (1 addition, 0 deletions)

@@ -366,6 +366,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
 1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
 1. **Moondream1** released in the repository [moondream](https://github.com/vikhyat/moondream) by vikhyat.
+1. **[Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine)** (from Useful Sensors) released with the paper [Moonshine: Speech Recognition for Live Transcription and Voice Commands](https://arxiv.org/abs/2410.15608) by Nat Jeffries, Evan King, Manjunath Kudlur, Guy Nicholson, James Wang, Pete Warden.
 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
 1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.

docs/snippets/6_supported-models.snippet (1 addition, 0 deletions)

@@ -81,6 +81,7 @@
 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
 1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
 1. **Moondream1** released in the repository [moondream](https://github.com/vikhyat/moondream) by vikhyat.
+1. **[Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine)** (from Useful Sensors) released with the paper [Moonshine: Speech Recognition for Live Transcription and Voice Commands](https://arxiv.org/abs/2410.15608) by Nat Jeffries, Evan King, Manjunath Kudlur, Guy Nicholson, James Wang, Pete Warden.
 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
 1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.

src/configs.js (1 addition, 0 deletions)

@@ -186,6 +186,7 @@ function getNormalizedConfig(config) {
             mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'd_model';
             break;
         case 'musicgen_decoder':
+        case 'moonshine':
             mapping['num_encoder_layers'] = mapping['num_decoder_layers'] = 'num_hidden_layers';
             mapping['num_encoder_heads'] = mapping['num_decoder_heads'] = 'num_attention_heads';
             mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'hidden_size';
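A minimal sketch of what the fall-through above means in practice: for a 'moonshine' config, encoder and decoder dimensions are all read from the same three keys, just as for the MusicGen decoder. The numeric values below are hypothetical, chosen only for illustration.

    // Hypothetical Moonshine config values, for illustration only.
    const config = { num_hidden_layers: 6, num_attention_heads: 8, hidden_size: 288 };

    // Effect of the shared mapping: encoder and decoder settings resolve to the same keys.
    const normalized = {
        num_encoder_layers: config.num_hidden_layers,   // 6
        num_decoder_layers: config.num_hidden_layers,   // 6
        num_encoder_heads: config.num_attention_heads,  // 8
        num_decoder_heads: config.num_attention_heads,  // 8
        encoder_hidden_size: config.hidden_size,        // 288
        decoder_hidden_size: config.hidden_size,        // 288
    };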

src/models.js (24 additions, 0 deletions)

@@ -3359,6 +3359,29 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
 }
 //////////////////////////////////////////////////

+
+//////////////////////////////////////////////////
+// Moonshine models
+export class MoonshinePreTrainedModel extends PreTrainedModel {
+
+    requires_attention_mask = false;
+    main_input_name = 'input_values';
+    forward_params = [
+        'input_values',
+        'decoder_input_ids',
+        'past_key_values',
+    ];
+};
+
+/**
+ * MoonshineModel class for training Moonshine models without a language model head.
+ */
+export class MoonshineModel extends MoonshinePreTrainedModel { }
+
+export class MoonshineForConditionalGeneration extends MoonshinePreTrainedModel { }
+//////////////////////////////////////////////////
+

 //////////////////////////////////////////////////
 /**
  * Vision Encoder-Decoder model based on OpenAI's GPT architecture for image captioning and other vision tasks

@@ -7013,6 +7036,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
 const MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = new Map([
     ['speecht5', ['SpeechT5ForSpeechToText', SpeechT5ForSpeechToText]],
     ['whisper', ['WhisperForConditionalGeneration', WhisperForConditionalGeneration]],
+    ['moonshine', ['MoonshineForConditionalGeneration', MoonshineForConditionalGeneration]],
 ]);

 const MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = new Map([
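Because the new mapping entry registers Moonshine among the speech seq2seq models, it should also be loadable outside the pipeline API. A sketch under that assumption, using the library's existing Auto* entry points and the model id from the unit test; the diff itself only confirms the generate/decode calls used by the pipeline:

    import { AutoModelForSpeechSeq2Seq, AutoProcessor, AutoTokenizer } from '@huggingface/transformers';

    const model_id = 'onnx-community/moonshine-tiny-ONNX';
    const processor = await AutoProcessor.from_pretrained(model_id);
    const tokenizer = await AutoTokenizer.from_pretrained(model_id);
    const model = await AutoModelForSpeechSeq2Seq.from_pretrained(model_id);

    // One second of silence as stand-in audio; the processor adds the batch dimension.
    const audio = new Float32Array(16000);
    const inputs = await processor(audio); // { input_values: Tensor of shape [1, 16000] }
    const outputs = await model.generate({ ...inputs, max_new_tokens: 6 });
    const text = tokenizer.batch_decode(outputs, { skip_special_tokens: true })[0];
    console.log(text);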

src/models/feature_extractors.js (1 addition, 0 deletions)

@@ -1,6 +1,7 @@

 export * from './audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js';
 export * from './clap/feature_extraction_clap.js';
+export * from './moonshine/feature_extraction_moonshine.js';
 export * from './pyannote/feature_extraction_pyannote.js';
 export * from './seamless_m4t/feature_extraction_seamless_m4t.js';
 export * from './speecht5/feature_extraction_speecht5.js';
src/models/moonshine/feature_extraction_moonshine.js (new file: 26 additions, 0 deletions)

@@ -0,0 +1,26 @@
+import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
+import { Tensor } from '../../utils/tensor.js';
+
+
+export class MoonshineFeatureExtractor extends FeatureExtractor {
+    /**
+     * Asynchronously extracts input values from a given audio using the provided configuration.
+     * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
+     * @returns {Promise<{ input_values: Tensor; }>} The extracted input values.
+     */
+    async _call(audio) {
+        validate_audio_inputs(audio, 'MoonshineFeatureExtractor');
+
+        if (audio instanceof Float64Array) {
+            audio = new Float32Array(audio);
+        }
+
+        const shape = [
+            1, /* batch_size */
+            audio.length, /* num_samples */
+        ];
+        return {
+            input_values: new Tensor('float32', audio, shape),
+        };
+    }
+}
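Unlike Whisper's log-mel feature extractor, this one computes no spectrogram: it validates the input, casts Float64Array to Float32Array, and wraps the raw waveform in a [1, num_samples] tensor. A quick sketch of the resulting shape, using the model id from the unit test and an assumed 16 kHz sampling rate:

    import { AutoFeatureExtractor } from '@huggingface/transformers';

    const feature_extractor = await AutoFeatureExtractor.from_pretrained('onnx-community/moonshine-tiny-ONNX');

    const audio = new Float32Array(32000); // 2 seconds at an assumed 16 kHz
    const { input_values } = await feature_extractor(audio);
    console.log(input_values.dims); // [1, 32000], i.e. batch_size x num_samples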
src/models/moonshine/processing_moonshine.js (new file: 20 additions, 0 deletions)

@@ -0,0 +1,20 @@
+import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js"
+import { AutoTokenizer } from "../../tokenizers.js"
+import { Processor } from "../../base/processing_utils.js"
+
+/**
+ * Represents a MoonshineProcessor that extracts features from an audio input.
+ */
+export class MoonshineProcessor extends Processor {
+    static tokenizer_class = AutoTokenizer
+    static feature_extractor_class = AutoFeatureExtractor
+
+    /**
+     * Calls the feature_extractor function with the given audio input.
+     * @param {any} audio The audio input to extract features from.
+     * @returns {Promise<any>} A Promise that resolves with the extracted features.
+     */
+    async _call(audio) {
+        return await this.feature_extractor(audio);
+    }
+}

src/models/processors.js (1 addition, 0 deletions)

@@ -1,5 +1,6 @@
 export * from './florence2/processing_florence2.js';
 export * from './mgp_str/processing_mgp_str.js';
+export * from './moonshine/processing_moonshine.js';
 export * from './idefics3/processing_idefics3.js';
 export * from './janus/processing_janus.js';
 export * from './jina_clip/processing_jina_clip.js';

src/pipelines.js (30 additions, 0 deletions)

@@ -1729,6 +1729,8 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
             case 'unispeech-sat':
             case 'hubert':
                 return this._call_wav2vec2(audio, kwargs)
+            case 'moonshine':
+                return this._call_moonshine(audio, kwargs)
             default:
                 throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`)
         }

@@ -1882,6 +1884,34 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
         }
         return single ? toReturn[0] : toReturn;
     }
+
+    /**
+     * @type {AutomaticSpeechRecognitionPipelineCallback}
+     * @private
+     */
+    async _call_moonshine(audio, kwargs) {
+        const single = !Array.isArray(audio);
+        if (single) {
+            audio = [/** @type {AudioInput} */ (audio)];
+        }
+        const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
+        const preparedAudios = await prepareAudios(audio, sampling_rate);
+        const toReturn = [];
+        for (const aud of preparedAudios) {
+            const inputs = await this.processor(aud);
+
+            // According to the [paper](https://arxiv.org/pdf/2410.15608):
+            // "We use greedy decoding, with a heuristic limit of 6 output tokens
+            // per second of audio to avoid repeated output sequences."
+            const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
+            const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
+
+            const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
+            toReturn.push({ text });
+        }
+        return single ? toReturn[0] : toReturn;
+    }
+
 }

 /**
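Note the spread order in generate({ max_new_tokens, ...kwargs, ...inputs }): user-supplied generation kwargs override the 6-tokens-per-second default, which is the "Pass moonshine pipeline generation kwargs to generate" item from the commit message. A sketch of both behaviours, assuming a transcriber pipeline constructed as in the example near the top:

    // Default: a 13-second clip is capped at floor(13) * 6 = 78 new tokens.
    const output = await transcriber(audio);

    // Explicit kwargs win, since ...kwargs is spread after the heuristic max_new_tokens.
    const shortOutput = await transcriber(audio, { max_new_tokens: 20 });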
Moonshine feature extractor unit test (new file: 30 additions, 0 deletions)

@@ -0,0 +1,30 @@
+import { AutoFeatureExtractor, MoonshineFeatureExtractor } from "../../../src/transformers.js";
+
+import { load_cached_audio } from "../../asset_cache.js";
+import { MAX_FEATURE_EXTRACTOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js";
+
+export default () => {
+    // MoonshineFeatureExtractor
+    describe("MoonshineFeatureExtractor", () => {
+        const model_id = "onnx-community/moonshine-tiny-ONNX";
+
+        /** @type {MoonshineFeatureExtractor} */
+        let feature_extractor;
+        beforeAll(async () => {
+            feature_extractor = await AutoFeatureExtractor.from_pretrained(model_id);
+        }, MAX_FEATURE_EXTRACTOR_LOAD_TIME);
+
+        it(
+            "default",
+            async () => {
+                const audio = await load_cached_audio("mlk");
+                const { input_values } = await feature_extractor(audio);
+                expect(input_values.dims).toEqual([1, 208000]);
+                expect(input_values.mean().item()).toBeCloseTo(-1.5654930507480458e-7, 6);
+                expect(input_values.data[0]).toBeCloseTo(0.0067138671875, 6);
+                expect(input_values.data.at(-1)).toBeCloseTo(-0.013427734375, 6);
+            },
+            MAX_TEST_EXECUTION_TIME,
+        );
+    });
+};
