diff --git a/README.md b/README.md
index 6fe4f7d9c..f095b003b 100644
--- a/README.md
+++ b/README.md
@@ -407,6 +407,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
 1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
 1. **[SigLIP](https://huggingface.co/docs/transformers/main/model_doc/siglip)** (from Google AI) released with the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer.
 1. **[SmolVLM](https://huggingface.co/docs/transformers/main/model_doc/smolvlm)** (from Hugging Face) released with the blog posts [SmolVLM - small yet mighty Vision Language Model](https://huggingface.co/blog/smolvlm) and [SmolVLM Grows Smaller – Introducing the 250M & 500M Models!](https://huggingface.co/blog/smolervlm) by the Hugging Face TB Research team.
+1. **SNAC** (from Papla Media, ETH Zurich) released with the paper [SNAC: Multi-Scale Neural Audio Codec](https://arxiv.org/abs/2410.14411) by Hubert Siuzdak, Florian Grötschla, Luca A. Lanzendörfer.
 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
 1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu.
diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet
index 2e4d5b141..7db2f2521 100644
--- a/docs/snippets/6_supported-models.snippet
+++ b/docs/snippets/6_supported-models.snippet
@@ -121,6 +121,7 @@
 1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
 1. **[SigLIP](https://huggingface.co/docs/transformers/main/model_doc/siglip)** (from Google AI) released with the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer.
 1. **[SmolVLM](https://huggingface.co/docs/transformers/main/model_doc/smolvlm)** (from Hugging Face) released with the blog posts [SmolVLM - small yet mighty Vision Language Model](https://huggingface.co/blog/smolvlm) and [SmolVLM Grows Smaller – Introducing the 250M & 500M Models!](https://huggingface.co/blog/smolervlm) by the Hugging Face TB Research team.
+1. **SNAC** (from Papla Media, ETH Zurich) released with the paper [SNAC: Multi-Scale Neural Audio Codec](https://arxiv.org/abs/2410.14411) by Hubert Siuzdak, Florian Grötschla, Luca A. Lanzendörfer.
 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
 1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu.
diff --git a/src/models.js b/src/models.js
index 0617ba51b..2b2d64227 100644
--- a/src/models.js
+++ b/src/models.js
@@ -7287,6 +7287,60 @@ export class DacDecoderModel extends DacPreTrainedModel {
 }
 //////////////////////////////////////////////////
+
+//////////////////////////////////////////////////
+// Snac models
+export class SnacPreTrainedModel extends PreTrainedModel {
+    main_input_name = 'input_values';
+    forward_params = ['input_values'];
+}
+
+/**
+ * The SNAC (Multi-Scale Neural Audio Codec) model.
+ */
+export class SnacModel extends SnacPreTrainedModel {
+    /**
+     * Encodes the input audio waveform into discrete codes.
+     * @param {Object} inputs Model inputs
+     * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`.
+     * @returns {Promise<Record<string, Tensor>>} The output tensors of shape `(batch_size, num_codebooks, sequence_length)`.
+     */
+    async encode(inputs) {
+        return await sessionRun(this.sessions['encoder_model'], inputs);
+    }
+
+    /**
+     * Decodes the given frames into an output audio waveform.
+     * @param {Record<string, Tensor>} inputs The encoded audio codes.
+     * @returns {Promise<{audio_values: Tensor}>} The output tensor of shape `(batch_size, num_channels, sequence_length)`.
+     */
+    async decode(inputs) {
+        return await sessionRun(this.sessions['decoder_model'], inputs);
+    }
+}
+
+export class SnacEncoderModel extends SnacPreTrainedModel {
+    /** @type {typeof PreTrainedModel.from_pretrained} */
+    static async from_pretrained(pretrained_model_name_or_path, options = {}) {
+        return super.from_pretrained(pretrained_model_name_or_path, {
+            ...options,
+            // Update default model file name if not provided
+            model_file_name: options.model_file_name ?? 'encoder_model',
+        });
+    }
+}
+export class SnacDecoderModel extends SnacPreTrainedModel {
+    /** @type {typeof PreTrainedModel.from_pretrained} */
+    static async from_pretrained(pretrained_model_name_or_path, options = {}) {
+        return super.from_pretrained(pretrained_model_name_or_path, {
+            ...options,
+            // Update default model file name if not provided
+            model_file_name: options.model_file_name ?? 'decoder_model',
+        });
+    }
+}
+//////////////////////////////////////////////////
+
 //////////////////////////////////////////////////
 // AutoModels, used to simplify construction of PreTrainedModels
 // (uses config to instantiate correct class)
@@ -7468,6 +7522,7 @@ const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([
 const MODEL_MAPPING_NAMES_AUTO_ENCODER = new Map([
     ['mimi', ['MimiModel', MimiModel]],
     ['dac', ['DacModel', DacModel]],
+    ['snac', ['SnacModel', SnacModel]],
 ]);
 
 const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
@@ -7873,6 +7928,8 @@ const CUSTOM_MAPPING = [
     ['DacDecoderModel', DacDecoderModel, MODEL_TYPES.EncoderOnly],
     ['MimiEncoderModel', MimiEncoderModel, MODEL_TYPES.EncoderOnly],
     ['MimiDecoderModel', MimiDecoderModel, MODEL_TYPES.EncoderOnly],
+    ['SnacEncoderModel', SnacEncoderModel, MODEL_TYPES.EncoderOnly],
+    ['SnacDecoderModel', SnacDecoderModel, MODEL_TYPES.EncoderOnly],
 ]
 for (const [name, model, type] of CUSTOM_MAPPING) {
     MODEL_TYPE_MAPPING.set(name, type);
diff --git a/src/models/feature_extractors.js b/src/models/feature_extractors.js
index d5aa592cc..df1bf5720 100644
--- a/src/models/feature_extractors.js
+++ b/src/models/feature_extractors.js
@@ -6,6 +6,7 @@ export * from './dac/feature_extraction_dac.js';
 export * from './moonshine/feature_extraction_moonshine.js';
 export * from './pyannote/feature_extraction_pyannote.js';
 export * from './seamless_m4t/feature_extraction_seamless_m4t.js';
+export * from './snac/feature_extraction_snac.js';
 export * from './speecht5/feature_extraction_speecht5.js';
 export * from './wav2vec2/feature_extraction_wav2vec2.js';
 export * from './wespeaker/feature_extraction_wespeaker.js';
diff --git a/src/models/snac/feature_extraction_snac.js b/src/models/snac/feature_extraction_snac.js
new file mode 100644
index 000000000..9fc223004
--- /dev/null
+++ b/src/models/snac/feature_extraction_snac.js
@@ -0,0 +1,3 @@
+import { DacFeatureExtractor } from '../dac/feature_extraction_dac.js';
+
+export class SnacFeatureExtractor extends DacFeatureExtractor { }
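
A minimal round-trip sketch of the new API, mirroring the existing `DacModel` usage pattern (the checkpoint ID is an assumption for illustration; any ONNX-converted SNAC checkpoint that ships `encoder_model` and `decoder_model` files should behave the same way):

```js
import { SnacModel, AutoFeatureExtractor } from '@huggingface/transformers';

// Hypothetical checkpoint ID; substitute your own ONNX-converted SNAC model.
const model_id = 'onnx-community/snac_24khz-ONNX';
const model = await SnacModel.from_pretrained(model_id);
const feature_extractor = await AutoFeatureExtractor.from_pretrained(model_id);

// One second of (silent) audio, assuming a 24 kHz sampling rate.
const audio = new Float32Array(24000);
const inputs = await feature_extractor(audio);

// Encode the waveform into multi-scale discrete codes, then decode back to audio.
// `encode` returns a record of code tensors that can be passed straight to `decode`.
const codes = await model.encode(inputs);
const { audio_values } = await model.decode(codes);
console.log(audio_values.dims); // e.g. [1, 1, 24000]
```

`SnacEncoderModel` and `SnacDecoderModel` follow the same convention as the Mimi and DAC encoder/decoder classes: both reuse `SnacPreTrainedModel` and only override the default `model_file_name` (`encoder_model` or `decoder_model`), so each half of the codec can be loaded independently.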