diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index ed99dc8477231..4867f0f39c055 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3651,6 +3651,38 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_causal_attention(False)
 
 
+@ModelBase.register("SNAC")
+class SnacDecModel(TextModel):
+    """SNAC (Multi-Scale Neural Audio Codec) decoder — the codes-to-audio stage used by Orpheus TTS."""
+    model_arch = gguf.MODEL_ARCH.SNAC_DEC
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid
+
+        # NOTE(review): torch weight_norm checkpoints store weight_g/weight_v instead of a
+        # fused weight; skipping both drops the tensor entirely — confirm the checkpoint also
+        # carries fused weights, otherwise fuse g * v / ||v|| here instead of skipping.
+        if name.endswith("_g") or name.endswith("_v"):
+            logger.debug(f"Skipping weight_norm parameter {name!r}")
+            return []
+
+        logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def set_vocab(self):
+        self._set_vocab_none()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size(self.hparams.get("codebook_size", 4096))
+        self.gguf_writer.add_block_count(len(self.hparams.get("decoder_rates", [7, 7, 3, 3])))
+        # NOTE(review): the snac_24khz example config documents latent_dim 1344 — verify this 1536 fallback.
+        self.gguf_writer.add_embedding_length(self.hparams.get("latent_dim", 1536))
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("decoder_dim", 1536))
+        self.gguf_writer.add_causal_attention(False)
+
+
 @ModelBase.register("Qwen2MoeForCausalLM")
 class Qwen2MoeModel(TextModel):
     model_arch = gguf.MODEL_ARCH.QWEN2MOE
diff --git a/docs/SNAC_IMPLEMENTATION.md b/docs/SNAC_IMPLEMENTATION.md
new file mode 100644
index 0000000000000..864bdd2b8ab94
--- /dev/null
+++ b/docs/SNAC_IMPLEMENTATION.md
@@ -0,0 +1,185 @@
+# SNAC Decoder Implementation for Orpheus TTS
+
+## Overview
+
+This document describes the implementation of SNAC (Multi-Scale Neural Audio Codec) decoder support in llama.cpp for Orpheus TTS models.
+
+## Current Status
+
+### ✅ Completed
+
+1. **Architecture Infrastructure**
+   - Added `LLM_ARCH_SNAC_DEC` architecture enum
+   - Registered "snac-dec" architecture name
+   - Defined 27 SNAC-specific tensor types
+   - Added tensor name mappings for decoder and quantizer components
+
+2. 
**GGUF Constants** + - Added `MODEL_ARCH.SNAC_DEC` to gguf constants + - Defined tensor enums for all SNAC components + - Added tensor name format strings + +3. **Model Conversion** + - Implemented `SnacDecModel` class in `convert_hf_to_gguf.py` + - Handles weight_norm parameters (skips _g and _v suffixes) + - Configures SNAC-specific hyperparameters + +### 🚧 In Progress / TODO + +1. **Model Loading (llama-model.cpp)** + - Need to implement SNAC decoder model loading + - Load decoder convolution layers + - Load vector quantizer components (in_proj, out_proj, codebook) + - Load attention layers if present + - Handle Snake activation parameters + +2. **Forward Pass Implementation (llama.cpp)** + - Implement SNAC decoder forward pass + - Vector quantization decoding (from_codes) + - Decoder blocks with: + - Transposed convolutions (upsampling) + - Residual units with dilated convolutions + - Snake activation function + - Local multi-head attention (if present) + - Output convolution and tanh activation + +3. **TTS Tool Integration (tools/tts/tts.cpp)** + - Add SNAC decoder option to TTS tool + - Support for multi-scale code input + - Audio generation from hierarchical codes + - Integration with Orpheus TTS models + +4. **Testing** + - Download and convert SNAC models from HuggingFace + - Test with Orpheus TTS models + - Validate audio quality + - Performance benchmarking + +## SNAC Architecture + +### Components + +1. **Encoder** (not needed for TTS, only for training) + - Input convolution + - Encoder blocks with strided convolutions + - Local attention (optional) + - Output convolution + +2. **Vector Quantizer** (needed for decoding) + - 4 quantization levels with different strides [8, 4, 2, 1] + - Each level has: + - `in_proj`: Projects latent to codebook dimension + - `codebook`: Embedding table (4096 x 8) + - `out_proj`: Projects back to latent dimension + - Residual quantization across levels + +3. 
**Decoder** (main component needed) + - Input convolution (or direct from quantizer output) + - Local attention (optional) + - Decoder blocks (4 blocks for standard config): + - Transposed convolution for upsampling + - 3 residual units with dilations [1, 3, 9] + - Snake activation + - Output convolution + tanh + +### Snake Activation + +Formula: `x + (1/alpha) * sin^2(alpha * x)` + +Can be implemented using existing ggml operations: +```c +// x_scaled = x * alpha +// sin_x = sin(x_scaled) +// sin2_x = sin_x * sin_x +// result = x + sin2_x / alpha +``` + +### Tensor Naming Convention + +Decoder tensors: +- `decoder.conv_in` - Input convolution +- `decoder.attn_norm`, `decoder.attn_q/k/v/out` - Attention (if present) +- `decoder.block.{i}.conv_up` - Upsampling transposed conv +- `decoder.block.{i}.conv1/2/3` - Residual unit convolutions +- `decoder.block.{i}.snake_alpha` - Snake activation parameters +- `decoder.conv_out` - Output convolution + +Quantizer tensors: +- `quantizer.{i}.in_proj` - Input projection for level i +- `quantizer.{i}.out_proj` - Output projection for level i +- `quantizer.{i}.codebook` - Codebook embeddings for level i + +## Model Conversion + +### Converting SNAC Models + +```bash +# Download SNAC model +git clone https://huggingface.co/hubertsiuzdak/snac_24khz + +# Convert to GGUF +python convert_hf_to_gguf.py snac_24khz \ + --outfile snac-24khz-f16.gguf \ + --outtype f16 +``` + +### Expected Hyperparameters + +From SNAC config.json: +```json +{ + "sampling_rate": 24000, + "encoder_dim": 64, + "encoder_rates": [3, 3, 7, 7], + "latent_dim": 1344, + "decoder_dim": 1536, + "decoder_rates": [7, 7, 3, 3], + "attn_window_size": 32, + "codebook_size": 4096, + "codebook_dim": 8, + "vq_strides": [8, 4, 2, 1] +} +``` + +## Integration with Orpheus TTS + +Orpheus TTS uses a two-model architecture: +1. **Text-to-Codes Model**: LLM that generates hierarchical audio codes +2. 
**Codes-to-Speech Model**: SNAC decoder that converts codes to audio + +Usage flow: +``` +Text → Orpheus LLM → Multi-scale codes → SNAC Decoder → Audio waveform +``` + +## References + +- SNAC Paper: https://arxiv.org/abs/2410.14411 +- SNAC GitHub: https://github.com/hubertsiuzdak/snac +- Orpheus Models: https://huggingface.co/collections/canopylabs/orpheus-tts-67d9ea3f6c05a941c06ad9d2 +- OuteTTS Reference: PR #10784 in llama.cpp + +## Implementation Notes + +### Key Differences from WavTokenizer + +1. **Multi-scale Quantization**: SNAC uses 4 levels with different temporal resolutions +2. **Snake Activation**: Custom activation function (WavTokenizer uses standard activations) +3. **Simpler Architecture**: No PosNet or ConvNext blocks +4. **Hierarchical Codes**: Variable-length codes at different scales + +### Performance Considerations + +- SNAC is designed for low bitrate (0.98-2.6 kbps) +- Decoder is relatively lightweight +- Main computation in transposed convolutions and residual blocks +- Attention is optional and can be disabled for faster inference + +## Next Steps + +1. Implement model loading in `llama-model.cpp` +2. Implement forward pass in `llama.cpp` +3. Add SNAC support to TTS tool +4. Test with Orpheus models +5. Add documentation and examples +6. 
Performance optimization diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 1b71fb3749aaa..2151617f10350 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -400,6 +400,7 @@ class MODEL_ARCH(IntEnum): GRANITE_HYBRID = auto() CHAMELEON = auto() WAVTOKENIZER_DEC = auto() + SNAC_DEC = auto() PLM = auto() BAILINGMOE = auto() BAILINGMOE2 = auto() @@ -600,6 +601,33 @@ class MODEL_TENSOR(IntEnum): SHORTCONV_CONV = auto() SHORTCONV_INPROJ = auto() SHORTCONV_OUTPROJ = auto() + SNAC_ENC_CONV_IN = auto() + SNAC_ENC_BLK_CONV1 = auto() + SNAC_ENC_BLK_CONV2 = auto() + SNAC_ENC_BLK_CONV3 = auto() + SNAC_ENC_BLK_CONV_DS = auto() + SNAC_ENC_BLK_SNAKE_ALPHA = auto() + SNAC_ENC_CONV_OUT = auto() + SNAC_ENC_ATTN_NORM = auto() + SNAC_ENC_ATTN_Q = auto() + SNAC_ENC_ATTN_K = auto() + SNAC_ENC_ATTN_V = auto() + SNAC_ENC_ATTN_OUT = auto() + SNAC_VQ_IN_PROJ = auto() + SNAC_VQ_OUT_PROJ = auto() + SNAC_VQ_CODEBOOK = auto() + SNAC_DEC_CONV_IN = auto() + SNAC_DEC_ATTN_NORM = auto() + SNAC_DEC_ATTN_Q = auto() + SNAC_DEC_ATTN_K = auto() + SNAC_DEC_ATTN_V = auto() + SNAC_DEC_ATTN_OUT = auto() + SNAC_DEC_BLK_CONV_UP = auto() + SNAC_DEC_BLK_CONV1 = auto() + SNAC_DEC_BLK_CONV2 = auto() + SNAC_DEC_BLK_CONV3 = auto() + SNAC_DEC_BLK_SNAKE_ALPHA = auto() + SNAC_DEC_CONV_OUT = auto() # vision V_MMPROJ = auto() V_MMPROJ_FC = auto() @@ -745,6 +773,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GRANITE_HYBRID: "granitehybrid", MODEL_ARCH.CHAMELEON: "chameleon", MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec", + MODEL_ARCH.SNAC_DEC: "snac-dec", MODEL_ARCH.PLM: "plm", MODEL_ARCH.BAILINGMOE: "bailingmoe", MODEL_ARCH.BAILINGMOE2: "bailingmoe2", @@ -946,6 +975,21 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SHORTCONV_CONV: "blk.{bid}.shortconv.conv", MODEL_TENSOR.SHORTCONV_INPROJ: "blk.{bid}.shortconv.in_proj", MODEL_TENSOR.SHORTCONV_OUTPROJ: "blk.{bid}.shortconv.out_proj", + MODEL_TENSOR.SNAC_DEC_CONV_IN: "decoder.conv_in", + MODEL_TENSOR.SNAC_DEC_ATTN_NORM: 
"decoder.attn_norm", + MODEL_TENSOR.SNAC_DEC_ATTN_Q: "decoder.attn_q", + MODEL_TENSOR.SNAC_DEC_ATTN_K: "decoder.attn_k", + MODEL_TENSOR.SNAC_DEC_ATTN_V: "decoder.attn_v", + MODEL_TENSOR.SNAC_DEC_ATTN_OUT: "decoder.attn_out", + MODEL_TENSOR.SNAC_DEC_BLK_CONV_UP: "decoder.block.{bid}.conv_up", + MODEL_TENSOR.SNAC_DEC_BLK_CONV1: "decoder.block.{bid}.conv1", + MODEL_TENSOR.SNAC_DEC_BLK_CONV2: "decoder.block.{bid}.conv2", + MODEL_TENSOR.SNAC_DEC_BLK_CONV3: "decoder.block.{bid}.conv3", + MODEL_TENSOR.SNAC_DEC_BLK_SNAKE_ALPHA: "decoder.block.{bid}.snake_alpha", + MODEL_TENSOR.SNAC_DEC_CONV_OUT: "decoder.conv_out", + MODEL_TENSOR.SNAC_VQ_IN_PROJ: "quantizer.{bid}.in_proj", + MODEL_TENSOR.SNAC_VQ_OUT_PROJ: "quantizer.{bid}.out_proj", + MODEL_TENSOR.SNAC_VQ_CODEBOOK: "quantizer.{bid}.codebook", # vision MODEL_TENSOR.V_MMPROJ: "mm.{bid}", MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc", @@ -2518,6 +2562,23 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POSNET_ATTN_V, MODEL_TENSOR.POSNET_ATTN_OUT, ], + MODEL_ARCH.SNAC_DEC: [ + MODEL_TENSOR.SNAC_DEC_CONV_IN, + MODEL_TENSOR.SNAC_DEC_ATTN_NORM, + MODEL_TENSOR.SNAC_DEC_ATTN_Q, + MODEL_TENSOR.SNAC_DEC_ATTN_K, + MODEL_TENSOR.SNAC_DEC_ATTN_V, + MODEL_TENSOR.SNAC_DEC_ATTN_OUT, + MODEL_TENSOR.SNAC_DEC_BLK_CONV_UP, + MODEL_TENSOR.SNAC_DEC_BLK_CONV1, + MODEL_TENSOR.SNAC_DEC_BLK_CONV2, + MODEL_TENSOR.SNAC_DEC_BLK_CONV3, + MODEL_TENSOR.SNAC_DEC_BLK_SNAKE_ALPHA, + MODEL_TENSOR.SNAC_DEC_CONV_OUT, + MODEL_TENSOR.SNAC_VQ_IN_PROJ, + MODEL_TENSOR.SNAC_VQ_OUT_PROJ, + MODEL_TENSOR.SNAC_VQ_CODEBOOK, + ], MODEL_ARCH.BAILINGMOE: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 8ca769c5fd2ef..5ce3f3568a6ac 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -83,6 +83,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GRANITE_HYBRID, "granitehybrid" }, { LLM_ARCH_CHAMELEON, "chameleon" }, { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, + { LLM_ARCH_SNAC_DEC, "snac-dec" }, { 
LLM_ARCH_PLM, "plm" }, { LLM_ARCH_BAILINGMOE, "bailingmoe" }, { LLM_ARCH_BAILINGMOE2, "bailingmoe2" }, @@ -1926,6 +1927,26 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" }, }, }, + { + LLM_ARCH_SNAC_DEC, + { + { LLM_TENSOR_SNAC_DEC_CONV_IN, "decoder.conv_in" }, + { LLM_TENSOR_SNAC_DEC_ATTN_NORM, "decoder.attn_norm" }, + { LLM_TENSOR_SNAC_DEC_ATTN_Q, "decoder.attn_q" }, + { LLM_TENSOR_SNAC_DEC_ATTN_K, "decoder.attn_k" }, + { LLM_TENSOR_SNAC_DEC_ATTN_V, "decoder.attn_v" }, + { LLM_TENSOR_SNAC_DEC_ATTN_OUT, "decoder.attn_out" }, + { LLM_TENSOR_SNAC_DEC_BLK_CONV_UP, "decoder.block.%d.conv_up" }, + { LLM_TENSOR_SNAC_DEC_BLK_CONV1, "decoder.block.%d.conv1" }, + { LLM_TENSOR_SNAC_DEC_BLK_CONV2, "decoder.block.%d.conv2" }, + { LLM_TENSOR_SNAC_DEC_BLK_CONV3, "decoder.block.%d.conv3" }, + { LLM_TENSOR_SNAC_DEC_BLK_SNAKE_ALPHA, "decoder.block.%d.snake_alpha" }, + { LLM_TENSOR_SNAC_DEC_CONV_OUT, "decoder.conv_out" }, + { LLM_TENSOR_SNAC_VQ_IN_PROJ, "quantizer.%d.in_proj" }, + { LLM_TENSOR_SNAC_VQ_OUT_PROJ, "quantizer.%d.out_proj" }, + { LLM_TENSOR_SNAC_VQ_CODEBOOK, "quantizer.%d.codebook" }, + }, + }, { LLM_ARCH_BAILINGMOE, { diff --git a/src/llama-arch.h b/src/llama-arch.h index dea725c1a753a..b9398d46955c5 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -87,6 +87,7 @@ enum llm_arch { LLM_ARCH_GRANITE_HYBRID, LLM_ARCH_CHAMELEON, LLM_ARCH_WAVTOKENIZER_DEC, + LLM_ARCH_SNAC_DEC, LLM_ARCH_PLM, LLM_ARCH_BAILINGMOE, LLM_ARCH_BAILINGMOE2, @@ -461,6 +462,33 @@ enum llm_tensor { LLM_TENSOR_NEXTN_HNORM, LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, + LLM_TENSOR_SNAC_ENC_CONV_IN, + LLM_TENSOR_SNAC_ENC_BLK_CONV1, + LLM_TENSOR_SNAC_ENC_BLK_CONV2, + LLM_TENSOR_SNAC_ENC_BLK_CONV3, + LLM_TENSOR_SNAC_ENC_BLK_CONV_DS, + LLM_TENSOR_SNAC_ENC_BLK_SNAKE_ALPHA, + LLM_TENSOR_SNAC_ENC_CONV_OUT, + LLM_TENSOR_SNAC_ENC_ATTN_NORM, + LLM_TENSOR_SNAC_ENC_ATTN_Q, + LLM_TENSOR_SNAC_ENC_ATTN_K, + LLM_TENSOR_SNAC_ENC_ATTN_V, + 
LLM_TENSOR_SNAC_ENC_ATTN_OUT, + LLM_TENSOR_SNAC_VQ_IN_PROJ, + LLM_TENSOR_SNAC_VQ_OUT_PROJ, + LLM_TENSOR_SNAC_VQ_CODEBOOK, + LLM_TENSOR_SNAC_DEC_CONV_IN, + LLM_TENSOR_SNAC_DEC_ATTN_NORM, + LLM_TENSOR_SNAC_DEC_ATTN_Q, + LLM_TENSOR_SNAC_DEC_ATTN_K, + LLM_TENSOR_SNAC_DEC_ATTN_V, + LLM_TENSOR_SNAC_DEC_ATTN_OUT, + LLM_TENSOR_SNAC_DEC_BLK_CONV_UP, + LLM_TENSOR_SNAC_DEC_BLK_CONV1, + LLM_TENSOR_SNAC_DEC_BLK_CONV2, + LLM_TENSOR_SNAC_DEC_BLK_CONV3, + LLM_TENSOR_SNAC_DEC_BLK_SNAKE_ALPHA, + LLM_TENSOR_SNAC_DEC_CONV_OUT, }; enum llm_tensor_layer { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e460996330080..cc3e1b26f05cd 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -20290,6 +20290,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_RWKV7: case LLM_ARCH_ARWKV7: case LLM_ARCH_WAVTOKENIZER_DEC: + case LLM_ARCH_SNAC_DEC: case LLM_ARCH_NEMOTRON_H: return LLAMA_ROPE_TYPE_NONE;