Fixes for maya1: prompt token framing, custom token range, and SNAC code decoding.

foldl · foldl · commit ffa5f6a45b15 · 2025-11-07T11:20:53.000+08:00
diff --git a/docs/models.md b/docs/models.md
@@ -354,8 +354,6 @@ Please use `--format completion` for these models.
 
         Use `--set voice XX` to describe the voice. [More info](https://huggingface.co/maya-research/maya1/blob/fbd30e2b3ec92d2e227df20005a73e172bc5d2de/prompt.txt).
 
-        IMPORTANT: don't forget to use `--max-new-tokens N` to control the length of result.
-
 * Orpheus TTS
     * [x] 3B: [EN](https://huggingface.co/canopylabs/orpheus-3b-0.1-ft/tree/4206a56e5a68cf6cf96900a8a78acd3370c02eb6), [ZH](https://huggingface.co/canopylabs/3b-zh-ft-research_release/commit/29d016d6d0e5a2688267d3b3e432b7e23f043876), etc
 
diff --git a/models/maya.cpp b/models/maya.cpp
@@ -6,7 +6,7 @@ namespace chatllm::maya::v1
     typedef orpheus::tts::Config Config;
 
 
-    class Tokenizer : public  orpheus::tts::Tokenizer
+    class Tokenizer : public orpheus::tts::Tokenizer
     {
     public:
         using  orpheus::tts::Tokenizer::Tokenizer;
@@ -26,18 +26,46 @@ namespace chatllm::maya::v1
             oss << "\"> ";
             oss << text;
 
+            // https://github.com/MayaResearch/maya1-fastapi/blob/63099d92bce2431a301982e01d52cd018d69a99b/transformers_inference.py#L25-L30
+            // which is exactly the same as Orpheus
+            const int SOH_ID = 128259;
+            const int EOH_ID = 128260;
+            const int SOA_ID = 128261;
+            const int BOS_ID = 128000;
+            const int TEXT_EOT_ID = 128009;
+            const int CODE_START_TOKEN_ID = 128257;
+
+            ids.push_back(SOH_ID);
             ids.push_back(bos_token_id);
             BaseTokenizer::encode(oss.str(), ids);
-            ids.push_back(128009);
+            ids.push_back(TEXT_EOT_ID);
+            ids.push_back(EOH_ID);
+            ids.push_back(SOA_ID);
+            ids.push_back(CODE_START_TOKEN_ID);
         }
     };
 
     class ConditionalGeneration : public orpheus::tts::ConditionalGeneration
     {
     public:
         ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config):
-            orpheus::tts::ConditionalGeneration(config, runtime_config, MODEL_TYPE_MAYA1, 128256, 156938) // 128266, 156937)
+            orpheus::tts::ConditionalGeneration(config, runtime_config, MODEL_TYPE_MAYA1, 128266, 156937)
+        {
+        }
+    protected:
+        void decoder_push_llm_tok_id(const GenerationConfig &gen_config, int id, std::vector<float> &pcm_samples) override
         {
+            id = id % codec_config.codebook_size;
+
+            pcm_samples.clear();
+
+            vocoder_ids.push_back(id);
+            auto count = vocoder_ids.size();
+            if (((count % orpheus::snac::FRAME_SIZE) == 0) && (count >= 28))
+            {
+                std::vector<int> multiframe(vocoder_ids.end() - 28, vocoder_ids.end());
+                codec->decode_frame(gen_config, backend_context, multiframe, count, pcm_samples);
+            }
         }
     };
 
diff --git a/models/orpheus.cpp b/models/orpheus.cpp
@@ -403,7 +403,7 @@ namespace chatllm::orpheus::tts
     }
 
     ConditionalGeneration::ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config)
-        : ConditionalGeneration(config, runtime_config, MODEL_TYPE_ORPHEUS_TTS, 128256, 156938)
+        : ConditionalGeneration(config, runtime_config, MODEL_TYPE_ORPHEUS_TTS, 128266, 156937)
     {
     }
 
@@ -517,21 +517,14 @@ namespace chatllm::orpheus::tts
 
     void ConditionalGeneration::decoder_push_llm_tok_id(const GenerationConfig &gen_config, int id, std::vector<float> &pcm_samples)
     {
-        id = id - 10 - ((vocoder_ids.size() % 7) * 4096);
+        id = id - ((vocoder_ids.size() % snac::FRAME_SIZE) * codec_config.codebook_size);
         if (id < 0) return;
-        
-        // Ensure the SNAC code is within codebook bounds
-        if (id >= codec_config.codebook_size) {
-            ggml::log(GGML_LOG_LEVEL_WARN, "SNAC code %d exceeds codebook_size %d, clamping with modulo\n",
-                      id, codec_config.codebook_size);
-            id = id % codec_config.codebook_size;
-        }
 
         pcm_samples.clear();
 
         vocoder_ids.push_back(id);
         auto count = vocoder_ids.size();
-        if (((count % 7) == 0) && (count >= 28))
+        if (((count % snac::FRAME_SIZE) == 0) && (count >= 28))
         {
             std::vector<int> multiframe(vocoder_ids.end() - 28, vocoder_ids.end());
             codec->decode_frame(gen_config, backend_context, multiframe, count, pcm_samples);
diff --git a/models/orpheus.h b/models/orpheus.h
@@ -7,6 +7,7 @@ namespace chatllm::orpheus::snac
 {
     const int MAX_VQ_STRIDES = 4;
     const int MAX_RATES = 4;
+    const int FRAME_SIZE = 7;
 
     struct Config
     {
@@ -169,7 +170,7 @@ namespace chatllm::orpheus::tts
         ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type, int custom_token_start, int custom_token_end);
 
         void reset_decoder(void);
-        void decoder_push_llm_tok_id(const GenerationConfig &gen_config, int id, std::vector<float> &pcm_samples);
+        virtual void decoder_push_llm_tok_id(const GenerationConfig &gen_config, int id, std::vector<float> &pcm_samples);
     protected:
         InitContext snac_ctx;
         snac::Config codec_config;

Original file line number	Diff line number	Diff line change
`@@ -403,7 +403,7 @@ namespace chatllm::orpheus::tts`
`403`	`403`	`}`
`404`	`404`
`405`	`405`	`ConditionalGeneration::ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config)`
`406`		`- : ConditionalGeneration(config, runtime_config, MODEL_TYPE_ORPHEUS_TTS, 128256, 156938)`
	`406`	`+ : ConditionalGeneration(config, runtime_config, MODEL_TYPE_ORPHEUS_TTS, 128266, 156937)`
`407`	`407`	`{`
`408`	`408`	`}`
`409`	`409`
`@@ -517,21 +517,14 @@ namespace chatllm::orpheus::tts`
`517`	`517`
`518`	`518`	`void ConditionalGeneration::decoder_push_llm_tok_id(const GenerationConfig &gen_config, int id, std::vector<float> &pcm_samples)`
`519`	`519`	`{`
`520`		`- id = id - 10 - ((vocoder_ids.size() % 7) * 4096);`
	`520`	`+ id = id - ((vocoder_ids.size() % snac::FRAME_SIZE) * codec_config.codebook_size);`
`521`	`521`	`if (id < 0) return;`
`522`		`-`
`523`		`- // Ensure the SNAC code is within codebook bounds`
`524`		`- if (id >= codec_config.codebook_size) {`
`525`		`- ggml::log(GGML_LOG_LEVEL_WARN, "SNAC code %d exceeds codebook_size %d, clamping with modulo\n",`
`526`		`- id, codec_config.codebook_size);`
`527`		`- id = id % codec_config.codebook_size;`
`528`		`- }`
`529`	`522`
`530`	`523`	`pcm_samples.clear();`
`531`	`524`
`532`	`525`	`vocoder_ids.push_back(id);`
`533`	`526`	`auto count = vocoder_ids.size();`
`534`		`- if (((count % 7) == 0) && (count >= 28))`
	`527`	`+ if (((count % snac::FRAME_SIZE) == 0) && (count >= 28))`
`535`	`528`	`{`
`536`	`529`	`std::vector<int> multiframe(vocoder_ids.end() - 28, vocoder_ids.end());`
`537`	`530`	`codec->decode_frame(gen_config, backend_context, multiframe, count, pcm_samples);`
Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,7 @@ namespace chatllm::orpheus::snac`
`7`	`7`	`{`
`8`	`8`	`const int MAX_VQ_STRIDES = 4;`
`9`	`9`	`const int MAX_RATES = 4;`
	`10`	`+ const int FRAME_SIZE = 7;`
`10`	`11`
`11`	`12`	`struct Config`
`12`	`13`	`{`
`@@ -169,7 +170,7 @@ namespace chatllm::orpheus::tts`
`169`	`170`	`ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type, int custom_token_start, int custom_token_end);`
`170`	`171`
`171`	`172`	`void reset_decoder(void);`
`172`		`- void decoder_push_llm_tok_id(const GenerationConfig &gen_config, int id, std::vector<float> &pcm_samples);`
	`173`	`+ virtual void decoder_push_llm_tok_id(const GenerationConfig &gen_config, int id, std::vector<float> &pcm_samples);`
`173`	`174`	`protected:`
`174`	`175`	`InitContext snac_ctx;`
`175`	`176`	`snac::Config codec_config;`