Skip to content

Commit ffa5f6a

Browse files
committed
fixes on maya1.
1 parent 8c54094 commit ffa5f6a

File tree

4 files changed

+36
-16
lines changed

4 files changed

+36
-16
lines changed

docs/models.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -354,8 +354,6 @@ Please use `--format completion` for these models.
354354

355355
Use `--set voice XX` to describe the voice. [More info](https://huggingface.co/maya-research/maya1/blob/fbd30e2b3ec92d2e227df20005a73e172bc5d2de/prompt.txt).
356356

357-
IMPORTANT: don't forget to use `--max-new-tokens N` to control the length of the result.
358-
359357
* Orpheus TTS
360358
* [x] 3B: [EN](https://huggingface.co/canopylabs/orpheus-3b-0.1-ft/tree/4206a56e5a68cf6cf96900a8a78acd3370c02eb6), [ZH](https://huggingface.co/canopylabs/3b-zh-ft-research_release/commit/29d016d6d0e5a2688267d3b3e432b7e23f043876), etc
361359

models/maya.cpp

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ namespace chatllm::maya::v1
66
typedef orpheus::tts::Config Config;
77

88

9-
class Tokenizer : public orpheus::tts::Tokenizer
9+
class Tokenizer : public orpheus::tts::Tokenizer
1010
{
1111
public:
1212
using orpheus::tts::Tokenizer::Tokenizer;
@@ -26,18 +26,46 @@ namespace chatllm::maya::v1
2626
oss << "\"> ";
2727
oss << text;
2828

29+
// https://github.com/MayaResearch/maya1-fastapi/blob/63099d92bce2431a301982e01d52cd018d69a99b/transformers_inference.py#L25-L30
30+
// which is exactly the same as Orpheus
31+
const int SOH_ID = 128259;
32+
const int EOH_ID = 128260;
33+
const int SOA_ID = 128261;
34+
const int BOS_ID = 128000;
35+
const int TEXT_EOT_ID = 128009;
36+
const int CODE_START_TOKEN_ID = 128257;
37+
38+
ids.push_back(SOH_ID);
2939
ids.push_back(bos_token_id);
3040
BaseTokenizer::encode(oss.str(), ids);
31-
ids.push_back(128009);
41+
ids.push_back(TEXT_EOT_ID);
42+
ids.push_back(EOH_ID);
43+
ids.push_back(SOA_ID);
44+
ids.push_back(CODE_START_TOKEN_ID);
3245
}
3346
};
3447

3548
class ConditionalGeneration : public orpheus::tts::ConditionalGeneration
3649
{
3750
public:
3851
ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config):
39-
orpheus::tts::ConditionalGeneration(config, runtime_config, MODEL_TYPE_MAYA1, 128256, 156938) // 128266, 156937)
52+
orpheus::tts::ConditionalGeneration(config, runtime_config, MODEL_TYPE_MAYA1, 128266, 156937)
53+
{
54+
}
55+
protected:
56+
void decoder_push_llm_tok_id(const GenerationConfig &gen_config, int id, std::vector<float> &pcm_samples) override
4057
{
58+
id = id % codec_config.codebook_size;
59+
60+
pcm_samples.clear();
61+
62+
vocoder_ids.push_back(id);
63+
auto count = vocoder_ids.size();
64+
if (((count % orpheus::snac::FRAME_SIZE) == 0) && (count >= 28))
65+
{
66+
std::vector<int> multiframe(vocoder_ids.end() - 28, vocoder_ids.end());
67+
codec->decode_frame(gen_config, backend_context, multiframe, count, pcm_samples);
68+
}
4169
}
4270
};
4371

models/orpheus.cpp

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,7 @@ namespace chatllm::orpheus::tts
403403
}
404404

405405
ConditionalGeneration::ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config)
406-
: ConditionalGeneration(config, runtime_config, MODEL_TYPE_ORPHEUS_TTS, 128256, 156938)
406+
: ConditionalGeneration(config, runtime_config, MODEL_TYPE_ORPHEUS_TTS, 128266, 156937)
407407
{
408408
}
409409

@@ -517,21 +517,14 @@ namespace chatllm::orpheus::tts
517517

518518
void ConditionalGeneration::decoder_push_llm_tok_id(const GenerationConfig &gen_config, int id, std::vector<float> &pcm_samples)
519519
{
520-
id = id - 10 - ((vocoder_ids.size() % 7) * 4096);
520+
id = id - ((vocoder_ids.size() % snac::FRAME_SIZE) * codec_config.codebook_size);
521521
if (id < 0) return;
522-
523-
// Ensure the SNAC code is within codebook bounds
524-
if (id >= codec_config.codebook_size) {
525-
ggml::log(GGML_LOG_LEVEL_WARN, "SNAC code %d exceeds codebook_size %d, clamping with modulo\n",
526-
id, codec_config.codebook_size);
527-
id = id % codec_config.codebook_size;
528-
}
529522

530523
pcm_samples.clear();
531524

532525
vocoder_ids.push_back(id);
533526
auto count = vocoder_ids.size();
534-
if (((count % 7) == 0) && (count >= 28))
527+
if (((count % snac::FRAME_SIZE) == 0) && (count >= 28))
535528
{
536529
std::vector<int> multiframe(vocoder_ids.end() - 28, vocoder_ids.end());
537530
codec->decode_frame(gen_config, backend_context, multiframe, count, pcm_samples);

models/orpheus.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ namespace chatllm::orpheus::snac
77
{
88
const int MAX_VQ_STRIDES = 4;
99
const int MAX_RATES = 4;
10+
const int FRAME_SIZE = 7;
1011

1112
struct Config
1213
{
@@ -169,7 +170,7 @@ namespace chatllm::orpheus::tts
169170
ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type, int custom_token_start, int custom_token_end);
170171

171172
void reset_decoder(void);
172-
void decoder_push_llm_tok_id(const GenerationConfig &gen_config, int id, std::vector<float> &pcm_samples);
173+
virtual void decoder_push_llm_tok_id(const GenerationConfig &gen_config, int id, std::vector<float> &pcm_samples);
173174
protected:
174175
InitContext snac_ctx;
175176
snac::Config codec_config;

0 commit comments

Comments
 (0)