diff --git a/pyproject.toml b/pyproject.toml
index 999739f31..b6f90274c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ authors = [
     {name = "resemble-ai", email = "engineering@resemble.ai"}
 ]
 dependencies = [
-    "numpy>=1.24.0,<1.26.0",
+    "numpy>=1.24.0,<2.4",
     "librosa==0.11.0",
     "s3tokenizer",
     "torch==2.6.0",
@@ -20,16 +20,21 @@ dependencies = [
     "conformer==0.3.2",
     "safetensors==0.5.3",
     "spacy-pkuseg",
-    "pykakasi==2.3.0",
-    "gradio==5.44.1",
     "pyloudnorm",
-    "omegaconf"
 ]
 
 [project.urls]
 Homepage = "https://github.com/resemble-ai/chatterbox"
 Repository = "https://github.com/resemble-ai/chatterbox"
 
+[project.optional-dependencies]
+examples = [
+    "gradio==5.44.1",
+]
+jp = [
+    "pykakasi==2.3.0",
+]
+
 [build-system]
 requires = ["setuptools>=61.0"]
 build-backend = "setuptools.build_meta"
diff --git a/src/chatterbox/models/s3gen/flow.py b/src/chatterbox/models/s3gen/flow.py
index 12f6715ec..28d8d1e44 100644
--- a/src/chatterbox/models/s3gen/flow.py
+++ b/src/chatterbox/models/s3gen/flow.py
@@ -21,8 +21,6 @@ from torch.nn import functional as F
 from .utils.mask import make_pad_mask
 from .configs import CFM_PARAMS
 
-from omegaconf import DictConfig
-
 logger = logging.getLogger(__name__)
 
 
@@ -40,6 +38,18 @@ def _repeat_batch_dim(tnsr, B, ndim):
     return tnsr
 
 
+DEFAULT_DECODER_CONF = {
+    'in_channels': 240,
+    'out_channel': 80,
+    'spk_emb_dim': 80,
+    'n_spks': 1,
+    'cfm_params': {'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine', 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'},
+    'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64, 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'},
+}
+
+DEFAULT_MEL_FEAT_CONF = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050, 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}
+
+
 class CausalMaskedDiffWithXvec(torch.nn.Module):
     def __init__(self,
                  input_size: int = 512,
@@ -53,17 +63,8 @@ def __init__(self,
                  pre_lookahead_len: int = 3,
                  encoder: torch.nn.Module = None,
                  decoder: torch.nn.Module = None,
-                 decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1,
-                                       'cfm_params': DictConfig(
-                                           {'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine',
-                                            'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7,
-                                            'reg_loss_type': 'l1'}),
-                                       'decoder_params': {'channels': [256, 256], 'dropout': 0.0,
-                                                          'attention_head_dim': 64,
-                                                          'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8,
-                                                          'act_fn': 'gelu'}},
-                 mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050,
-                                        'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}):
+                 decoder_conf: Dict = DEFAULT_DECODER_CONF,
+                 mel_feat_conf: Dict = DEFAULT_MEL_FEAT_CONF):
         super().__init__()
         self.input_size = input_size
         self.output_size = output_size
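With `gradio` and `pykakasi` moved into the `examples` and `jp` extras, a source checkout pulls them in via `pip install .[examples,jp]` rather than by default. The flow.py change drops the `omegaconf` dependency: the `cfm_params` entry of the default `decoder_conf` is now a plain dict instead of a `DictConfig`. Below is a minimal sketch (not part of the diff) of what that difference means for any downstream code that still reads these parameters with attribute access; the hypothetical `cfm_params.sigma_min` usage is an assumption for illustration, and since the runtime config appears to come from `CFM_PARAMS` in `.configs`, chatterbox itself may be unaffected.

```python
# Illustrative sketch only: plain dicts do not support the attribute-style
# access that omegaconf's DictConfig provided.
from types import SimpleNamespace

cfm_params = {'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine',
              'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}

print(cfm_params['sigma_min'])   # dict-style access works with the new plain-dict default
# cfm_params.sigma_min           # would raise AttributeError on a plain dict

# If attribute access is still needed somewhere, a stdlib wrapper avoids re-adding omegaconf:
cfm_ns = SimpleNamespace(**cfm_params)
print(cfm_ns.solver)             # 'euler'
```

One further note on the refactor: `DEFAULT_DECODER_CONF` is used directly as a default argument, so (exactly like the old inline dict literal) it is evaluated once and shared across instances; callers that mutate `decoder_conf` in place should pass their own copy.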