UniFlow-Audio/modeling_uniflow_audio.py at master · wsntxxn/UniFlow-Audio · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
from typing import Any, Sequence
from pathlib import Path
import json
import os
import shutil

import h5py
from huggingface_hub import snapshot_download
from omegaconf import OmegaConf
from safetensors.torch import load_file
import hydra
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from transformers import T5EncoderModel, T5Tokenizer


class UniFlowAudioModel(nn.Module):
    def __init__(self, model_name: str = "wsntxxn/UniFlow-Audio-large"):
        assert model_name in (
            "wsntxxn/UniFlow-Audio-large",
            "wsntxxn/UniFlow-Audio-medium",
            "wsntxxn/UniFlow-Audio-small",
        )
        super().__init__()
        model_dir = snapshot_download(repo_id=model_name)
        model_dir = Path(model_dir)
        self.config = OmegaConf.load(model_dir / "config.yaml")
        self.config["model"]["autoencoder"]["pretrained_ckpt"] = str(
            model_dir / self.config["model"]["autoencoder"]["pretrained_ckpt"]
        )
        flan_t5_path = os.environ.get("FLAN_T5_PATH", "google/flan-t5-large")
        try:
            tokenizer = T5Tokenizer.from_pretrained(flan_t5_path)
            encoder = T5EncoderModel.from_pretrained(flan_t5_path)
        except Exception as e:
            raise RuntimeError(
                "Failed to initialize Flan-T5, please download it manually and set the `FLAN_T5_PATH`"
                "environment variable to the path of the downloaded model."
            ) from e
        self.config["model"]["content_encoder"]["text_encoder"]["model_name"
                                                               ] = flan_t5_path
        self.model = hydra.utils.instantiate(
            self.config["model"], _convert_="all"
        )
        state_dict = load_file(model_dir / "model.safetensors")
        self.model.load_pretrained(state_dict)
        self.model.eval()

        self.g2p_model_path = model_dir / "mfa_g2p" / "english_us_arpa_unhashed.zip"
        if not self.g2p_model_path.exists():
            ori_model_path = (model_dir / "mfa_g2p" /
                              "english_us_arpa.zip").resolve()
            shutil.copy(ori_model_path, self.g2p_model_path)

        self.tts_phone_set_path = model_dir / "mfa_g2p" / "phone_set.json"
        self.tts_word2phone_dict_path = model_dir / "mfa_g2p" / "word2phone.json"
        self.build_tts_phone_mapping()
        self.svs_phone_set_path = model_dir / "svs" / "phone_set.json"
        singers = json.load(open(model_dir / "svs" / "spk_set.json", "r"))
        self.svs_singer_mapping = {
            singer: i
            for i, singer in enumerate(singers)
        }
        self.svs_pinyin2ph = model_dir / "svs" / "m4singer_pinyin2ph.txt"

        self.task_to_instructions = {}
        with h5py.File(model_dir / "instructions" / "t5_embeddings.h5") as hf:
            for key in hf.keys():
                self.task_to_instructions[key] = hf[key][()]

        self.init_instruction_encoder()

    def build_tts_phone_mapping(self):
        with open(self.tts_phone_set_path, "r", encoding="utf-8") as f:
            phone_set = json.load(f)

        self.tts_phone2id = {p: i for i, p in enumerate(phone_set)}

    def init_instruction_encoder(self):
        flan_t5_path = os.environ.get("FLAN_T5_PATH", "google/flan-t5-large")
        try:
            self.instruction_tokenizer = T5Tokenizer.from_pretrained(
                flan_t5_path
            )
            self.instruction_encoder = T5EncoderModel.from_pretrained(
                flan_t5_path
            )
        except Exception as e:
            raise RuntimeError(
                "Failed to initialize Flan-T5, please download it manually and set the `FLAN_T5_PATH`"
                "environment variable to the path of the downloaded model."
            ) from e

        self.instruction_encoder.eval()

    @torch.inference_mode()
    def encode_instruction(self, instruction: list[str], device: torch.device):
        with torch.amp.autocast(enabled=False):
            tokens = self.instruction_tokenizer(
                instruction,
                max_length=self.instruction_tokenizer.model_max_length,
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            input_ids = tokens.input_ids.to(device)
            attention_mask = tokens.attention_mask.to(device)
            output = self.instruction_encoder(
                input_ids=input_ids, attention_mask=attention_mask
            )
            output = output.last_hidden_state
            length = attention_mask.sum(dim=1)
            return output, length

    @torch.inference_mode()
    def sample(
        self,
        content: list[Any],
        task: list[str],
        is_time_aligned: Sequence[bool],
        instruction: list[str] | None = None,
        instruction_idx: list[int] | None = None,
        num_steps: int = 20,
        sway_sampling_coef: float | None = -1.0,
        guidance_scale: float = 3.0,
        disable_progress: bool = True,
    ):
        device = self.model.dummy_param.device

        if instruction is None:
            instructions = []
            instruction_lengths = []
            for sample_idx, task_ in enumerate(task):
                if instruction_idx:
                    instruction_idx_ = instruction_idx[sample_idx]
                else:
                    instruction_idx_ = 0
                instruction_ = self.task_to_instructions[
                    f"{task_}_{instruction_idx_}"]
                instructions.append(torch.as_tensor(instruction_))
                instruction_lengths.append(instruction_.shape[0])
            instructions = pad_sequence(instructions,
                                        batch_first=True).to(device)
            instruction_lengths = torch.as_tensor(instruction_lengths
                                                 ).to(device)
        else:
            instructions, instruction_lengths = self.encode_instruction(
                instruction, device
            )

        return self.model.inference(
            content, task, is_time_aligned, instructions, instruction_lengths,
            num_steps, sway_sampling_coef, guidance_scale, disable_progress
        )