
Commit f956b33

optimize(infer): move onnx into rvc
1 parent e81b7c5

File tree

12 files changed: +107 −144 lines

infer/lib/infer_pack/models_onnx.py

Lines changed: 0 additions & 119 deletions
This file was deleted.

rvc/onnx/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -1 +1,2 @@
 from .infer import RVC
+from .exporter import export_onnx
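
With this change the rvc.onnx package exposes both the ONNX runtime wrapper and the exporter from a single namespace, so downstream code only needs one import. A minimal sketch of the resulting import surface:

    # both names now live under rvc.onnx
    from rvc.onnx import RVC, export_onnx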

infer/modules/onnx/export.py renamed to rvc/onnx/exporter.py

Lines changed: 6 additions & 6 deletions
@@ -1,10 +1,10 @@
 import torch
 
-from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
+from .synthesizer import SynthesizerTrnMsNSFsid
 
 
-def export_onnx(ModelPath, ExportedPath):
-    cpt = torch.load(ModelPath, map_location="cpu")
+def export_onnx(from_cpkt_pth: str, to_onnx_pth: str) -> str:
+    cpt = torch.load(from_cpkt_pth, map_location="cpu")
     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
     vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768
 
@@ -17,8 +17,8 @@ def export_onnx(ModelPath, ExportedPath):
 
     device = "cpu"  # device used during export (does not affect how the exported model is used)
 
-    net_g = SynthesizerTrnMsNSFsidM(
-        *cpt["config"], is_half=False, encoder_dim=vec_channels
+    net_g = SynthesizerTrnMsNSFsid(
+        *cpt["config"], encoder_dim=vec_channels
     )  # export in fp32 (supporting fp16 in C++ would require manually rearranging memory, so fp16 is not used for now)
     net_g.load_state_dict(cpt["weight"], strict=False)
     input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
@@ -36,7 +36,7 @@ def export_onnx(ModelPath, ExportedPath):
            test_ds.to(device),
            test_rnd.to(device),
        ),
-       ExportedPath,
+       to_onnx_pth,
        dynamic_axes={
            "phone": [1],
            "pitch": [1],

rvc/onnx/f0predictors/dio.py

Lines changed: 2 additions & 2 deletions
@@ -23,7 +23,7 @@ def compute_f0(self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
         f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
         for index, pitch in enumerate(f0):
             f0[index] = round(pitch, 1)
-        return self.__interpolate_f0(self.__resize_f0(f0, p_len))[0]
+        return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
 
     def compute_f0_uv(
         self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
@@ -40,4 +40,4 @@ def compute_f0_uv(
         f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
         for index, pitch in enumerate(f0):
             f0[index] = round(pitch, 1)
-        return self.__interpolate_f0(self.__resize_f0(f0, p_len))
+        return self.interpolate_f0(self.resize_f0(f0, p_len))

rvc/onnx/f0predictors/f0.py

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ def compute_f0_uv(
         self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
     ): ...
 
-    def __interpolate_f0(self, f0: np.ndarray[Any, np.dtype]):
+    def interpolate_f0(self, f0: np.ndarray[Any, np.dtype]):
         """
         Interpolate the F0 sequence
         """
@@ -56,7 +56,7 @@ def __interpolate_f0(self, f0: np.ndarray[Any, np.dtype]):
 
         return ip_data[:, 0], vuv_vector[:, 0]
 
-    def __resize_f0(self, x: np.ndarray[Any, np.dtype], target_len: int):
+    def resize_f0(self, x: np.ndarray[Any, np.dtype], target_len: int):
         source = np.array(x)
         source[source < 0.001] = np.nan
         target = np.interp(

rvc/onnx/f0predictors/harvest.py

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@ def compute_f0(self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
             frame_period=1000 * self.hop_length / self.sampling_rate,
         )
         f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs)
-        return self.__interpolate_f0(self.__resize_f0(f0, p_len))[0]
+        return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
 
     def compute_f0_uv(
         self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
@@ -36,4 +36,4 @@ def compute_f0_uv(
             frame_period=1000 * self.hop_length / self.sampling_rate,
         )
         f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
-        return self.__interpolate_f0(self.__resize_f0(f0, p_len))
+        return self.interpolate_f0(self.resize_f0(f0, p_len))

rvc/onnx/f0predictors/pm.py

Lines changed: 2 additions & 2 deletions
@@ -31,7 +31,7 @@ def compute_f0(self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
         pad_size = (p_len - len(f0) + 1) // 2
         if pad_size > 0 or p_len - len(f0) - pad_size > 0:
             f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
-        f0, uv = self.__interpolate_f0(f0)
+        f0, uv = self.interpolate_f0(f0)
         return f0
 
     def compute_f0_uv(
@@ -57,5 +57,5 @@ def compute_f0_uv(
         pad_size = (p_len - len(f0) + 1) // 2
         if pad_size > 0 or p_len - len(f0) - pad_size > 0:
             f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
-        f0, uv = self.__interpolate_f0(f0)
+        f0, uv = self.interpolate_f0(f0)
         return f0, uv
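
The rename from __interpolate_f0/__resize_f0 to interpolate_f0/resize_f0 across the predictor classes is not cosmetic: a leading double underscore triggers Python name mangling, so a call like self.__interpolate_f0 written inside a subclass is looked up under the subclass's mangled name rather than under the base class that actually defines the helper. A toy illustration of the failure mode (classes invented for this example, not from the repository):

    class Base:
        def __helper(self):         # stored on the class as _Base__helper
            return "ok"

    class Child(Base):
        def use(self):
            return self.__helper()  # compiled as self._Child__helper

    Child().use()  # AttributeError: 'Child' object has no attribute '_Child__helper'

Dropping the underscores makes interpolate_f0 and resize_f0 ordinary inherited methods that DioF0Predictor, HarvestF0Predictor, and PMF0Predictor can call directly.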

rvc/onnx/infer.py

Lines changed: 8 additions & 7 deletions
@@ -1,10 +1,11 @@
+import typing
+import os
+
 import librosa
 import numpy as np
 import onnxruntime
-import typing
-import os
 
-from onnx.f0predictors import (
+from .f0predictors import (
     PMF0Predictor,
     HarvestF0Predictor,
     DioF0Predictor,
@@ -15,7 +16,7 @@
 class Model:
     def __init__(
         self,
-        path: str | bytes | os.PathLike,
+        path: typing.Union[str, bytes, os.PathLike],
         device: typing.Literal["cpu", "cuda", "dml"] = "cpu",
     ):
         if device == "cpu":
@@ -32,7 +33,7 @@ def __init__(
 class ContentVec(Model):
     def __init__(
         self,
-        vec_path: str | bytes | os.PathLike,
+        vec_path: typing.Union[str, bytes, os.PathLike],
         device: typing.Literal["cpu", "cuda", "dml"] = "cpu",
     ):
         super().__init__(vec_path, device)
@@ -66,9 +67,9 @@ def get_f0_predictor(
 class RVC(Model):
     def __init__(
         self,
-        model_path: str | bytes | os.PathLike,
+        model_path: typing.Union[str, bytes, os.PathLike],
         hop_len=512,
-        vec_path: str | bytes | os.PathLike = "vec-768-layer-12.onnx",
+        vec_path: typing.Union[str, bytes, os.PathLike] = "vec-768-layer-12.onnx",
         device: typing.Literal["cpu", "cuda", "dml"] = "cpu",
     ):
         super().__init__(model_path, device)
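
Two things change here besides moving the imports to the top: the annotations switch from the PEP 604 form str | bytes | os.PathLike to typing.Union[...], which keeps the module importable on Python versions before 3.10 (there the | union syntax in evaluated annotations raises a TypeError), and the f0 predictors are now imported from the package-local .f0predictors. Constructing the wrapper then looks roughly like this (file names are placeholders; the actual inference call is outside this hunk):

    from rvc.onnx import RVC

    model = RVC(
        "my_voice.onnx",                   # graph produced by export_onnx
        hop_len=512,
        vec_path="vec-768-layer-12.onnx",  # ContentVec feature extractor in ONNX form
        device="cpu",                      # or "cuda" / "dml"
    )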

rvc/onnx/synthesizer.py

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
+from typing import List, Optional, Union
+
+import torch
+
+from rvc.layers.synthesizers import SynthesizerTrnMsNSFsid as SynthesizerBase
+
+
+class SynthesizerTrnMsNSFsid(SynthesizerBase):
+    def __init__(
+        self,
+        spec_channels: int,
+        segment_size: int,
+        inter_channels: int,
+        hidden_channels: int,
+        filter_channels: int,
+        n_heads: int,
+        n_layers: int,
+        kernel_size: int,
+        p_dropout: int,
+        resblock: str,
+        resblock_kernel_sizes: List[int],
+        resblock_dilation_sizes: List[List[int]],
+        upsample_rates: List[int],
+        upsample_initial_channel: int,
+        upsample_kernel_sizes: List[int],
+        spk_embed_dim: int,
+        gin_channels: int,
+        sr: Optional[Union[str, int]],
+        encoder_dim: int,
+    ):
+        super().__init__(
+            spec_channels,
+            segment_size,
+            inter_channels,
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+            resblock,
+            resblock_kernel_sizes,
+            resblock_dilation_sizes,
+            upsample_rates,
+            upsample_initial_channel,
+            upsample_kernel_sizes,
+            spk_embed_dim,
+            gin_channels,
+            sr,
+            encoder_dim,
+            True,
+        )
+        self.speaker_map = None
+
+    def remove_weight_norm(self):
+        self.dec.remove_weight_norm()
+        self.flow.remove_weight_norm()
+        self.enc_q.remove_weight_norm()
+
+    def construct_spkmixmap(self):
+        self.speaker_map = torch.zeros((self.n_speaker, 1, 1, self.gin_channels))
+        for i in range(self.n_speaker):
+            self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))
+        self.speaker_map = self.speaker_map.unsqueeze(0)
+
+    def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
+        if self.speaker_map is not None:  # [N, S] * [S, B, 1, H]
+            g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1))  # [N, S, B, 1, 1]
+            g = g * self.speaker_map  # [N, S, B, 1, H]
+            g = torch.sum(g, dim=1)  # [N, 1, B, 1, H]
+            g = g.transpose(0, -1).transpose(0, -2).squeeze(0)  # [B, H, N]
+        else:
+            g = g.unsqueeze(0)
+            g = self.emb_g(g).transpose(1, 2)
+
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
+        return o
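
This class is the ONNX-facing replacement for the deleted SynthesizerTrnMsNSFsidM: it reuses the synthesizer from rvc.layers.synthesizers and only adds the speaker-map path plus an export-friendly forward. Once construct_spkmixmap() has been called, g is treated as per-frame mixing weights over speakers, and the broadcast in forward collapses them into one conditioning vector per frame. A shape-only sketch of that reduction (toy sizes, independent of any checkpoint):

    import torch

    N, S, H = 100, 4, 256                     # frames, speakers, gin_channels
    speaker_map = torch.randn(1, S, 1, 1, H)  # emb_g looked up per speaker, batch of 1
    g = torch.rand(N, S)                      # mixing weights per frame

    g = g.reshape(N, S, 1, 1, 1)              # [N, S, 1, 1, 1]
    g = g * speaker_map                       # [N, S, 1, 1, H]
    g = torch.sum(g, dim=1)                   # [N, 1, 1, H]
    g = g.transpose(0, -1).transpose(0, -2).squeeze(0)
    print(g.shape)                            # torch.Size([1, 256, 100]) -> [B, H, N]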

tools/onnx/export.py

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
-from infer.modules.onnx.export import export_onnx
+from rvc.onnx import export_onnx
 
 export_onnx("pt/Justin Bieber.pth", "pt/TestRvc_Rvc.onnx")
