Skip to content

Commit ac1397f

Browse files
authored
add input wav and delay time monitor (#1295)
1 parent a47aad5 commit ac1397f

File tree

2 files changed

+63
-29
lines changed

2 files changed

+63
-29
lines changed

gui_v1.py

Lines changed: 63 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import multiprocessing
1515

1616
logger = logging.getLogger(__name__)
17-
17+
stream_latency = -1
1818

1919
class Harvest(multiprocessing.Process):
2020
def __init__(self, inp_q, opt_q):
@@ -100,7 +100,8 @@ class GUI:
100100
def __init__(self) -> None:
101101
self.config = GUIConfig()
102102
self.flag_vc = False
103-
103+
self.function = 'vc'
104+
self.delay_time = 0
104105
self.launcher()
105106

106107
def load(self):
@@ -112,6 +113,10 @@ def load(self):
112113
data["harvest"] = data["f0method"] == "harvest"
113114
data["crepe"] = data["f0method"] == "crepe"
114115
data["rmvpe"] = data["f0method"] == "rmvpe"
116+
if data["sg_input_device"] not in input_devices:
117+
data["sg_input_device"] = input_devices[sd.default.device[0]]
118+
if data["sg_output_device"] not in output_devices:
119+
data["sg_output_device"] = output_devices[sd.default.device[1]]
115120
except:
116121
with open("configs/config.json", "w") as j:
117122
data = {
@@ -342,6 +347,22 @@ def launcher(self):
342347
[
343348
sg.Button(i18n("开始音频转换"), key="start_vc"),
344349
sg.Button(i18n("停止音频转换"), key="stop_vc"),
350+
sg.Radio(
351+
i18n("输入监听"),
352+
"function",
353+
key="im",
354+
default=False,
355+
enable_events=True,
356+
),
357+
sg.Radio(
358+
i18n("输出变声"),
359+
"function",
360+
key="vc",
361+
default=True,
362+
enable_events=True,
363+
),
364+
sg.Text(i18n("算法延迟(ms):")),
365+
sg.Text("0", key="delay_time"),
345366
sg.Text(i18n("推理时间(ms):")),
346367
sg.Text("0", key="infer_time"),
347368
],
@@ -403,9 +424,16 @@ def event_handler(self):
403424
}
404425
with open("configs/config.json", "w") as j:
405426
json.dump(settings, j)
427+
global stream_latency
428+
while stream_latency < 0:
429+
time.sleep(0.01)
430+
self.delay_time = stream_latency + values["block_time"] + values["crossfade_length"] + 0.01
431+
if values["I_noise_reduce"]:
432+
self.delay_time += values["crossfade_length"]
433+
self.window["delay_time"].update(int(self.delay_time * 1000))
406434
if event == "stop_vc" and self.flag_vc == True:
407435
self.flag_vc = False
408-
436+
stream_latency = -1
409437
# Parameter hot update
410438
if event == "threhold":
411439
self.config.threhold = values["threhold"]
@@ -423,11 +451,17 @@ def event_handler(self):
423451
self.config.f0method = event
424452
elif event == "I_noise_reduce":
425453
self.config.I_noise_reduce = values["I_noise_reduce"]
454+
if stream_latency > 0:
455+
self.delay_time += (1 if values["I_noise_reduce"] else -1) * values["crossfade_length"]
456+
self.window["delay_time"].update(int(self.delay_time * 1000))
426457
elif event == "O_noise_reduce":
427458
self.config.O_noise_reduce = values["O_noise_reduce"]
459+
elif event in ["vc", "im"]:
460+
self.function = event
428461
elif event != "start_vc" and self.flag_vc == True:
429462
# Other parameters do not support hot update
430463
self.flag_vc = False
464+
stream_latency = -1
431465

432466
def set_values(self, values):
433467
if len(values["pth_path"].strip()) == 0:
@@ -565,7 +599,9 @@ def soundinput(self):
565599
blocksize=self.block_frame,
566600
samplerate=self.config.samplerate,
567601
dtype="float32",
568-
):
602+
) as stream:
603+
global stream_latency
604+
stream_latency = stream.latency[-1]
569605
while self.flag_vc:
570606
time.sleep(self.config.block_time)
571607
logger.debug("Audio block passed.")
@@ -597,7 +633,7 @@ def audio_callback(
597633
self.block_frame_16k :
598634
].clone()
599635
# input noise reduction and resampling
600-
if self.config.I_noise_reduce:
636+
if self.config.I_noise_reduce and self.function == 'vc':
601637
input_wav = self.input_wav[
602638
-self.crossfade_frame - self.block_frame - 2 * self.zc :
603639
]
@@ -621,23 +657,28 @@ def audio_callback(
621657
self.input_wav[-self.block_frame - 2 * self.zc :]
622658
)[160:]
623659
# infer
624-
f0_extractor_frame = self.block_frame_16k + 800
625-
if self.config.f0method == "rmvpe":
626-
f0_extractor_frame = 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160
627-
infer_wav = self.rvc.infer(
628-
self.input_wav_res,
629-
self.input_wav_res[-f0_extractor_frame:].cpu().numpy(),
630-
self.block_frame_16k,
631-
self.valid_rate,
632-
self.pitch,
633-
self.pitchf,
634-
self.config.f0method,
635-
)
636-
infer_wav = infer_wav[
637-
-self.crossfade_frame - self.sola_search_frame - self.block_frame :
638-
]
660+
if self.function == 'vc':
661+
f0_extractor_frame = self.block_frame_16k + 800
662+
if self.config.f0method == "rmvpe":
663+
f0_extractor_frame = 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160
664+
infer_wav = self.rvc.infer(
665+
self.input_wav_res,
666+
self.input_wav_res[-f0_extractor_frame:].cpu().numpy(),
667+
self.block_frame_16k,
668+
self.valid_rate,
669+
self.pitch,
670+
self.pitchf,
671+
self.config.f0method,
672+
)
673+
infer_wav = infer_wav[
674+
-self.crossfade_frame - self.sola_search_frame - self.block_frame :
675+
]
676+
else:
677+
infer_wav = self.input_wav[
678+
-self.crossfade_frame - self.sola_search_frame - self.block_frame :
679+
].clone()
639680
# output noise reduction
640-
if self.config.O_noise_reduce:
681+
if (self.config.O_noise_reduce and self.function == 'vc') or (self.config.I_noise_reduce and self.function == 'im'):
641682
self.output_buffer[: -self.block_frame] = self.output_buffer[
642683
self.block_frame :
643684
].clone()
@@ -646,7 +687,7 @@ def audio_callback(
646687
infer_wav.unsqueeze(0), self.output_buffer.unsqueeze(0)
647688
).squeeze(0)
648689
# volume envelope mixing
649-
if self.config.rms_mix_rate < 1:
690+
if self.config.rms_mix_rate < 1 and self.function == 'vc':
650691
rms1 = librosa.feature.rms(
651692
y=self.input_wav_res[-160 * infer_wav.shape[0] // self.zc :]
652693
.cpu()

tools/torchgate/torchgate.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -211,13 +211,6 @@ def forward(
211211
Returns:
212212
torch.Tensor: The denoised audio signal, with the same shape as the input signal.
213213
"""
214-
assert x.ndim == 2
215-
if x.shape[-1] < self.win_length * 2:
216-
raise Exception(f"x must be bigger than {self.win_length * 2}")
217-
218-
assert xn is None or xn.ndim == 1 or xn.ndim == 2
219-
if xn is not None and xn.shape[-1] < self.win_length * 2:
220-
raise Exception(f"xn must be bigger than {self.win_length * 2}")
221214

222215
# Compute short-time Fourier transform (STFT)
223216
X = torch.stft(

0 commit comments

Comments
 (0)