Skip to content

Commit 52b3d0d

Browse files
Yosshi999 and Hiroshiba authored
fix compat breaking: revive workaround padding in decode() (#867)
Co-authored-by: Hiroshiba <[email protected]>
1 parent 918f226 commit 52b3d0d

File tree

3 files changed

+123
-104
lines changed

3 files changed

+123
-104
lines changed

crates/voicevox_core/src/synthesizer.rs

Lines changed: 101 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ mod inner {
100100
use std::{
101101
io::{Cursor, Write as _},
102102
marker::PhantomData,
103+
ops::Range,
103104
sync::Arc,
104105
};
105106
use tracing::info;
@@ -127,6 +128,30 @@ mod inner {
127128
use super::{AccelerationMode, AsyncForOnnxruntime, InitializeOptions, TtsOptions};
128129

129130
const DEFAULT_SAMPLING_RATE: u32 = 24000;
131+
/// Padding width (in frames) used by the workaround that keeps the audio from being cut off.
132+
const PADDING_FRAME_LENGTH: usize = 38; // (0.4 s * 24000 Hz / 256.0).round()
133+
/// Margin width (in frames) that must be kept before and after the audio features during synthesis.
134+
/// Derived from the model's receptive field.
135+
const MARGIN: usize = 14;
136+
/// Crops the features for the given frame range with a margin kept on both ends; panics if the range exceeds `frame_length` or if `start > end`.
137+
fn crop_with_margin(audio: &AudioFeature, range: Range<usize>) -> ndarray::ArrayView2<'_, f32> {
138+
if range.start > audio.frame_length || range.end > audio.frame_length {
139+
panic!(
140+
"{range:?} is out of range for audio feature of length {frame_length}",
141+
frame_length = audio.frame_length,
142+
);
143+
}
144+
if range.start > range.end {
145+
panic!("{range:?} is invalid because start > end",);
146+
}
147+
let range = range.start..range.end + 2 * MARGIN; // presumably `internal_state` carries MARGIN extra frames on each side, so the end shifts by 2 * MARGIN (confirm against the producer)
148+
audio.internal_state.slice(ndarray::s![range, ..]) // borrowed view over the selected rows, all columns
149+
}
150+
/// Removes the added safety margin from the generated waveform.
151+
fn trim_margin_from_wave(wave_with_margin: ndarray::Array1<f32>) -> ndarray::Array1<f32> {
152+
let len = wave_with_margin.len();
153+
wave_with_margin.slice_move(ndarray::s![MARGIN * 256..len - MARGIN * 256]) // drops MARGIN * 256 samples from each end (256 is presumably the number of samples per frame; confirm)
154+
}
130155

131156
/// 音声の中間表現。
132157
pub struct AudioFeature {
@@ -138,8 +163,6 @@ mod inner {
138163
pub frame_length: usize,
139164
/// フレームレート。全体の秒数は`frame_length / frame_rate`で表せる。
140165
pub frame_rate: f64,
141-
/// workaroundとして付け足されているパディング長。
142-
padding_frame_length: usize,
143166
/// 生成時に利用したクエリ。
144167
audio_query: AudioQuery,
145168
}
@@ -375,28 +398,12 @@ mod inner {
375398
}
376399
}
377400

378-
// 音が途切れてしまうのを避けるworkaround処理が入っている
379-
// NOTE: `render()`内でこのpaddingを取り除くために、padding_frame_lengthにpadding長を保持している。
380-
// TODO: 改善したらここのpadding処理を取り除く
381-
const PADDING_SIZE: f64 = 0.4;
382-
let padding_size =
383-
((PADDING_SIZE * DEFAULT_SAMPLING_RATE as f64) / 256.0).round() as usize;
384-
let start_and_end_padding_size = 2 * padding_size;
385-
let length_with_padding = f0.len() + start_and_end_padding_size;
386-
let f0_with_padding = make_f0_with_padding(&f0, length_with_padding, padding_size);
387-
let phoneme_with_padding = make_phoneme_with_padding(
388-
phoneme.as_flattened(),
389-
OjtPhoneme::num_phoneme(),
390-
length_with_padding,
391-
padding_size,
392-
);
393-
394401
let spec = self
395402
.generate_full_intermediate(
396-
f0_with_padding.len(),
403+
f0.len(),
397404
OjtPhoneme::num_phoneme(),
398-
&f0_with_padding,
399-
&phoneme_with_padding,
405+
&f0,
406+
phoneme.as_flattened(),
400407
style_id,
401408
)
402409
.await?;
@@ -405,7 +412,6 @@ mod inner {
405412
style_id,
406413
frame_length: f0.len(),
407414
frame_rate: (DEFAULT_SAMPLING_RATE as f64) / 256.0,
408-
padding_frame_length: padding_size,
409415
audio_query: audio_query.clone(),
410416
});
411417

@@ -457,46 +463,6 @@ mod inner {
457463
pitch,
458464
}
459465
}
460-
461-
fn make_f0_with_padding(
462-
f0_slice: &[f32],
463-
length_with_padding: usize,
464-
padding_size: usize,
465-
) -> Vec<f32> {
466-
// Workaround that keeps the audio from being cut off: surrounds the f0 values with `padding_size` zeros on each side.
467-
// Remove this function once the underlying issue is fixed.
468-
let mut f0_with_padding = Vec::with_capacity(length_with_padding);
469-
let padding = vec![0.0; padding_size];
470-
f0_with_padding.extend_from_slice(&padding);
471-
f0_with_padding.extend_from_slice(f0_slice);
472-
f0_with_padding.extend_from_slice(&padding);
473-
f0_with_padding
474-
}
475-
476-
fn make_phoneme_with_padding(
477-
phoneme_slice: &[f32],
478-
phoneme_size: usize,
479-
length_with_padding: usize,
480-
padding_size: usize,
481-
) -> Vec<f32> {
482-
// Workaround that keeps the audio from being cut off: surrounds the flattened phoneme vectors with `padding_size` padding frames on each side.
483-
// Remove this function once the underlying issue is fixed.
484-
let mut padding_phoneme = vec![0.0; phoneme_size];
485-
padding_phoneme[0] = 1.0; // one-hot frame selecting phoneme index 0 (presumably the pause/silence phoneme; confirm)
486-
let padding_phoneme_len = padding_phoneme.len();
487-
let padding_phonemes: Vec<f32> = padding_phoneme
488-
.into_iter()
489-
.cycle()
490-
.take(padding_phoneme_len * padding_size)
491-
.collect();
492-
let mut phoneme_with_padding =
493-
Vec::with_capacity(phoneme_size * length_with_padding);
494-
phoneme_with_padding.extend_from_slice(&padding_phonemes);
495-
phoneme_with_padding.extend_from_slice(phoneme_slice);
496-
phoneme_with_padding.extend_from_slice(&padding_phonemes);
497-
498-
phoneme_with_padding
499-
}
500466
}
501467

502468
pub(super) async fn render(
@@ -506,41 +472,20 @@ mod inner {
506472
end: usize,
507473
) -> Result<Vec<u8>> {
508474
// TODO: 44.1kHzなどの対応
509-
const MARGIN: usize = 14; // 使われているHifiGANのreceptive fieldから計算される安全マージン
510-
use std::cmp::min;
511-
// 実態(workaround paddingを含まない)上での区間
512-
let clipped_start = min(start, audio.frame_length);
513-
let clipped_end = min(end, audio.frame_length);
514-
// 指定領域が空の区間だった場合、ONNXRuntimeに渡す前に早期リターン
515-
if (clipped_start..clipped_end).is_empty() {
475+
if (start..end).is_empty() {
476+
// 指定区間が空のときは早期リターン
516477
return Ok(vec![]);
517478
}
518-
// マージンがデータからはみ出さないことを保証
519-
// cf. https://github.com/VOICEVOX/voicevox_core/pull/854#discussion_r1803691291
520-
if MARGIN > audio.padding_frame_length + clipped_start
521-
|| MARGIN > audio.padding_frame_length + (audio.frame_length - clipped_end)
522-
{
523-
unreachable!("Validation error: Too short padding for input, please report this issue on GitHub.");
524-
}
525-
let left_margin = MARGIN;
526-
let right_margin = MARGIN;
527-
// 安全マージンを追加したデータ上での区間
528-
let slice_start = audio.padding_frame_length + clipped_start - left_margin;
529-
let slice_end = audio.padding_frame_length + clipped_end + right_margin;
530-
let segment = audio
531-
.internal_state
532-
.slice(ndarray::s![slice_start..slice_end, ..]);
479+
let spec_segment = crop_with_margin(audio, start..end);
533480
let wave_with_margin = self
534-
.render_audio_segment(segment.into_owned(), audio.style_id)
481+
.render_audio_segment(spec_segment.to_owned(), audio.style_id)
535482
.await?;
536-
// 変換前に追加した安全マージンを生成音声から取り除く
537-
let wave = wave_with_margin
538-
.slice(ndarray::s![
539-
left_margin * 256..wave_with_margin.len() - right_margin * 256
540-
])
541-
.into_owned()
542-
.into_raw_vec();
543-
return Ok(to_s16le_pcm(&wave, &audio.audio_query));
483+
let wave = trim_margin_from_wave(wave_with_margin);
484+
return Ok(to_s16le_pcm(
485+
wave.as_slice()
486+
.expect("`trim_margin_from_wave` should just trim an array"),
487+
&audio.audio_query,
488+
));
544489

545490
fn to_s16le_pcm(
546491
wave: &[f32],
@@ -999,6 +944,10 @@ mod inner {
999944
Ok(output.into_raw_vec())
1000945
}
1001946

947+
/// モデル`generate_full_intermediate`の実行と、その前後の処理を行う。
948+
///
949+
/// 無音パディングを付加して音声特徴量を計算し、マージン込みの音声特徴量を返す。
950+
///
1002951
/// CPU-boundな操作なので、非同期ランタイム上では直接実行されるべきではない。
1003952
fn generate_full_intermediate(
1004953
&self,
@@ -1010,17 +959,69 @@ mod inner {
1010959
) -> Result<ndarray::Array2<f32>> {
1011960
let (model_id, inner_voice_id) = self.ids_for::<TalkDomain>(style_id)?;
1012961

1013-
let GenerateFullIntermediateOutput { spec } = self.run_session(
962+
// 音が途切れてしまうのを避けるworkaround処理が入っている
963+
// TODO: 改善したらここのpadding処理を取り除く
964+
let start_and_end_padding_size = 2 * PADDING_FRAME_LENGTH;
965+
let length_with_padding = f0.len() + start_and_end_padding_size;
966+
let f0_with_padding = make_f0_with_padding(f0, PADDING_FRAME_LENGTH);
967+
let phoneme_with_padding = make_phoneme_with_padding(
968+
phoneme_vector.into_shape([length, phoneme_size]).unwrap(),
969+
PADDING_FRAME_LENGTH,
970+
);
971+
972+
let GenerateFullIntermediateOutput {
973+
spec: spec_with_padding,
974+
} = self.run_session(
1014975
model_id,
1015976
GenerateFullIntermediateInput {
1016-
f0: f0.into_shape([length, 1]).unwrap(),
1017-
phoneme: phoneme_vector.into_shape([length, phoneme_size]).unwrap(),
977+
f0: f0_with_padding
978+
.into_shape([length_with_padding, 1])
979+
.unwrap(),
980+
phoneme: phoneme_with_padding,
1018981
speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]),
1019982
},
1020983
)?;
1021-
Ok(spec)
984+
985+
// マージンがデータからはみ出さないことを保証
986+
// cf. https://github.com/VOICEVOX/voicevox_core/pull/854#discussion_r1803691291
987+
if MARGIN > PADDING_FRAME_LENGTH {
988+
unreachable!("Validation error: Too short padding for input, please report this issue on GitHub.");
989+
}
990+
// マージン分を両端に残して音声特徴量を返す
991+
return Ok(spec_with_padding
992+
.slice(ndarray::s![
993+
PADDING_FRAME_LENGTH - MARGIN
994+
..spec_with_padding.nrows() - PADDING_FRAME_LENGTH + MARGIN,
995+
..
996+
])
997+
.to_owned());
998+
999+
fn make_f0_with_padding(
1000+
f0_slice: ndarray::Array1<f32>,
1001+
padding_size: usize,
1002+
) -> ndarray::Array1<f32> {
1003+
// Workaround that keeps the audio from being cut off: surrounds the f0 array with `padding_size` zeros on each side.
1004+
// Remove this function once the underlying issue is fixed.
1005+
let padding = ndarray::Array1::<f32>::zeros(padding_size);
1006+
ndarray::concatenate![ndarray::Axis(0), padding, f0_slice, padding]
1007+
}
1008+
1009+
fn make_phoneme_with_padding(
1010+
phoneme_slice: ndarray::Array2<f32>,
1011+
padding_size: usize,
1012+
) -> ndarray::Array2<f32> {
1013+
// Workaround that keeps the audio from being cut off: surrounds the phoneme matrix with `padding_size` padding rows on each side.
1014+
// Remove this function once the underlying issue is fixed.
1015+
let mut padding =
1016+
ndarray::Array2::<f32>::zeros((padding_size, phoneme_slice.ncols()));
1017+
padding
1018+
.slice_mut(ndarray::s![.., 0])
1019+
.assign(&ndarray::arr0(1.0)); // sets column 0 to 1.0 in every padding row (presumably the pause/silence phoneme; confirm)
1020+
ndarray::concatenate![ndarray::Axis(0), padding, phoneme_slice, padding]
1021+
}
10221022
}
10231023

1024+
/// 与えられた音声特徴量で音声生成。
10241025
/// CPU/GPU-boundな操作なので、非同期ランタイム上では直接実行されるべきではない。
10251026
fn render_audio_segment(
10261027
&self,
@@ -1049,8 +1050,9 @@ mod inner {
10491050
phoneme_vector,
10501051
style_id,
10511052
)?;
1052-
let output = self.render_audio_segment(intermediate, style_id)?;
1053-
Ok(output.into_raw_vec())
1053+
let output_with_margin = self.render_audio_segment(intermediate, style_id)?;
1054+
let output = trim_margin_from_wave(output_with_margin);
1055+
Ok(output.to_vec())
10541056
}
10551057
}
10561058

crates/voicevox_core_python_api/src/lib.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ mod blocking {
282282

283283
use camino::Utf8PathBuf;
284284
use pyo3::{
285+
exceptions::{PyIndexError, PyValueError},
285286
pyclass, pymethods,
286287
types::{IntoPyDict as _, PyBytes, PyDict, PyList},
287288
Py, PyAny, PyObject, PyRef, PyResult, Python,
@@ -709,6 +710,20 @@ mod blocking {
709710
end: usize,
710711
py: Python<'py>,
711712
) -> PyResult<&'py PyBytes> {
713+
if start > audio.frame_length() || end > audio.frame_length() {
714+
return Err(PyIndexError::new_err(format!(
715+
"({}, {}) is out of range for audio feature of length {}",
716+
start,
717+
end,
718+
audio.frame_length(),
719+
)));
720+
}
721+
if start > end {
722+
return Err(PyValueError::new_err(format!(
723+
"({}, {}) is invalid range because start > end",
724+
start, end,
725+
)));
726+
}
712727
let wav = &self
713728
.synthesizer
714729
.read()?

example/python/run.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,14 @@ def main() -> None:
5353
if streaming:
5454
logger.info("%s", "In streaming mode")
5555
chunk_sec = 1.0
56-
intermediate = synthesizer.precompute_render(audio_query, style_id)
57-
chunk_frames = int(intermediate.frame_rate * chunk_sec)
56+
audio_feature = synthesizer.precompute_render(audio_query, style_id)
57+
chunk_frames = int(audio_feature.frame_rate * chunk_sec)
5858
pcm = b""
59-
for i in range(0, intermediate.frame_length, chunk_frames):
60-
logger.info("%s", f"{i/intermediate.frame_length:.2%}")
61-
pcm += synthesizer.render(intermediate, i, i + chunk_frames)
59+
for i in range(0, audio_feature.frame_length, chunk_frames):
60+
logger.info("%s", f"{i/audio_feature.frame_length:.2%}")
61+
pcm += synthesizer.render(
62+
audio_feature, i, min(i + chunk_frames, audio_feature.frame_length)
63+
)
6264
logger.info("%s", f"100%")
6365
wav = wav_from_s16le(
6466
pcm, audio_query.output_sampling_rate, audio_query.output_stereo

0 commit comments

Comments
 (0)