@@ -33,11 +33,11 @@ def fold(x, n_group):
3333 """Fold audio or spectrogram's temporal dimension into groups.
3434
3535 Args:
36- x(Tensor): The input tensor. shape=(\ *, time_steps)
36+ x(Tensor): The input tensor. shape=(*, time_steps)
3737 n_group(int): The size of a group.
3838
3939 Returns:
40- Tensor: Folded tensor. shape=(\ *, time_steps // n_group, group)
40+ Tensor: Folded tensor. shape=(*, time_steps // n_group, group)
4141 """
4242 spatial_shape = list (x .shape [:- 1 ])
4343 time_steps = paddle .shape (x )[- 1 ]
@@ -98,11 +98,11 @@ def forward(self, x, trim_conv_artifact=False):
9898 trim_conv_artifact(bool, optional): Trim deconvolution artifact at each layer. Defaults to False.
9999
100100 Returns:
101- Tensor: The upsampled spectrogram. shape=(batch_size, input_channels, time_steps \ * upsample_factor)
101+ Tensor: The upsampled spectrogram. shape=(batch_size, input_channels, time_steps * upsample_factor)
102102
103103 Notes:
104104 If trim_conv_artifact is ``True``, the number of output time steps is less
105- than ``time_steps \ * upsample_factors``.
105+ than ``time_steps * upsample_factors``.
106106 """
107107 x = paddle .unsqueeze (x , 1 ) # (B, C, T) -> (B, 1, C, T)
108108 for layer in self :
@@ -641,7 +641,7 @@ def infer(self, mel):
641641 mel(np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel)
642642
643643 Returns:
644- Tensor: The synthesized audio, where``T <= T_mel \ * upsample_factors``. shape=(B, T)
644+ Tensor: The synthesized audio, where ``T <= T_mel * upsample_factors``. shape=(B, T)
645645 """
646646 start = time .time ()
647647 condition = self .encoder (mel , trim_conv_artifact = True ) # (B, C, T)
0 commit comments