Commit bec245d

Full squashed update of PC-NSF-HiFiGAN version (#16)

* Squashed commits from dev branch
* Edit finetune_ckpt_path to align old config
* Edit README

1 parent 842d997 · commit bec245d

13 files changed: +343 additions, −440 deletions

.gitignore

Lines changed: 5 additions & 1 deletion

@@ -157,4 +157,8 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
+
+/data/
+/experiments/
+/pretrained/

README.md

Lines changed: 36 additions & 13 deletions
@@ -20,7 +20,7 @@ data_input_path: [] # this is the input directory for your wav files
 
 data_out_path: [] # this is the output directory for your npz files; the preprocessed format is npz
 
-val_num: 1 # this is the number of validation files you want
+val_num: 10 # this is the number of validation files you want
 ```
 
 An example
@@ -30,20 +30,11 @@ data_input_path: ['wav/in1','wav/in2'] # input directories for your wav files
 data_out_path: ['wav/out1','wav/out2'] # output directories for your npz files
 val_num: 5 # the number of validation files you want; they are extracted automatically during preprocessing
 # the paths in the two lists correspond one-to-one, so the counts must match
-# preprocessing then scans all .wav files, including those in subfolders
+# preprocessing then scans all .wav and .flac files, including those in subfolders
 # normally only these three items need changing
 ```
-# Offline data augmentation
-Replace the preprocessing script with [process_aug.py](process_aug.py) and add the configuration entries
-```yaml
-key_aug: false # do not augment during training
-aug_min: 0.9 # minimum pitch-shift ratio
-aug_max: 1.4 # maximum pitch-shift ratio
-aug_num: 1 # data augmentation multiplier
-```
-Note that data augmentation may degrade audio quality!
 # Online data augmentation (recommended)
-Add the configuration entries; note that online data augmentation requires the [process.py](process.py) script, otherwise offline and online augmentation will stack
+Add the configuration entries
 ```yaml
 key_aug: true # augment during training
 key_aug_prob: 0.5 # augmentation probability
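The online augmentation entries above amount to randomly pitch-shifting each training sample at the configured probability. A minimal, dependency-free sketch of that idea; the function name, the ±5 semitone range, and the list-based f0 representation are illustrative assumptions, not the repo's actual implementation:

```python
import random

def augment_f0(f0_hz, key_aug_prob=0.5, max_key=5):
    """With probability key_aug_prob, shift an f0 curve by a random
    key offset in [-max_key, max_key] semitones (illustrative only)."""
    if random.random() >= key_aug_prob:
        return list(f0_hz), 0.0  # no augmentation for this sample
    key = random.uniform(-max_key, max_key)
    ratio = 2.0 ** (key / 12.0)  # semitone offset -> frequency ratio
    return [f * ratio for f in f0_hz], key
```

Since every frame is scaled by the same ratio, relative intervals are preserved: an octave apart stays an octave apart after the shift.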
@@ -82,13 +73,29 @@
 ```yaml
 data_input_path: [] # this list holds the paths to your original wav files
 data_out_path: [] # this list holds the paths for the preprocessed npz output
-val_num: 1 # the number of audio files sampled for validation
+val_num: 10 # the number of audio files sampled for validation
 ```
 Then run preprocessing
 ```sh
 python process.py --config (your config path) --num_cpu (number of cpu threads used in preprocessing) --strx (1 for a forced absolute path, 0 for a relative path)
 ```
 ## Training
+Adjust the configuration items for your GPU
+(mini_nsf and pc_aug are enabled by default; for special needs, disable them and edit the config file yourself, no recommendation is made here)
+
+Recommended configuration for 24 GB GPUs (the default, no changes needed)
+```yaml
+crop_mel_frames: 48
+batch_size: 10
+pc_aug_rate: 0.5
+```
+Recommended configuration for 16 GB GPUs (edit or add these entries manually)
+```yaml
+crop_mel_frames: 32
+batch_size: 10
+pc_aug_rate: 0.4
+```
+Training command
 ```sh
 python train.py --config (your config path) --exp_name (your ckpt name) --work_dir (working directory, optional)
 ```
@@ -140,3 +147,19 @@
 [univnet.yaml](configs/univnet.yaml) trains the original univnet
 
 [lvc_base_ddspgan.yaml](configs/lvc_base_ddspgan.yaml) trains the ddsp model with lvc filters
+
+# Special statement
+
+We regret to publish a verified Registry of Hostile Conduct (below). The registry documents individuals/entities who have engaged in long-term destructive behavior against the development team.
+We solemnly declare:
+
+1. All users are strongly advised to read this registry before downloading and using this vocoder
+2. No technical or legal usage restrictions are currently imposed on the listed parties, as the vocoder is still licensed under CC BY-NC-SA 4.0
+3. We reserve the right to impose further restrictions if malicious behavior continues
+
+## Registry of Hostile Conduct
+
+| Name | Identifiers | Reason |
+|:---:|:---|:---|
+| 旋转_turning_point | QQ: 2673587414;<br/>Bilibili UID: 285801087;<br/>Discord username: colstone233 | Long-term hostility and personal attacks against developers, repeatedly spreading false information about DiffSinger and the development team, and interfering with the development of the vocoder and other community projects |

README_en.md

Lines changed: 38 additions & 16 deletions
@@ -13,11 +13,11 @@
 The following configuration items are what you need to change during preprocessing
 ```yaml
 
-data_input_path: [] the path for your data
+data_input_path: [] # the path for your data
 
-data_out_path: [] the path for the preprocessed output
+data_out_path: [] # the path for the preprocessed output
 
-val_num: 1 the number of validation audio
+val_num: 10 # the number of validation audio
 ```
 An example
 ```yaml
@@ -30,12 +30,27 @@ val_num: 5 # This is the number of validation files you want.
 
 # (The paths in the two lists are one-to-one, so the number should be the same.)
 
-# (Then, the preprocessor scans all .wav files, including subfolders.)
+# (Then, the preprocessor scans all .wav and .flac files, including subfolders.)
 
 # (Normally, there are only these three to change.)
 ```
 
 ## Training
+Adjust the config according to your GPU memory
+(mini_nsf and pc_aug are enabled by default)
+
+For 24 GB memory (default)
+```yaml
+crop_mel_frames: 48
+batch_size: 10
+pc_aug_rate: 0.5
+```
+For 16 GB memory (needs manual editing)
+```yaml
+crop_mel_frames: 32
+batch_size: 10
+pc_aug_rate: 0.4
+```
 Run the following training script
 ```sh
 python train.py --config (your config path) --exp_name (your ckpt name) --work_dir (working directory, optional)
 ```
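The two memory presets above differ mainly in how much raw audio each training crop covers. Assuming the 512-sample hop and 44.1 kHz rate implied by the checkpoint naming (pc_nsf_hifigan_44.1k_hop512_...), the arithmetic works out as below; this helper is an illustration, not project code:

```python
def crop_stats(crop_mel_frames, batch_size, hop_size=512, sample_rate=44100):
    """Waveform samples and seconds covered by one training crop,
    plus the total samples per batch (a rough memory proxy)."""
    samples = crop_mel_frames * hop_size
    seconds = samples / sample_rate
    return samples, seconds, batch_size * samples

print(crop_stats(48, 10))  # 24 GB preset: 24576 samples, ~0.56 s per crop
print(crop_stats(32, 10))  # 16 GB preset: 16384 samples, ~0.37 s per crop
```

Halving crop_mel_frames thus halves the per-crop waveform length, which is why the 16 GB preset fits in smaller GPUs at the same batch size.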
@@ -54,18 +69,8 @@ Once you finish training, you can use this script to export the diffsinger vocoder
 python export_ckpt.py --ckpt_path (your ckpt path) --save_path (output ckpt path) --work_dir (working directory, optional)
 ```
 
-# Offline data augmentation
-Replace the preprocessing script with [process_aug.py](process_aug.py) and add configuration entries
-```yaml
-key_aug: false (Do not augment during training)
-aug_min: 0.9 (Minimum f0 adjustment multiplier)
-aug_max: 1.4 (Maximum f0 adjustment multiplier)
-aug_num: 1 (Data augmentation multiplier)
-```
-That's it. Note that data augmentation may damage the sound quality!
-
 # Online data augmentation (recommended)
-Note that to use online data augmentation, use the [process.py](process.py) script, otherwise offline and online augmentation will be superimposed
+Add the configuration entries
 ```yaml
 key_aug: true (Do augment during training)
 key_aug_prob: 0.5 (Data augmentation probability)
@@ -110,4 +115,21 @@ Almost 2k steps is enough for fine-tuning on a small dataset.
 
 [univnet.yaml](configs%2Funivnet.yaml) Training original univnet
 
-[lvc_base_ddspgan.yaml](configs%2Flvc_base_ddspgan.yaml) Training ddsp model with lvc filters
+[lvc_base_ddspgan.yaml](configs%2Flvc_base_ddspgan.yaml) Training ddsp model with lvc filters
+
+# Special Statements
+
+We regret to publish a verified Registry of Hostile Conduct (shown below). This registry documents individuals/entities who have engaged in long-term destructive activities against the development team.
+
+We solemnly declare:
+
+1. We strongly recommend that all users review this registry before downloading and using this vocoder
+2. No technical or legal restrictions are currently imposed on listed parties, as the vocoder is still licensed under CC BY-NC-SA 4.0
+3. We reserve the right to apply further restrictions in case of persistent malicious acts
+
+## Registry of Hostile Conduct
+
+| Name | Identifiers | Reason |
+|:---:|:---|:---|
+| 旋转_turning_point | QQ: 2673587414;<br/>Bilibili UID: 285801087;<br/>Discord username: colstone233 | Engaging in long-term hostile and personal attacks against developers, repeatedly spreading false information about DiffSinger and the development team, and interfering with the development process of the vocoder and other projects in the community |

configs/base_hifi.yaml

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@ f0_min: 65
 f0_max: 1100
 
 pc_aug: false # pc-nsf training method
-pc_aug_prob: 0.5
+pc_aug_rate: 0.5
 pc_aug_key: 5
 
 aug_min: 0.9

configs/ft_hifigan.yaml

Lines changed: 9 additions & 8 deletions

@@ -4,22 +4,22 @@ base_config:
 
 data_input_path: []
 data_out_path: []
-val_num: 1
+val_num: 10
 
 pe: 'parselmouth' # 'parselmouth' or 'harvest'
 f0_min: 65
 f0_max: 1100
 
-pc_aug: false # pc-nsf training method
-pc_aug_prob: 0.5
-pc_aug_key: 5
-
 aug_min: 0.9
 aug_max: 1.4
 aug_num: 1
 key_aug: false
 key_aug_prob: 0.5
 
+pc_aug: true # pc-nsf training method
+pc_aug_rate: 0.4
+pc_aug_key: 12
+
 use_stftloss: false
 loss_fft_sizes: [2048, 2048, 4096, 1024, 512, 256, 128, 1024, 2048, 512]
 loss_hop_sizes: [512, 240, 480, 100, 50, 25, 12, 120, 240, 50]

@@ -66,7 +66,8 @@ crop_mel_frames: 32
 
 #model_cls: training.nsf_HiFigan_task.nsf_HiFigan
 model_args:
-  mini_nsf: false
+  mini_nsf: true
+  noise_sigma: 0.0
   upsample_rates: [ 8, 8, 2, 2, 2 ]
   upsample_kernel_sizes: [ 16, 16, 4, 4, 4 ]
   upsample_initial_channel: 512

@@ -113,7 +114,7 @@ sampler_frame_count_grid: 6
 ds_workers: 4
 dataloader_prefetch_factor: 2
 
-batch_size: 6
+batch_size: 10

@@ -147,7 +148,7 @@ seed: 114514
 ###########
 
 finetune_enabled: true
-finetune_ckpt_path: nsf_hifigan_44.1k_hop512_128bin_2024.02.ckpt
+finetune_ckpt_path: pc_nsf_hifigan_44.1k_hop512_128bin_2025.02.ckpt
 finetune_ignored_params: []
 finetune_strict_shapes: true
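The config changes above rename pc_aug_prob to pc_aug_rate and add new keys (mini_nsf, noise_sigma), so configs written before this commit lack them. A hedged sketch of how such a config dictionary could be backfilled with the new defaults before use; the function and the exact defaults are illustrative, not part of the repo:

```python
def backfill_config(config):
    """Fill in keys that pre-rename configs may be missing (defaults
    here mirror this commit's ft_hifigan.yaml; illustrative only)."""
    config.setdefault('pc_aug', True)
    config.setdefault('pc_aug_rate', 0.4)  # renamed from pc_aug_prob
    model_args = config.setdefault('model_args', {})
    model_args.setdefault('mini_nsf', True)
    model_args.setdefault('noise_sigma', 0.0)
    return config

old = {'pc_aug_prob': 0.5, 'model_args': {'mini_nsf': False}}
new = backfill_config(old)
# existing values win: mini_nsf stays False, noise_sigma gets its default
```

Using setdefault means explicit values in an old config are never overwritten; only genuinely missing keys receive the new defaults.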

export_ckpt.py

Lines changed: 5 additions & 0 deletions

@@ -7,6 +7,7 @@
 from utils import get_latest_checkpoint_path
 from utils.config_utils import read_full_config, print_config
 
+
 @click.command(help='')
 @click.option('--exp_name', required=False, metavar='EXP', help='Name of the experiment')
 @click.option('--ckpt_path', required=False, metavar='FILE', help='Path to the checkpoint file')

@@ -32,6 +33,7 @@ def export(exp_name, ckpt_path, save_path, work_dir):
         if 'generator.' in i:
             # print(i)
             ckpt[i.replace('generator.', '')] = temp_dict[i]
+    pathlib.Path(save_path).parent.mkdir(parents=True, exist_ok=True)
     torch.save({'generator': ckpt}, save_path)
     print("Export checkpoint file successfully: ", save_path)

@@ -53,9 +55,12 @@ def export(exp_name, ckpt_path, save_path, work_dir):
     new_config['pc_aug'] = config['pc_aug']
     if 'mini_nsf' not in new_config.keys():
         new_config['mini_nsf'] = False
+    if 'noise_sigma' not in new_config.keys():
+        new_config['noise_sigma'] = 0.0
 
     json_file.write(json.dumps(new_config, indent=1))
     print("Export configuration file successfully: ", new_config_file)
 
+
 if __name__ == '__main__':
     export()
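The export step shown above keeps only the generator weights and strips the 'generator.' prefix that the training wrapper adds to state-dict keys. The same transformation on a plain dict, sketched after the loop in export_ckpt.py (the sample keys below are made up for illustration):

```python
def strip_generator_prefix(state_dict):
    """Drop non-generator entries and remove the 'generator.' key prefix,
    as export_ckpt.py does before saving the standalone vocoder."""
    return {
        key.replace('generator.', ''): value
        for key, value in state_dict.items()
        if 'generator.' in key
    }

full = {
    'generator.conv_pre.weight': 'w0',
    'discriminator.convs.0.weight': 'w1',  # dropped on export
}
print(strip_generator_prefix(full))  # {'conv_pre.weight': 'w0'}
```

Discriminator weights are only needed for training, so dropping them makes the exported vocoder checkpoint considerably smaller.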

models/HiFivae/models.py

Lines changed: 0 additions & 4 deletions

@@ -338,12 +338,10 @@ def forward(self, x):
         for l in self.convs:
             x = l(x)
             x = F.leaky_relu(x, LRELU_SLOPE, inplace=True)
-            x = torch.nan_to_num(x)
 
             fmap.append(x)
 
         x = self.conv_post(x)
-        x = torch.nan_to_num(x)
         fmap.append(x)
         x = torch.flatten(x, 1, -1)

@@ -394,11 +392,9 @@ def forward(self, x):
         for l in self.convs:
             x = l(x)
             x = F.leaky_relu(x, LRELU_SLOPE, inplace=True)
-            x = torch.nan_to_num(x)
             fmap.append(x)
 
         x = self.conv_post(x)
-        x = torch.nan_to_num(x)
         fmap.append(x)
         x = torch.flatten(x, 1, -1)

models/nsf_HiFigan/models.py

Lines changed: 4 additions & 5 deletions

@@ -202,7 +202,8 @@ def __init__(self, h):
         self.num_kernels = len(h.resblock_kernel_sizes)
         self.num_upsamples = len(h.upsample_rates)
         self.mini_nsf = h.mini_nsf
-
+        self.noise_sigma = h.noise_sigma
+
         if h.mini_nsf:
             self.source_sr = h.sampling_rate / int(np.prod(h.upsample_rates[2:]))
             self.upp = int(np.prod(h.upsample_rates[:2]))

@@ -260,6 +261,8 @@ def forward(self, x, f0):
         else:
             har_source = self.m_source(f0, self.upp).transpose(1, 2)
         x = self.conv_pre(x)
+        if self.noise_sigma is not None and self.noise_sigma > 0:
+            x += self.noise_sigma * torch.randn_like(x)
         for i in range(self.num_upsamples):
             x = F.leaky_relu(x, LRELU_SLOPE)
             x = self.ups[i](x)

@@ -354,12 +357,10 @@ def forward(self, x):
         for l in self.convs:
             x = l(x)
             x = F.leaky_relu(x, LRELU_SLOPE, inplace=True)
-            x = torch.nan_to_num(x)
 
             fmap.append(x)
 
         x = self.conv_post(x)
-        x = torch.nan_to_num(x)
         fmap.append(x)
         x = torch.flatten(x, 1, -1)

@@ -412,11 +413,9 @@ def forward(self, x):
         for l in self.convs:
             x = l(x)
             x = F.leaky_relu(x, LRELU_SLOPE, inplace=True)
-            x = torch.nan_to_num(x)
             fmap.append(x)
 
         x = self.conv_post(x)
-        x = torch.nan_to_num(x)
         fmap.append(x)
         x = torch.flatten(x, 1, -1)
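The generator change above injects optional Gaussian noise into the features right after conv_pre, guarded so that the default noise_sigma: 0.0 is a no-op. A dependency-free sketch of the same guard logic; the repo version operates on torch tensors via torch.randn_like, while the list-based form here is purely illustrative:

```python
import random

def inject_noise(features, noise_sigma):
    """Add zero-mean Gaussian noise scaled by noise_sigma, skipping the
    work entirely when noise_sigma is None or non-positive."""
    if noise_sigma is None or noise_sigma <= 0:
        return features  # matches the default noise_sigma: 0.0
    return [v + noise_sigma * random.gauss(0.0, 1.0) for v in features]
```

Guarding on both None and non-positive values keeps older exported configs (which may lack the key entirely or carry the 0.0 default) behaving exactly as before.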
