Skip to content

Commit ebc6e3f

Browse files
committed
fix: respect hop_size settings in RefineGAN downsample/upsample rates
1 parent c700be4 commit ebc6e3f

File tree

2 files changed

+39
-10
lines changed

2 files changed

+39
-10
lines changed

modules/refinegan/generator.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -369,11 +369,35 @@ def __init__(
369369

370370
self.sampling_rate = sampling_rate
371371
self.hop_length = hop_length
372-
self.downsample_rates = downsample_rates
373-
self.upsample_rates = upsample_rates
374372
self.leaky_relu_slope = leaky_relu_slope
375373

376-
assert np.prod(downsample_rates) == np.prod(upsample_rates) == hop_length
374+
def _scale_last(rates, factor):
375+
rates = list(rates)
376+
rates[-1] = rates[-1] * factor
377+
return tuple(rates)
378+
379+
total_down = np.prod(downsample_rates)
380+
total_up = np.prod(upsample_rates)
381+
if total_down != hop_length:
382+
if hop_length % total_down != 0:
383+
raise ValueError(
384+
f"RefineGAN: hop_length {hop_length} not divisible by prod(downsample_rates) {total_down}"
385+
)
386+
scale = hop_length // total_down
387+
downsample_rates = _scale_last(downsample_rates, scale)
388+
print(f"| adjust RefineGAN downsample_rates -> {downsample_rates} to match hop_length {hop_length}")
389+
if total_up != hop_length:
390+
if hop_length % total_up != 0:
391+
raise ValueError(
392+
f"RefineGAN: hop_length {hop_length} not divisible by prod(upsample_rates) {total_up}"
393+
)
394+
scale = hop_length // total_up
395+
upsample_rates = _scale_last(upsample_rates, scale)
396+
print(f"| adjust RefineGAN upsample_rates -> {upsample_rates} to match hop_length {hop_length}")
397+
398+
# Keep the possibly-adjusted rates
399+
self.downsample_rates = tuple(downsample_rates)
400+
self.upsample_rates = tuple(upsample_rates)
377401

378402
self.template_type = template_generator
379403
if template_generator == "comb":
@@ -524,4 +548,3 @@ def forward(self, mel: torch.Tensor, f0: torch.Tensor) -> torch.Tensor:
524548
x = torch.tanh(x)
525549

526550
return x
527-

modules/vocoders/refinegan.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,13 @@ def _select_config_file(path: pathlib.Path) -> Optional[pathlib.Path]:
5555

5656
def _extract_gen_kwargs(cfg: Dict[str, Any]) -> Dict[str, Any]:
5757
model_args = cfg.get("model_args") or cfg.get("generator") or {}
58+
def pick(key: str, default: Any):
59+
if key in model_args:
60+
return model_args[key]
61+
if key in cfg:
62+
return cfg[key]
63+
return default
64+
5865
return {
5966
"sampling_rate": cfg.get("audio_sample_rate")
6067
or cfg.get("sampling_rate")
@@ -68,11 +75,11 @@ def _extract_gen_kwargs(cfg: Dict[str, Any]) -> Dict[str, Any]:
6875
or cfg.get("hop_length")
6976
or model_args.get("hop_length")
7077
or hparams["hop_size"],
71-
"downsample_rates": tuple(model_args.get("downsample_rates", (2, 2, 8, 8))),
72-
"upsample_rates": tuple(model_args.get("upsample_rates", (8, 8, 2, 2))),
73-
"leaky_relu_slope": float(model_args.get("leaky_relu_slope", 0.2)),
74-
"start_channels": int(model_args.get("start_channels", 16)),
75-
"template_generator": model_args.get("template_generator", "comb"),
78+
"downsample_rates": tuple(pick("downsample_rates", (2, 2, 8, 8))),
79+
"upsample_rates": tuple(pick("upsample_rates", (8, 8, 2, 2))),
80+
"leaky_relu_slope": float(pick("leaky_relu_slope", 0.2)),
81+
"start_channels": int(pick("start_channels", 16)),
82+
"template_generator": pick("template_generator", "comb"),
7683
}
7784

7885

@@ -216,4 +223,3 @@ def spec2wav(self, mel, **kwargs):
216223
with torch.no_grad():
217224
wav = self.spec2wav_torch(mel_np, f0=f0_t)
218225
return wav.cpu().numpy()
219-

0 commit comments

Comments (0)