jeankassio
diff --git a/‎__init__.py‎
Lines changed: 4 additions & 2 deletions b/‎__init__.py‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎ace_step_ksampler.py‎
Lines changed: 223 additions & 7 deletions b/‎ace_step_ksampler.py‎
Lines changed: 223 additions & 7 deletions
diff --git a/‎ace_step_post_process.py‎
Lines changed: 93 additions & 0 deletions b/‎ace_step_post_process.py‎
Lines changed: 93 additions & 0 deletions
diff --git a/‎ace_step_prompt_gen.py‎
Lines changed: 75 additions & 4 deletions b/‎ace_step_prompt_gen.py‎
Lines changed: 75 additions & 4 deletions
diff --git a/‎ace_step_vocoder_adapter.py‎
Lines changed: 191 additions & 0 deletions b/‎ace_step_vocoder_adapter.py‎
Lines changed: 191 additions & 0 deletions
@@ -5,14 +5,16 @@
 from .ace_step_prompt_gen import NODE_CLASS_MAPPINGS as PROMPT_MAPPINGS, NODE_DISPLAY_NAMES as PROMPT_NAMES
 from .lyrics_nodes import NODE_CLASS_MAPPINGS as LYRICS_MAPPINGS, NODE_DISPLAY_NAMES as LYRICS_NAMES
 from .ace_step_save_text import NODE_CLASS_MAPPINGS as SAVETEXT_MAPPINGS, NODE_DISPLAY_NAMES as SAVETEXT_NAMES
+from .ace_step_post_process import NODE_CLASS_MAPPINGS as POSTPROCESS_MAPPINGS, NODE_DISPLAY_NAMES as POSTPROCESS_NAMES
+from .ace_step_vocoder_adapter import NODE_CLASS_MAPPINGS as VOCODER_ADAPTER_MAPPINGS, NODE_DISPLAY_NAMES as VOCODER_ADAPTER_NAMES
 # DISABLED: optimization_nodes removed (torch.compile incompatibility)
 # from .optimization_nodes import NODE_CLASS_MAPPINGS as OPT_MAPPINGS, NODE_DISPLAY_NAMES as OPT_NAMES
 # DISABLED: torch_compile_node causes incompatibility with ACE-Step
 # from .torch_compile_node import NODE_CLASS_MAPPINGS as COMPILE_MAPPINGS, NODE_DISPLAY_NAMES as COMPILE_NAMES
 
 # Combine all node mappings
-NODE_CLASS_MAPPINGS = {**KSAMPLER_MAPPINGS, **PROMPT_MAPPINGS, **LYRICS_MAPPINGS, **SAVETEXT_MAPPINGS}
-NODE_DISPLAY_NAMES = {**KSAMPLER_NAMES, **PROMPT_NAMES, **LYRICS_NAMES, **SAVETEXT_NAMES}
+# Merged left to right: if two modules register the same key, the later one wins.
+NODE_CLASS_MAPPINGS = {
+    **KSAMPLER_MAPPINGS,
+    **PROMPT_MAPPINGS,
+    **LYRICS_MAPPINGS,
+    **SAVETEXT_MAPPINGS,
+    **POSTPROCESS_MAPPINGS,
+    **VOCODER_ADAPTER_MAPPINGS,
+}
+# Display names are merged in the same module order as the class mappings.
+NODE_DISPLAY_NAMES = {
+    **KSAMPLER_NAMES,
+    **PROMPT_NAMES,
+    **LYRICS_NAMES,
+    **SAVETEXT_NAMES,
+    **POSTPROCESS_NAMES,
+    **VOCODER_ADAPTER_NAMES,
+}
 
 # Register custom samplers with ComfyUI
 def add_samplers():
 
@@ -0,0 +1,93 @@
+import torch
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class AceStepPostProcess:
+    """Post-process node that tames harsh sibilance and can blend in a breath track.
+
+    The waveform is taken to the STFT domain, every bin above
+    ``DE_ESS_CUTOFF_HZ`` is attenuated, the magnitude spectrum is optionally
+    smoothed along the frequency axis, and the signal is resynthesised with the
+    original phase. The result is peak-normalised before it is returned.
+    """
+
+    # STFT analysis/synthesis settings shared by every call.
+    N_FFT = 2048
+    HOP_LENGTH = 512
+    # Bins above this frequency (Hz) are attenuated by the de-esser.
+    DE_ESS_CUTOFF_HZ = 6000
+    # Used when the input carries no "sample_rate" entry.
+    DEFAULT_SAMPLE_RATE = 44100
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        """Describe the node sockets and widgets."""
+        return {
+            "required": {
+                "audio": ("AUDIO",),
+            },
+            "optional": {
+                "de_esser_strength": ("FLOAT", {"default": 0.12, "min": 0.0, "max": 0.6, "step": 0.01}),
+                "spectral_smoothing": ("FLOAT", {"default": 0.08, "min": 0.0, "max": 0.5, "step": 0.01}),
+                "breath_mix": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 0.2, "step": 0.01}),
+                "breath_audio": ("AUDIO",),
+            },
+        }
+
+    RETURN_TYPES = ("AUDIO",)
+    RETURN_NAMES = ("audio",)
+    FUNCTION = "process"
+    CATEGORY = "JK AceStep Nodes/PostProcess"
+
+    def _filter_spectrum(self, signals, sample_rate, de_esser_strength, spectral_smoothing):
+        """De-ess and smooth a batch of mono signals shaped ``[N, T]``; returns ``[N, T]``."""
+        n_fft = self.N_FFT
+        hop_length = self.HOP_LENGTH
+        window = torch.hann_window(n_fft, dtype=signals.dtype, device=signals.device)
+        spec = torch.stft(signals, n_fft=n_fft, hop_length=hop_length, win_length=n_fft, window=window, return_complex=True)
+        mag = torch.abs(spec)  # [N, F, frames]
+        phase = torch.angle(spec)
+
+        # The mask must line up with the frequency axis (dim 1), hence view(1, F, 1).
+        freqs = torch.fft.rfftfreq(n_fft, 1.0 / sample_rate, device=signals.device)
+        sibilant = (freqs > self.DE_ESS_CUTOFF_HZ).to(mag.dtype).view(1, -1, 1)
+        mag = mag * (1.0 - de_esser_strength * sibilant)
+
+        if spectral_smoothing > 0.0:
+            # 3-tap [0.25, 0.5, 0.25] blur along frequency. Reflect padding works
+            # on the last dim, so frequency is moved there for the duration.
+            padded = torch.nn.functional.pad(mag.transpose(1, 2), (1, 1), mode='reflect')
+            blurred = 0.25 * padded[..., :-2] + 0.5 * padded[..., 1:-1] + 0.25 * padded[..., 2:]
+            mag = (1.0 - spectral_smoothing) * mag + spectral_smoothing * blurred.transpose(1, 2)
+
+        return torch.istft(torch.polar(mag, phase), n_fft=n_fft, hop_length=hop_length, win_length=n_fft, window=window, length=signals.shape[-1])
+
+    def process(self, audio, de_esser_strength=0.12, spectral_smoothing=0.08, breath_mix=0.0, breath_audio=None):
+        """Run the de-esser, optional smoothing and optional breath overlay.
+
+        Args:
+            audio: dict with a "waveform" tensor (and optionally "sample_rate"),
+                or a bare waveform tensor. 1-D, 2-D and 3-D tensors are accepted
+                and processed as ``[B, C, T]``.
+            de_esser_strength: fraction removed from bins above the cutoff.
+            spectral_smoothing: blend factor between raw and blurred magnitude.
+            breath_mix: gain applied to ``breath_audio`` before it is added to
+                the first channel.
+            breath_audio: optional dict with a "waveform" tensor.
+
+        Returns:
+            A one-element tuple holding a new audio dict; the input dict is not
+            modified. If processing fails or the input is not a tensor, the
+            original ``audio`` object is returned unchanged (best effort).
+        """
+        try:
+            is_dict = isinstance(audio, dict)
+            waveform = audio["waveform"] if is_dict and "waveform" in audio else audio
+            if not isinstance(waveform, torch.Tensor):
+                logger.warning("Input audio is not a torch.Tensor, skipping post-processing.")
+                return (audio,)
+
+            sample_rate = audio.get('sample_rate', self.DEFAULT_SAMPLE_RATE) if is_dict else self.DEFAULT_SAMPLE_RATE
+
+            x = waveform
+            if x.dim() == 1:
+                x = x.reshape(1, 1, -1)
+            elif x.dim() == 2:
+                x = x.unsqueeze(1)
+            batch, channels, length = x.shape
+
+            # All channels go through one batched STFT instead of a Python loop.
+            flat = x.reshape(batch * channels, length)
+            out = self._filter_spectrum(flat, sample_rate, de_esser_strength, spectral_smoothing)
+            out = out.reshape(batch, channels, length)
+
+            # Add breath overlay if provided (first channel only).
+            if breath_mix > 0.0 and isinstance(breath_audio, dict):
+                breath_wave = breath_audio.get('waveform', None)
+                if isinstance(breath_wave, torch.Tensor):
+                    if breath_wave.dim() == 2:
+                        breath_wave = breath_wave.unsqueeze(1)
+                    overlap = min(out.shape[2], breath_wave.shape[-1])
+                    breath = breath_wave[:, 0, :overlap].to(device=out.device, dtype=out.dtype)
+                    out[:, 0, :overlap] += breath_mix * breath
+
+            # Peak-normalise; the clamp avoids dividing by ~0 on silent input.
+            out = out / (out.abs().max().clamp(min=1e-5))
+
+            if is_dict:
+                # Copy so that upstream nodes holding the same dict are unaffected.
+                result = dict(audio)
+                result["waveform"] = out
+                return (result,)
+            return ({"waveform": out, "sample_rate": sample_rate},)
+        except Exception as e:
+            # Best effort: never break the graph, hand back the untouched input.
+            logger.error(f"Post processing failed: {e}", exc_info=True)
+            return (audio,)
+
+
+# Node id -> implementing class for the post-process node defined in this module.
+NODE_CLASS_MAPPINGS = {"AceStepPostProcess": AceStepPostProcess}
+
+# Node id -> human-readable label.
+NODE_DISPLAY_NAMES = {"AceStepPostProcess": "Ace-Step Post Process"}
@@ -257,19 +257,90 @@ def INPUT_TYPES(cls):
                     },
                 ),
             },
+            "optional": {
+                "voice_style": (
+                    [
+                        "none",
+                        "natural_female",
+                        "breathy_female",
+                        "powerful_female",
+                        "ethereal_female",
+                        "soulful_female",
+                        "deep_female",
+                        "natural_male",
+                        "breathy_male",
+                        "powerful_male",
+                        "deep_male",
+                        "soulful_male",
+                        "tenor_male",
+                        "baritone_male",
+                        "reference_singer",
+                        "androgynous",
+                        "vocal_blend",
+                        "robotic_vocal"
+                    ],
+                    {
+                        "default": "none",
+                        "tooltip": "Optional voice style hints that are appended to the prompt to improve vocal realism. Female (6 options), Male (6 options), Blended (2 options), Robotic (1), None (auto)."
+                    }
+                ),
+            },
         }
 
     RETURN_TYPES = ("STRING", "STRING")
     RETURN_NAMES = ("prompt", "template")
     FUNCTION = "generate"
     CATEGORY = "JK AceStep Nodes/Prompt"
 
-    def generate(self, style: str, extra: str = ""):
+    def generate(self, style: str, extra: str = "", voice_style: str = "none"):
+        """Build the final prompt for a music style.
+
+        Args:
+            style: key looked up in ``STYLE_PROMPTS``; unknown keys give an empty template.
+            extra: free-form text appended after the template when non-blank.
+            voice_style: optional vocal preset; "none" or an unknown value adds nothing.
+
+        Returns:
+            ``(final_prompt, template)`` where ``final_prompt`` joins the non-empty
+            parts (template, extra, voice hint) with ". ".
+        """
         template = STYLE_PROMPTS.get(style, "")
+        # One entry per selectable voice_style; "baritone_male" is listed in the
+        # node's choices, so it needs a hint too instead of being silently ignored.
+        voice_hints = {
+            # FEMALE VOCALS
+            "natural_female": "natural female voice with micro pitch variation, soft breath, realistic vibrato and avoid robotic quantization",
+            "breathy_female": "breathy female voice, intimate mic proximity, audible breaths, warm vowel resonances, minimal autotune",
+            "powerful_female": "powerful female lead vocal, energetic performance, controlled vibrato, clear consonant articulation, live vocal tone",
+            "ethereal_female": "ethereal female voice, airy and light, floating above the beat, delicate phrasing, spacious reverb, dreamy quality",
+            "soulful_female": "soulful female voice, rich emotional depth, warm tone, blues influences, expressive phrasing, powerful presence",
+            "deep_female": "deep female mezzo-soprano voice, lower register, sultry tone, sophisticated delivery, jazz influences",
+            # MALE VOCALS
+            "natural_male": "natural male voice with micro pitch variation and natural prosody, warm tone, realistic breath",
+            "breathy_male": "breathy male voice, intimate vocal delivery, audible breath texture, vulnerable performance, close-mic warmth",
+            "powerful_male": "powerful male lead vocal, strong projection, controlled vibrato, clear articulation, commanding presence",
+            "deep_male": "deep male baritone-bass voice, rich resonance, lower register dominance, warm sonority, authoritative tone",
+            "soulful_male": "soulful male voice, emotional depth, R&B influences, smooth phrasing, expressive delivery, warm presence",
+            "tenor_male": "bright tenor male voice, soaring high notes, clear articulation, pop sensibility, energetic performance",
+            "baritone_male": "warm baritone male voice, rich mid-low register, smooth resonance, controlled vibrato, relaxed confident delivery",
+            # BLENDED VOCALS
+            "reference_singer": "use a reference lead singer performance: natural, human vocal delivery, no robotic artifacts",
+            "androgynous": "androgynous voice quality, neutral gender presentation, balanced tone between male and female characteristics",
+            "vocal_blend": "layered vocal blend with multiple voices, rich harmonic texture, complementary vocal ranges, ensemble quality",
+            # ROBOTIC VOCAL
+            "robotic_vocal": "robotic vocal style, vocoder or autotune effect, synthetic timbre, precise pitch, electronic articulation, minimal human expressiveness, classic EDM/Daft Punk/house vocal texture",
+        }
+        voice_hint = voice_hints.get(voice_style, "")
+
+        # Only non-empty parts are joined, so no stray ". " separators appear.
+        parts = [template] if template else []
         if extra.strip():
-            final_prompt = f"{template}. {extra.strip()}"
-        else:
-            final_prompt = template
+            parts.append(extra.strip())
+        if voice_hint:
+            parts.append(voice_hint)
+        final_prompt = ". ".join(parts)
         return (final_prompt, template)
 
 
 
@@ -0,0 +1,191 @@
+"""Vocoder Adapter Node
+
+This node is a minimal adapter to help feed Ace-Step latents or decoded audio into a vocoder.
+It tries to detect the expected input type of the provided vocoder object and calls the right API.
+Supported flows:
+- Latent -> VAE -> waveform -> mel -> vocoder
+- Latent -> mel (if latent appears to be mel) -> vocoder
+- Clean waveform -> vocoder (if vocoder expects waveform for final polish)
+
+Notes:
+- Vocoder objects must be Python objects exposed to ComfyUI nodes (i.e., selected via a model node).
+- Mel spectrograms are computed with `torchaudio` when it is available; otherwise a naive STFT-magnitude interpolation is used as a fallback.
+"""
+import logging
+import torch
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
+class AceStepVocoderAdapter:
+    """Adapter node that derives a log-mel spectrogram from a latent and feeds it to a vocoder.
+
+    The latent is either treated as a mel spectrogram directly (shape heuristic)
+    or decoded to audio with the VAE and converted to log-mel. The vocoder is
+    then invoked through the first of ``infer``, ``synthesize``, ``decode`` or
+    ``__call__`` that succeeds.
+    """
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        """Describe the node sockets and widgets."""
+        return {
+            "required": {
+                "vocoder": ("MODEL",),
+                "vae": ("VAE",),
+                "latent": ("LATENT",),
+            },
+            "optional": {
+                "sample_rate": ("INT", {"default": 44100}),
+                "n_mels": ("INT", {"default": 128}),
+                "n_fft": ("INT", {"default": 2048}),
+                "hop_length": ("INT", {"default": 512}),
+            }
+        }
+
+    RETURN_TYPES = ("AUDIO",)
+    RETURN_NAMES = ("audio",)
+    FUNCTION = "adapt"
+    CATEGORY = "JK AceStep Nodes/Vocoder"
+
+    def _to_mel_torch(self, waveform, sr=44100, n_fft=2048, hop=512, n_mels=128):
+        """Return a log-mel spectrogram shaped ``[B, n_mels, frames]``.
+
+        ``waveform`` may be ``[B, C, T]`` (first channel is used), ``[B, T]``
+        (a batch of mono signals) or ``[T]``. torchaudio is used when it is
+        importable and works; otherwise STFT magnitudes are linearly resampled
+        along frequency to ``n_mels`` bins, which is only a rough stand-in for a
+        real mel filterbank.
+        """
+        import torch.nn.functional as F
+        if waveform.dim() == 3:
+            wav = waveform[:, 0]
+        elif waveform.dim() == 2:
+            # Already [B, T]; indexing [:, 0] here would keep one sample per row.
+            wav = waveform
+        else:
+            wav = waveform.unsqueeze(0)
+        try:
+            import torchaudio
+            transform = torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_fft=n_fft, hop_length=hop, n_mels=n_mels)
+            # The transform owns window/filterbank buffers that must live with the input.
+            mel_spec = transform.to(wav.device)(wav)
+            return torch.log(torch.clamp(mel_spec, min=1e-9))
+        except Exception as e:
+            logger.warning(f"torchaudio mel transform unavailable, using STFT fallback: {e}")
+        # Lowest-effort fallback: compute STFT magnitude and map bins
+        window = torch.hann_window(n_fft, dtype=wav.dtype, device=wav.device)
+        stft = torch.stft(wav, n_fft=n_fft, hop_length=hop, window=window, return_complex=True)
+        mag = torch.abs(stft)  # [B, F, frames]
+        # 'linear' interpolation acts on the last dim of a 3-D tensor, so put
+        # frequency last, resize it to n_mels, then restore [B, n_mels, frames].
+        mel = F.interpolate(mag.transpose(1, 2), size=n_mels, mode='linear', align_corners=False).transpose(1, 2)
+        return torch.log(torch.clamp(mel, min=1e-9))
+
+    @staticmethod
+    def _mel_from_latent(samples, n_mels):
+        """Reinterpret latent ``samples`` as a mel spectrogram ``[B, n_mels, T]``."""
+        if samples.dim() == 4:
+            # [B, C, T, F] -> collapse C by mean -> [B, T, F], then permute to [B, F, T]
+            mel = samples.mean(dim=1).permute(0, 2, 1)
+            if mel.shape[1] != n_mels:
+                mel = torch.nn.functional.interpolate(mel.unsqueeze(1), size=(n_mels, mel.shape[2]), mode='bilinear', align_corners=False).squeeze(1)
+            return mel
+        if samples.dim() == 3 and samples.shape[1] != n_mels:
+            return torch.nn.functional.interpolate(samples.unsqueeze(1), size=(n_mels, samples.shape[2]), mode='bilinear', align_corners=False).squeeze(1)
+        return samples
+
+    @staticmethod
+    def _as_audio(waveform, sample_rate):
+        """Wrap vocoder output in an audio dict, expanding tensors to ``[B, C, T]``."""
+        if isinstance(waveform, torch.Tensor):
+            if waveform.dim() == 1:
+                waveform = waveform.reshape(1, 1, -1)
+            elif waveform.dim() == 2:
+                waveform = waveform.unsqueeze(1)
+        return {'waveform': waveform, 'sample_rate': sample_rate}
+
+    def adapt(self, vocoder, vae, latent, sample_rate=44100, n_mels=None, n_fft=2048, hop_length=512):
+        """Convert ``latent`` to a log-mel spectrogram and run ``vocoder`` on it.
+
+        Args:
+            vocoder: object exposing ``infer``/``synthesize``/``decode`` or callable.
+            vae: object whose ``decode(samples)`` returns audio with channels last.
+            latent: dict with a "samples" tensor.
+            sample_rate: rate of the decoded audio; also reported in the result
+                (replaced by the vocoder's rate after a successful resample).
+            n_mels: mel bin count; when None it is read from the vocoder, its
+                ``config``, or defaults to 128.
+            n_fft, hop_length: STFT settings, overridden by same-named vocoder attributes.
+
+        Returns:
+            One-element tuple with ``{'waveform', 'sample_rate'}``.
+
+        Raises:
+            RuntimeError: if no mel can be derived or every vocoder call fails.
+        """
+        # Try to introspect vocoder for n_mels/hop_length/etc
+        if n_mels is None:
+            n_mels = getattr(vocoder, 'n_mels', None)
+        if n_mels is None and hasattr(vocoder, 'config') and getattr(vocoder.config, 'n_mels', None) is not None:
+            n_mels = vocoder.config.n_mels
+        if n_mels is None:
+            n_mels = 128
+        n_fft = getattr(vocoder, 'n_fft', n_fft)
+        hop = getattr(vocoder, 'hop_length', hop_length)
+
+        samples = latent['samples'] if isinstance(latent, dict) and 'samples' in latent else None
+
+        # Heuristic: a small trailing dim (4-D) or a channel dim equal to n_mels
+        # (3-D) is taken to mean the latent already is a mel spectrogram.
+        # NOTE(review): short 4-D audio latents can also match - confirm intent.
+        latent_is_mel = False
+        if samples is not None:
+            if samples.dim() == 4 and samples.shape[-1] <= max(128, n_mels):
+                latent_is_mel = True
+            elif samples.dim() == 3 and samples.shape[1] == n_mels:
+                latent_is_mel = True
+
+        mel = None
+        # Tracks whether `mel` is known to be log-scaled so it is never logged twice.
+        mel_is_log = False
+        if latent_is_mel:
+            mel = self._mel_from_latent(samples, n_mels)
+        else:
+            # Decode only when the waveform is actually needed.
+            audio_wave = None
+            if samples is not None:
+                try:
+                    audio_wave = vae.decode(samples).movedim(-1, 1)
+                except Exception as e:
+                    logger.warning(f"VAE decode failed: {e}")
+                    audio_wave = None
+            if audio_wave is not None:
+                # if needed, resample audio to vocoder sampling rate
+                vocoder_sr = getattr(vocoder, 'sampling_rate', getattr(vocoder, 'sample_rate', sample_rate))
+                if hasattr(audio_wave, 'shape') and int(vocoder_sr) != int(sample_rate):
+                    try:
+                        import torchaudio
+                        resampler = torchaudio.transforms.Resample(orig_freq=int(sample_rate), new_freq=int(vocoder_sr))
+                        audio_wave = resampler.to(audio_wave.device)(audio_wave)
+                        sample_rate = int(vocoder_sr)
+                    except Exception:
+                        # fallback: log warning and continue
+                        logger.warning('Resample failed: torchaudio not available or error during resample')
+                mel = self._to_mel_torch(audio_wave, sr=sample_rate, n_fft=n_fft, hop=hop, n_mels=n_mels)
+                mel_is_log = True
+            else:
+                # last resort: try using the latent values collapsed
+                try:
+                    s = latent['samples']
+                    mel = s.mean(dim=1) if s.dim() >= 4 else s
+                except Exception as e:
+                    logger.error(f"Failed to derive mel: {e}")
+                    raise RuntimeError("Unable to derive mel from latent") from e
+
+        if mel is None:
+            raise RuntimeError('Failed to produce mel for vocoder input')
+
+        # Many vocoders accept (batch, n_mels, T) or (n_mels, T)
+        batched = mel.unsqueeze(0) if mel.dim() == 2 else mel
+
+        # A latent-derived mel with no negative values is assumed to be linear
+        # and is log-scaled; mels produced by _to_mel_torch are already log.
+        if not mel_is_log and isinstance(batched, torch.Tensor) and batched.numel() > 0:
+            if batched.min() >= 0:
+                batched = torch.log(torch.clamp(batched, min=1e-9))
+
+        # Try common method names in order of preference.
+        for method_name in ('infer', 'synthesize', 'decode'):
+            if not hasattr(vocoder, method_name):
+                continue
+            try:
+                out = getattr(vocoder, method_name)(batched)
+            except Exception as e:
+                logger.warning(f"vocoder.{method_name}() failed: {e}")
+                continue
+            return (self._as_audio(out, sample_rate),)
+
+        # If vocoder is a function, call it directly
+        if callable(vocoder):
+            try:
+                out = vocoder(batched)
+            except Exception as e:
+                logger.warning(f"vocoder callable failed: {e}")
+            else:
+                return (self._as_audio(out, sample_rate),)
+
+        logger.error('No known vocoder API found. Please use a vocoder object exposing infer/synthesize/decode or pass a callable.')
+        raise RuntimeError('Unsupported vocoder object')
+
+
+# Node id -> implementing class for the vocoder adapter defined in this module.
+NODE_CLASS_MAPPINGS = {'AceStepVocoderAdapter': AceStepVocoderAdapter}
+
+# Node id -> human-readable label.
+NODE_DISPLAY_NAMES = {'AceStepVocoderAdapter': 'Ace-Step Vocoder Adapter'}