Skip to content

Commit 1f7dea1

Browse files
authored
Merge pull request #561 from kylehowells/main
Add native MLX DeepFilterNet speech enhancement (v1/v2/v3)
2 parents 3c874c6 + 6db73c5 commit 1f7dea1

File tree

18 files changed

+3482
-2
lines changed

18 files changed

+3482
-2
lines changed

CONTRIBUTIONS.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,10 @@ This file acknowledges the original authors and contributors of models ported to
88
- **Copyright**: Speech Lab, Alibaba Group
99
- **License**: Apache License 2.0
1010
- **MLX Port**: Dmitry Starkov ([@starkdmi](https://github.com/starkdmi))
11+
12+
## DeepFilterNet (Speech Enhancement)
13+
14+
- **Original**: [Rikorose/DeepFilterNet](https://github.com/Rikorose/DeepFilterNet)
15+
- **Copyright**: Hendrik Schröter and contributors
16+
- **License**: MIT / Apache-2.0
17+
- **MLX Port**: Kyle Howells ([@kylehowells](https://github.com/kylehowells))

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,9 @@ See the [Sortformer README](mlx_audio/vad/models/sortformer/README.md) for API d
119119
| Model | Description | Use Case | Repo |
120120
|-------|-------------|----------|------|
121121
| **SAM-Audio** | Text-guided source separation | Extract specific sounds | [mlx-community/sam-audio-large](https://huggingface.co/mlx-community/sam-audio-large) |
122-
| **Liquid2.5-Audio*** | Speech-to-Speech, Text-to-Speech and Speech-to-Text | Speech interactions | [mlx-community/LFM2.5-Audio-1.5B-8bit](https://huggingface.co/mlx-community/LFM2.5-Audio-1.5B-8bit)
122+
| **Liquid2.5-Audio*** | Speech-to-Speech, Text-to-Speech and Speech-to-Text | Speech interactions | [mlx-community/LFM2.5-Audio-1.5B-8bit](https://huggingface.co/mlx-community/LFM2.5-Audio-1.5B-8bit) |
123123
| **MossFormer2 SE** | Speech enhancement | Noise removal | [starkdmi/MossFormer2_SE_48K_MLX](https://huggingface.co/starkdmi/MossFormer2_SE_48K_MLX) |
124+
| **DeepFilterNet (1/2/3)** | Speech enhancement | Noise suppression | [mlx-community/DeepFilterNet-mlx](https://huggingface.co/mlx-community/DeepFilterNet-mlx) |
124125

125126
## Model Examples
126127

938 KB
Binary file not shown.
938 KB
Binary file not shown.

mlx_audio/sts/__init__.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
from .models.deepfilternet import (
2+
DeepFilterNet2Config,
3+
DeepFilterNet3Config,
4+
DeepFilterNetConfig,
5+
DeepFilterNetModel,
6+
DeepFilterNetStreamer,
7+
DeepFilterNetStreamingConfig,
8+
)
19
from .models.mossformer2_se import (
210
MossFormer2SE,
311
MossFormer2SEConfig,
@@ -11,7 +19,11 @@
1119
SeparationResult,
1220
save_audio,
1321
)
14-
from .voice_pipeline import VoicePipeline
22+
23+
try:
24+
from .voice_pipeline import VoicePipeline
25+
except ImportError:
26+
VoicePipeline = None
1527

1628
__all__ = [
1729
"SAMAudio",
@@ -21,6 +33,13 @@
2133
"save_audio",
2234
"SAMAudioConfig",
2335
"VoicePipeline",
36+
# DeepFilterNet
37+
"DeepFilterNetModel",
38+
"DeepFilterNetConfig",
39+
"DeepFilterNet2Config",
40+
"DeepFilterNet3Config",
41+
"DeepFilterNetStreamer",
42+
"DeepFilterNetStreamingConfig",
2443
# MossFormer2 SE
2544
"MossFormer2SE",
2645
"MossFormer2SEConfig",

mlx_audio/sts/generate.py

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
"""Generate enhanced audio using speech-to-speech models.
2+
3+
Usage:
4+
python -m mlx_audio.sts.generate --model mlx-community/DeepFilterNet-mlx --audio noisy.wav
5+
python -m mlx_audio.sts.generate --model mlx-community/DeepFilterNet-mlx --audio noisy.wav --version 2
6+
python -m mlx_audio.sts.generate --model mlx-community/DeepFilterNet-mlx --audio noisy.wav --stream
7+
python -m mlx_audio.sts.generate --model starkdmi/MossFormer2_SE_48K_MLX --audio noisy.wav
8+
"""
9+
10+
from __future__ import annotations
11+
12+
import argparse
13+
import time
14+
from pathlib import Path
15+
16+
# Repo ID substrings to model type mapping
17+
REPO_HINTS = {
18+
"deepfilter": "deepfilternet",
19+
"mossformer": "mossformer2",
20+
}
21+
22+
23+
def _detect_model_type(model_name: str) -> str:
24+
"""Detect model type from repo ID or path name."""
25+
lower = model_name.lower()
26+
for hint, model_type in REPO_HINTS.items():
27+
if hint in lower:
28+
return model_type
29+
raise ValueError(
30+
f"Cannot detect model type from '{model_name}'. "
31+
f"Supported models: {', '.join(REPO_HINTS.keys())}"
32+
)
33+
34+
35+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the enhancement CLI.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, matching the previous zero-argument
            behavior; passing a list allows programmatic use and testing.

    Returns:
        Namespace with ``model``, ``audio``, ``output_path``, ``verbose``,
        ``version``, ``subfolder`` and ``stream``.

    Raises:
        SystemExit: Raised by argparse on invalid or missing arguments.
    """
    parser = argparse.ArgumentParser(
        description="Enhance audio using speech-to-speech models"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="mlx-community/DeepFilterNet-mlx",
        help="HuggingFace repo ID or local path to the model",
    )
    parser.add_argument(
        "--audio",
        type=str,
        required=True,
        help="Path to the input audio file",
    )
    parser.add_argument(
        "--output-path",
        type=str,
        default=None,
        # The default path is built with Path.with_stem, so the input's
        # extension is kept rather than forced to .wav.
        help=(
            "Output audio file path "
            "(default: <input stem>_enhanced<input extension>)"
        ),
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Print detailed processing information",
    )

    # DeepFilterNet-specific options
    dfn = parser.add_argument_group("DeepFilterNet options")
    dfn.add_argument(
        "--version",
        type=int,
        default=None,
        choices=[1, 2, 3],
        help="DeepFilterNet version (1, 2, or 3). Default: 3",
    )
    dfn.add_argument(
        "--subfolder",
        type=str,
        default=None,
        help=(
            "Subfolder within the model repo (e.g. v1, v2, v3); "
            "ignored when --version is given"
        ),
    )
    dfn.add_argument(
        "--stream",
        action="store_true",
        help="Use streaming enhancement mode (DeepFilterNet v2/v3 only)",
    )

    return parser.parse_args(argv)
85+
86+
87+
def main():
    """Run the enhancement CLI: load a model, enhance one file, report the result.

    Reads options via ``parse_args()``, derives the model family from
    ``--model``, runs the matching enhancement path and prints where the
    output was saved.

    Raises:
        FileNotFoundError: If the ``--audio`` path does not exist.
        ValueError: If the model family cannot be detected from ``--model``
            (raised by ``_detect_model_type``) or has no branch here.
    """
    args = parse_args()

    in_path = Path(args.audio).expanduser().resolve()
    if not in_path.exists():
        raise FileNotFoundError(f"Input audio file not found: {in_path}")

    if args.output_path:
        out_path = Path(args.output_path).expanduser().resolve()
    else:
        # Same directory and extension as the input; only the stem changes.
        out_path = in_path.with_stem(in_path.stem + "_enhanced")

    model_type = _detect_model_type(args.model)

    if args.verbose:
        print(f"Model: {args.model}")
        print(f"Type: {model_type}")
        print(f"Input: {in_path}")
        print(f"Output: {out_path}")

    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by wall-clock adjustments while the model is running.
    start = time.perf_counter()

    if model_type == "deepfilternet":
        from mlx_audio.sts.models.deepfilternet import DeepFilterNetModel

        load_kwargs = {"model_name_or_path": args.model}
        # --version takes precedence; --subfolder is only forwarded when no
        # version was requested.
        if args.version is not None:
            load_kwargs["version"] = args.version
        elif args.subfolder is not None:
            load_kwargs["subfolder"] = args.subfolder

        model = DeepFilterNetModel.from_pretrained(**load_kwargs)

        if args.stream:
            model.enhance_file_streaming(str(in_path), str(out_path))
            mode = "streaming"
        else:
            model.enhance_file(str(in_path), str(out_path))
            mode = "offline"

    elif model_type == "mossformer2":
        from mlx_audio import audio_io
        from mlx_audio.sts.models.mossformer2_se import MossFormer2SEModel

        model = MossFormer2SEModel.from_pretrained(args.model)
        enhanced = model.enhance(str(in_path))
        audio_io.write(str(out_path), enhanced, model.config.sample_rate)
        mode = "offline"

    else:
        # Fail with a clear message if a hint is ever added without a branch
        # here, instead of falling through with `mode` unbound.
        raise ValueError(f"Unsupported model type: {model_type}")

    elapsed = time.perf_counter() - start

    if args.verbose:
        print(f"Mode: {mode}")
        print(f"Time: {elapsed:.2f}s")

    print(f"Saved: {out_path}")
143+
144+
145+
if __name__ == "__main__":
146+
main()

mlx_audio/sts/models/__init__.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
# Copyright (c) 2025 Prince Canuma and contributors (https://github.com/Blaizzy/mlx-audio)
22

3+
from .deepfilternet import (
4+
DeepFilterNet2Config,
5+
DeepFilterNet3Config,
6+
DeepFilterNetConfig,
7+
DeepFilterNetModel,
8+
DeepFilterNetStreamer,
9+
DeepFilterNetStreamingConfig,
10+
)
311
from .lfm_audio import (
412
ChatState,
513
GenerationConfig,
@@ -25,6 +33,13 @@
2533
"Batch",
2634
"save_audio",
2735
"SAMAudioConfig",
36+
# DeepFilterNet
37+
"DeepFilterNetModel",
38+
"DeepFilterNetConfig",
39+
"DeepFilterNet2Config",
40+
"DeepFilterNet3Config",
41+
"DeepFilterNetStreamer",
42+
"DeepFilterNetStreamingConfig",
2843
# MossFormer2 SE
2944
"MossFormer2SE",
3045
"MossFormer2SEConfig",
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# DeepFilterNet (MLX)
2+
3+
DeepFilterNet speech enhancement in pure MLX with support for model versions 1, 2, and 3.
4+
5+
Pretrained weights: [mlx-community/DeepFilterNet-mlx](https://huggingface.co/mlx-community/DeepFilterNet-mlx)
6+
7+
## Quick Start
8+
9+
```python
10+
from mlx_audio.sts.models.deepfilternet import DeepFilterNetModel
11+
12+
# Load v3 (default)
13+
model = DeepFilterNetModel.from_pretrained()
14+
model.enhance_file("noisy.wav", "clean.wav")
15+
16+
# Load a specific version
17+
model = DeepFilterNetModel.from_pretrained(version=2)
18+
19+
# Or specify the subfolder directly
20+
model = DeepFilterNetModel.from_pretrained(subfolder="v1")
21+
```
22+
23+
Streaming/chunked mode (true per-hop stateful processing for v2/v3), using a `model` loaded as in the Quick Start above:
24+
25+
```python
26+
streamer = model.create_streamer(pad_end_frames=3, compensate_delay=True)
27+
out_1 = streamer.process_chunk(chunk_a)
28+
out_2 = streamer.process_chunk(chunk_b)
29+
out_tail = streamer.flush()
30+
```
31+
32+
## Model Selection
33+
34+
Model architecture is selected automatically from `config.json` (`model_version` field).
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
"""DeepFilterNet speech enhancement model for MLX."""
2+
3+
from .config import DeepFilterNet2Config, DeepFilterNet3Config, DeepFilterNetConfig
4+
from .model import DeepFilterNetModel
5+
from .streaming import DeepFilterNetStreamer, DeepFilterNetStreamingConfig
6+
7+
# NOTE(review): generic aliases for the model and config classes — presumably
# so a loader can look up `Model` / `ModelConfig` by name; confirm against the caller.
Model = DeepFilterNetModel
8+
ModelConfig = DeepFilterNetConfig
9+
10+
__all__ = [
11+
"DeepFilterNetModel",
12+
"DeepFilterNetConfig",
13+
"DeepFilterNet2Config",
14+
"DeepFilterNet3Config",
15+
"DeepFilterNetStreamer",
16+
"DeepFilterNetStreamingConfig",
17+
"Model",
18+
"ModelConfig",
19+
]

0 commit comments

Comments
 (0)