Skip to content

Commit f42d972

Browse files
committed
chore: update leaderboard data with accurate BPS/TPS and benchmarking instructions
1 parent 446180c commit f42d972

File tree

12 files changed

+458
-52
lines changed

12 files changed

+458
-52
lines changed

README.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,34 @@ If you use this code or result in your paper, please cite our work as:
284284
pages = "10330--10348",
285285
}
286286
```
287+
## Benchmarking and Leaderboard Contribution
288+
289+
We use the [voidful/codec-superb-tiny](https://huggingface.co/datasets/voidful/codec-superb-tiny) dataset for standard benchmarking.
290+
291+
### Steps to Evaluate a Codec
292+
293+
1. **Synthesize the Dataset**:
294+
Run `dataset_creator.py` to resynthesize the test set with the codec(s) you want to evaluate.
295+
```bash
296+
python3 dataset_creator.py --dataset voidful/codec-superb-tiny
297+
```
298+
*Note: By default, this processes all available codecs. To restrict the run to a specific codec, modify the script or apply a custom filter.*
299+
300+
2. **Calculate Metrics**:
301+
Run `benchmarking.py` to compute metrics (MEL, PESQ, STOI, F0Corr) for the synthesized audio.
302+
```bash
303+
python3 benchmarking.py --dataset datasets/voidful/codec-superb-tiny_synth
304+
```
305+
306+
3. **Submit Results**:
307+
After benchmarking, a result file named `datasets_voidful_codec-superb-tiny_synth_evaluation_results_*.json` will be generated in the project root.
308+
309+
To contribute your results to the leaderboard:
310+
- Open a **New Issue** in this repository.
311+
- Title it "New Benchmark Result: [Codec Name]".
312+
- Attach the generated JSON file or paste its content.
313+
- The maintainers will verify and merge your results into the official leaderboard.
314+
287315
## Contribution
288316

289317
Contributions are highly encouraged, whether it's through adding new codec models, expanding the dataset collection, or

SoundCodec/base_codec/descript_audio_codec.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ class DACBaseCodec(BaseCodec):
88
def __init__(self):
99
# Reference: https://github.com/descriptinc/descript-audio-codec
1010
super().__init__()
11+
# Force CPU if MPS is detected because DAC/audiotools uses float64 which MPS doesn't support
12+
if self.device == 'mps':
13+
self.device = 'cpu'
1114
import dac
1215
self.model_path = dac.utils.download(model_type=self.model_type)
1316
self.model = dac.DAC.load(self.model_path)

SoundCodec/base_codec/general.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,12 @@ class BaseCodec(ABC):
6969
"""Base class for all audio codecs with batch support."""
7070

7171
def __init__(self):
72-
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
72+
if torch.cuda.is_available():
73+
self.device = 'cuda'
74+
elif torch.backends.mps.is_available():
75+
self.device = 'mps'
76+
else:
77+
self.device = 'cpu'
7378
self.sampling_rate = None
7479
self.setting = None
7580
self.config()

calculate_metrics.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import os
2+
import torch
3+
import numpy as np
4+
from SoundCodec.codec import list_codec, load_codec
5+
6+
def _probe_unit_shape(codec, sr, seconds):
    """Encode `seconds` of random noise with `codec` and return the unit shape.

    Args:
        codec: Object exposing ``extract_unit(data_item)`` whose result has a
            ``unit`` attribute with a ``shape``.
        sr: Sampling rate (Hz) used to size the probe signal.
        seconds: Length of the probe signal in seconds.

    Returns:
        The ``shape`` of the extracted unit, unchanged (so it prints as-is).
    """
    audio_data = np.random.randn(int(sr * seconds)).astype(np.float32)
    data_item = {
        'audio': {
            'array': audio_data,
            'sampling_rate': sr,
        }
    }
    with torch.no_grad():
        extracted = codec.extract_unit(data_item)
    return extracted.unit.shape


def _find_time_axis(codec, sr, duration, shape):
    """Return the index of the axis that grows with input length, or None.

    The codec is probed a second time with twice as much audio; the single
    axis whose size increased is the time axis. ``None`` is returned when the
    second probe fails or the comparison is ambiguous, so the caller can fall
    back to a size-based guess.
    """
    try:
        longer_shape = _probe_unit_shape(codec, sr, 2 * duration)
    except Exception:
        # Best effort only: a failed second probe must not turn a codec that
        # already produced units into an ERROR row.
        return None
    if len(longer_shape) != len(shape):
        return None
    grown = [
        axis
        for axis, (short, long_) in enumerate(zip(shape, longer_shape))
        if long_ > short
    ]
    return grown[0] if len(grown) == 1 else None


def _count_frames(shape, time_axis=None):
    """Return the number of frames (tokens along time) in a unit of `shape`.

    Args:
        shape: Shape of the unit tensor, e.g. ``(Q, T)``, ``(T, Q)`` or ``(T,)``.
        time_axis: Index of the time axis if known. When ``None``, the longest
            axis is used, with ties resolved towards the last axis (i.e. a
            square 2-D shape is read as ``(Q, T)``).

    Returns:
        Size of the time axis, or 0 for a shape with no dimensions.
    """
    if len(shape) == 0:
        return 0
    if time_axis is None:
        time_axis = max(reversed(range(len(shape))), key=lambda axis: shape[axis])
    return int(shape[time_axis])


def calculate_metrics():
    """Print a table with the tokens-per-second (TPS) of every listed codec.

    For each name returned by ``list_codec()`` the codec is loaded, forced onto
    the CPU, and fed one second of random noise; the number of frames in the
    resulting unit tensor divided by the duration is reported as TPS together
    with the raw unit shape. Any exception raised while handling a codec is
    caught and printed as an ``ERROR`` row so the remaining codecs still run.

    The BPS column is always printed as ``?``: the unit tensor only holds
    codebook indices, so the bits per code cannot be derived from it.

    Side effects:
        Sets ``CUDA_VISIBLE_DEVICES`` to an empty string and replaces
        ``torch.backends.mps.is_available`` / ``is_built`` with functions
        returning ``False`` so that no accelerator is selected.
    """
    print(f"{'Codec':<40} | {'BPS (kbps)':<10} | {'TPS':<10}")
    print("-" * 65)

    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    # Monkeypatch to ensure no MPS is used
    if hasattr(torch.backends, 'mps'):
        torch.backends.mps.is_available = lambda: False
        torch.backends.mps.is_built = lambda: False

    duration = 1.0  # seconds of probe audio

    for name in list_codec():
        try:
            codec = load_codec(name)

            # Fall back to 16 kHz when the codec does not declare a rate.
            sr = getattr(codec, 'sampling_rate', 16000)
            if sr is None:
                sr = 16000

            # Force CPU so inputs and weights end up on the same device.
            if hasattr(codec, 'device'):
                codec.device = 'cpu'
            model = getattr(codec, 'model', None)
            # Not every wrapped model exposes `.to`; calling it blindly raised
            # "'AudioDec' object has no attribute 'to'".
            if callable(getattr(model, 'to', None)):
                model.to('cpu')

            shape = _probe_unit_shape(codec, sr, duration)

            # Guessing the time axis from sizes alone is wrong whenever there
            # are more quantizers than frames: a (32, 25) unit from one second
            # of audio would be read as 32 frames. Detect the axis that scales
            # with the input length instead.
            time_axis = _find_time_axis(codec, sr, duration, shape)
            tps = _count_frames(shape, time_axis) / duration

            print(f"{name:<40} | {'?':<10} | {tps:<10.2f} (Shape: {shape})")

        except Exception as e:
            print(f"{name:<40} | ERROR | {e}")


if __name__ == "__main__":
    calculate_metrics()

output.txt

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
Codec | BPS (kbps) | TPS
2+
-----------------------------------------------------------------
3+
academicodec_hifi_16k_320d | ? | 50.00 (Shape: torch.Size([4, 50]))
4+
academicodec_hifi_16k_320d_large_uni | ? | 50.00 (Shape: torch.Size([4, 50]))
5+
academicodec_hifi_24k_320d | ? | 75.00 (Shape: torch.Size([4, 75]))
6+
Load tx_encoder: audiodec_autoencoder_24k_320d/checkpoint-500000steps.pkl
7+
Load rx_encoder: audiodec_autoencoder_24k_320d/checkpoint-500000steps.pkl
8+
Load decoder: audiodec_vocoder_24k_320d/checkpoint-500000steps.pkl
9+
audiodec_24k_320d | ERROR | 'AudioDec' object has no attribute 'to'
10+
[AUV] Missing keys in checkpoint (using default values): 60 keys
11+
[AUV] Detected GELU->Snake conversion, initializing Snake alpha parameters...
12+
auv | ? | 50.00 (Shape: torch.Size([1, 50]))
13+
bigcodec_1k | ERROR | slow_conv2d_forward_mps: input(device='cpu') and weight(device=mps:0') must be on the same device
14+
dac_16k | ? | 50.00 (Shape: torch.Size([50, 12]))
15+
dac_24k | ? | 75.00 (Shape: torch.Size([75, 32]))
16+
dac_44k | ? | 87.00 (Shape: torch.Size([87, 9]))
17+
encodec_24k_12bps | ? | 75.00 (Shape: torch.Size([16, 75]))
18+
encodec_24k_1_5bps | ? | 75.00 (Shape: torch.Size([2, 75]))
19+
encodec_24k_24bps | ? | 75.00 (Shape: torch.Size([32, 75]))
20+
encodec_24k_3bps | ? | 75.00 (Shape: torch.Size([4, 75]))
21+
encodec_24k_6bps | ? | 75.00 (Shape: torch.Size([8, 75]))
22+
funcodec_en_libritts_16k_gr1nq32ds320 | ERROR | stft input and window must be on the same device but got self on mps:0 and window on cpu
23+
funcodec_en_libritts_16k_gr8nq32ds320 | ERROR | stft input and window must be on the same device but got self on mps:0 and window on cpu
24+
funcodec_en_libritts_16k_nq32ds320 | ERROR | Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same
25+
funcodec_en_libritts_16k_nq32ds640 | ERROR | Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same
26+
funcodec_zh_en_16k_nq32ds320 | ERROR | Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same
27+
funcodec_zh_en_16k_nq32ds640 | ERROR | Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same
28+
s3tokenizer_v1 | ERROR | slow_conv2d_forward_mps: input(device='cpu') and weight(device=mps:0') must be on the same device
29+
speech_tokenizer_16k | ? | 50.00 (Shape: torch.Size([8, 50]))
30+
sqcodec_16k_0k75bps | ? | 200.00 (Shape: torch.Size([1, 200]))
31+
sqcodec_16k_12kbps | ? | 800.00 (Shape: torch.Size([1, 800]))
32+
sqcodec_16k_1k5bps | ? | 300.00 (Shape: torch.Size([1, 300]))
33+
sqcodec_16k_3kbps | ? | 400.00 (Shape: torch.Size([1, 400]))
34+
sqcodec_16k_6kbps | ? | 600.00 (Shape: torch.Size([1, 600]))
35+
sqcodec_24k_12kbps | ? | 800.00 (Shape: torch.Size([1, 800]))
36+
sqcodec_24k_24kbps | ? | 1800.00 (Shape: torch.Size([1, 1800]))
37+
making attention of type 'vanilla' with 768 in_channels
38+
unicodec_24k | ? | 75.00 (Shape: torch.Size([1, 75]))
39+
wavtokenizer_24k_large_600_4096 | ? | 40.00 (Shape: torch.Size([1, 40]))
40+
wavtokenizer_24k_large_speech_75token | ? | 75.00 (Shape: torch.Size([1, 75]))
41+
wavtokenizer_24k_medium_600_4096 | ? | 75.00 (Shape: torch.Size([1, 75]))
42+
wavtokenizer_24k_small_600_4096 | ? | 40.00 (Shape: torch.Size([1, 40]))

output_cpu.txt

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
Codec | BPS (kbps) | TPS
2+
-----------------------------------------------------------------
3+
academicodec_hifi_16k_320d | ? | 50.00 (Shape: torch.Size([4, 50]))
4+
academicodec_hifi_16k_320d_large_uni | ? | 50.00 (Shape: torch.Size([4, 50]))
5+
academicodec_hifi_24k_320d | ? | 75.00 (Shape: torch.Size([4, 75]))
6+
Load tx_encoder: audiodec_autoencoder_24k_320d/checkpoint-500000steps.pkl
7+
Load rx_encoder: audiodec_autoencoder_24k_320d/checkpoint-500000steps.pkl
8+
Load decoder: audiodec_vocoder_24k_320d/checkpoint-500000steps.pkl
9+
audiodec_24k_320d | ERROR | 'AudioDec' object has no attribute 'to'
10+
[AUV] Missing keys in checkpoint (using default values): 60 keys
11+
[AUV] Detected GELU->Snake conversion, initializing Snake alpha parameters...
12+
auv | ? | 50.00 (Shape: torch.Size([1, 50]))
13+
bigcodec_1k | ? | 80.00 (Shape: torch.Size([1, 80]))
14+
dac_16k | ? | 50.00 (Shape: torch.Size([50, 12]))
15+
dac_24k | ? | 75.00 (Shape: torch.Size([75, 32]))
16+
dac_44k | ? | 87.00 (Shape: torch.Size([87, 9]))
17+
encodec_24k_12bps | ? | 75.00 (Shape: torch.Size([16, 75]))
18+
encodec_24k_1_5bps | ? | 75.00 (Shape: torch.Size([2, 75]))
19+
encodec_24k_24bps | ? | 75.00 (Shape: torch.Size([32, 75]))
20+
encodec_24k_3bps | ? | 75.00 (Shape: torch.Size([4, 75]))
21+
encodec_24k_6bps | ? | 75.00 (Shape: torch.Size([8, 75]))
22+
funcodec_en_libritts_16k_gr1nq32ds320 | ? | 51.00 (Shape: torch.Size([32, 51]))
23+
funcodec_en_libritts_16k_gr8nq32ds320 | ? | 51.00 (Shape: torch.Size([32, 51]))
24+
funcodec_en_libritts_16k_nq32ds320 | ? | 50.00 (Shape: torch.Size([32, 50]))
25+
funcodec_en_libritts_16k_nq32ds640 | ? | 32.00 (Shape: torch.Size([32, 25]))
26+
funcodec_zh_en_16k_nq32ds320 | ? | 50.00 (Shape: torch.Size([32, 50]))
27+
funcodec_zh_en_16k_nq32ds640 | ? | 32.00 (Shape: torch.Size([32, 25]))
28+
s3tokenizer_v1 | ? | 50.00 (Shape: torch.Size([50]))
29+
speech_tokenizer_16k | ? | 50.00 (Shape: torch.Size([8, 50]))
30+
sqcodec_16k_0k75bps | ? | 200.00 (Shape: torch.Size([1, 200]))
31+
sqcodec_16k_12kbps | ? | 800.00 (Shape: torch.Size([1, 800]))
32+
sqcodec_16k_1k5bps | ? | 300.00 (Shape: torch.Size([1, 300]))
33+
sqcodec_16k_3kbps | ? | 400.00 (Shape: torch.Size([1, 400]))
34+
sqcodec_16k_6kbps | ? | 600.00 (Shape: torch.Size([1, 600]))
35+
sqcodec_24k_12kbps | ? | 800.00 (Shape: torch.Size([1, 800]))
36+
sqcodec_24k_24kbps | ? | 1800.00 (Shape: torch.Size([1, 1800]))
37+
making attention of type 'vanilla' with 768 in_channels
38+
unicodec_24k | ? | 75.00 (Shape: torch.Size([1, 75]))
39+
wavtokenizer_24k_large_600_4096 | ? | 40.00 (Shape: torch.Size([1, 40]))
40+
wavtokenizer_24k_large_speech_75token | ? | 75.00 (Shape: torch.Size([1, 75]))
41+
wavtokenizer_24k_medium_600_4096 | ? | 75.00 (Shape: torch.Size([1, 75]))
42+
wavtokenizer_24k_small_600_4096 | ? | 40.00 (Shape: torch.Size([1, 40]))

output_full.txt

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
Codec | BPS (kbps) | TPS
2+
-----------------------------------------------------------------
3+
academicodec_hifi_16k_320d | ? | 50.00 (Shape: torch.Size([4, 50]))
4+
academicodec_hifi_16k_320d_large_uni | ? | 50.00 (Shape: torch.Size([4, 50]))
5+
academicodec_hifi_24k_320d | ? | 75.00 (Shape: torch.Size([4, 75]))
6+
Load tx_encoder: audiodec_autoencoder_24k_320d/checkpoint-500000steps.pkl
7+
Load rx_encoder: audiodec_autoencoder_24k_320d/checkpoint-500000steps.pkl
8+
Load decoder: audiodec_vocoder_24k_320d/checkpoint-500000steps.pkl
9+
audiodec_24k_320d | ERROR | 'AudioDec' object has no attribute 'to'
10+
[AUV] Missing keys in checkpoint (using default values): 60 keys
11+
[AUV] Detected GELU->Snake conversion, initializing Snake alpha parameters...
12+
auv | ? | 50.00 (Shape: torch.Size([1, 50]))
13+
bigcodec_1k | ERROR | slow_conv2d_forward_mps: input(device='cpu') and weight(device=mps:0') must be on the same device
14+
dac_16k | ? | 50.00 (Shape: torch.Size([50, 12]))
15+
dac_24k | ? | 75.00 (Shape: torch.Size([75, 32]))
16+
dac_44k | ? | 87.00 (Shape: torch.Size([87, 9]))
17+
encodec_24k_12bps | ? | 75.00 (Shape: torch.Size([16, 75]))
18+
encodec_24k_1_5bps | ? | 75.00 (Shape: torch.Size([2, 75]))
19+
encodec_24k_24bps | ? | 75.00 (Shape: torch.Size([32, 75]))
20+
encodec_24k_3bps | ? | 75.00 (Shape: torch.Size([4, 75]))
21+
encodec_24k_6bps | ? | 75.00 (Shape: torch.Size([8, 75]))
22+
funcodec_en_libritts_16k_gr1nq32ds320 | ERROR | stft input and window must be on the same device but got self on mps:0 and window on cpu
23+
funcodec_en_libritts_16k_gr8nq32ds320 | ERROR | stft input and window must be on the same device but got self on mps:0 and window on cpu
24+
funcodec_en_libritts_16k_nq32ds320 | ERROR | Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same
25+
funcodec_en_libritts_16k_nq32ds640 | ERROR | Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same
26+
funcodec_zh_en_16k_nq32ds320 | ERROR | Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same
27+
funcodec_zh_en_16k_nq32ds640 | ERROR | Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same
28+
s3tokenizer_v1 | ERROR | slow_conv2d_forward_mps: input(device='cpu') and weight(device=mps:0') must be on the same device
29+
speech_tokenizer_16k | ? | 50.00 (Shape: torch.Size([8, 50]))
30+
sqcodec_16k_0k75bps | ? | 200.00 (Shape: torch.Size([1, 200]))
31+
sqcodec_16k_12kbps | ? | 800.00 (Shape: torch.Size([1, 800]))
32+
sqcodec_16k_1k5bps | ? | 300.00 (Shape: torch.Size([1, 300]))
33+
sqcodec_16k_3kbps | ? | 400.00 (Shape: torch.Size([1, 400]))
34+
sqcodec_16k_6kbps | ? | 600.00 (Shape: torch.Size([1, 600]))
35+
sqcodec_24k_12kbps | ? | 800.00 (Shape: torch.Size([1, 800]))
36+
sqcodec_24k_24kbps | ? | 1800.00 (Shape: torch.Size([1, 1800]))
37+
making attention of type 'vanilla' with 768 in_channels
38+
unicodec_24k | ? | 75.00 (Shape: torch.Size([1, 75]))
39+
wavtokenizer_24k_large_600_4096 | ? | 40.00 (Shape: torch.Size([1, 40]))
40+
wavtokenizer_24k_large_speech_75token | ? | 75.00 (Shape: torch.Size([1, 75]))
41+
wavtokenizer_24k_medium_600_4096 | ? | 75.00 (Shape: torch.Size([1, 75]))
42+
wavtokenizer_24k_small_600_4096 | ? | 40.00 (Shape: torch.Size([1, 40]))

0 commit comments

Comments
 (0)