voidful
diff --git a/‎README.md‎
Lines changed: 28 additions & 0 deletions b/‎README.md‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎SoundCodec/base_codec/descript_audio_codec.py‎
Lines changed: 3 additions & 0 deletions b/‎SoundCodec/base_codec/descript_audio_codec.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎SoundCodec/base_codec/general.py‎
Lines changed: 6 additions & 1 deletion b/‎SoundCodec/base_codec/general.py‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎calculate_metrics.py‎
Lines changed: 127 additions & 0 deletions b/‎calculate_metrics.py‎
Lines changed: 127 additions & 0 deletions
diff --git a/‎output.txt‎
Lines changed: 42 additions & 0 deletions b/‎output.txt‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎output_cpu.txt‎
Lines changed: 42 additions & 0 deletions b/‎output_cpu.txt‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎output_full.txt‎
Lines changed: 42 additions & 0 deletions b/‎output_full.txt‎
Lines changed: 42 additions & 0 deletions
@@ -284,6 +284,34 @@ If you use this code or result in your paper, please cite our work as:
     pages = "10330--10348",
 }
 ```
+## Benchmarking and Leaderboard Contribution
+
+We use the [voidful/codec-superb-tiny](https://huggingface.co/datasets/voidful/codec-superb-tiny) dataset for standard benchmarking.
+
+### Steps to Evaluate a Codec
+
+1.  **Synthesize the Dataset**:
+    Run `dataset_creator.py` to synthesize the test set with your desired codec.
+    ```bash
+    python3 dataset_creator.py --dataset voidful/codec-superb-tiny
+    ```
+    *Note: By default, this processes every available codec. To restrict the run to a specific codec, modify the script or apply a custom filter.*
+
+2.  **Calculate Metrics**:
+    Run `benchmarking.py` to compute metrics (MEL, PESQ, STOI, F0Corr) for the synthesized audio.
+    ```bash
+    python3 benchmarking.py --dataset datasets/voidful/codec-superb-tiny_synth
+    ```
+
+3.  **Submit Results**:
+    When benchmarking finishes, a result file matching `datasets_voidful_codec-superb-tiny_synth_evaluation_results_*.json` is written to the project root.
+    
+    To contribute your results to the leaderboard:
+    - Open a **New Issue** in this repository.
+    - Title it "New Benchmark Result: [Codec Name]".
+    - Attach the generated JSON file or paste its content.
+    - The maintainers will verify and merge your results into the official leaderboard.
+
 ## Contribution
 
 Contributions are highly encouraged, whether it's through adding new codec models, expanding the dataset collection, or
 
@@ -8,6 +8,9 @@ class DACBaseCodec(BaseCodec):
     def __init__(self):
         # Reference: https://github.com/descriptinc/descript-audio-codec
         super().__init__()
+        # Force CPU if MPS is detected because DAC/audiotools uses float64 which MPS doesn't support
+        if self.device == 'mps':
+             self.device = 'cpu'
         import dac
         self.model_path = dac.utils.download(model_type=self.model_type)
         self.model = dac.DAC.load(self.model_path)
 
@@ -69,7 +69,12 @@ class BaseCodec(ABC):
     """Base class for all audio codecs with batch support."""
 
     def __init__(self):
-        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        if torch.cuda.is_available():
+            self.device = 'cuda'
+        elif torch.backends.mps.is_available():
+            self.device = 'mps'
+        else:
+            self.device = 'cpu'
         self.sampling_rate = None
         self.setting = None
         self.config()
 
@@ -0,0 +1,127 @@
+import os
+import torch
+import numpy as np
+from SoundCodec.codec import list_codec, load_codec
+
+def calculate_metrics():
+    print(f"{'Codec':<40} | {'BPS (kbps)':<10} | {'TPS':<10}")
+    print("-" * 65)
+    
+    os.environ['CUDA_VISIBLE_DEVICES'] = ''
+    # Monkeypatch to ensure no MPS is used
+    if hasattr(torch.backends, 'mps'):
+        torch.backends.mps.is_available = lambda: False
+        torch.backends.mps.is_built = lambda: False
+    device = 'cpu'
+    
+    codecs = list_codec()
+    
+    # Create specific test inputs
+    duration = 1.0 # 1 second
+    
+    for name in codecs:
+        try:
+            # Skip problematic ones for now if they crash, but try to run all
+            if name in ['bigcodec_1k', 'dac_24k', 'dac_44k', 's3tokenizer_v1']:
+               # We know these are problematic on this env, but let's try or skip
+               # For now, let's catch exceptions
+               pass
+
+            metric_name = name
+            codec = load_codec(name)
+            
+            # Determine sampling rate
+            sr = getattr(codec, 'sampling_rate', 16000)
+            if sr is None: sr = 16000
+            
+            # Generate 1 second of silence/noise
+            # standard shape is usually (1, T) or (T,)
+            audio_data = np.random.randn(int(sr * duration)).astype(np.float32)
+            
+            data_item = {
+                'audio': {
+                    'array': audio_data,
+                    'sampling_rate': sr
+                }
+            }
+            
+            # Extract unit
+            # Move to device if necessary? Base codecs usually handle 'cpu' default or auto-device
+            # But let's force cpu for safety to avoid MPS issues seen earlier
+            if hasattr(codec, 'config'):
+                # Some codecs might need explicit config call if not in __init__
+                pass
+            
+            if hasattr(codec, 'device'):
+                # Force CPU for calculation safety
+                 codec.device = 'cpu'
+                 if hasattr(codec, 'model'):
+                     codec.model.to('cpu')
+
+            with torch.no_grad():
+                extracted = codec.extract_unit(data_item)
+                unit = extracted.unit
+            
+            # Calculate TPS
+            # unit shape is typically (n_quantizers, T) or (T, n_quantizers) or just (T)
+            # We need to find the time dimension.
+            # Usually the longest dimension that is not the quantizer count (which is usually small, e.g. 4, 8, 32, 128)
+            # WavTokenizer: (1, T) -> T is tokens
+            # Encodec: (n_q, T)
+            
+            shape = unit.shape
+            # Heuristic to find Time dimension
+            # Usually T is roughly sr / stride
+            # codebook dim is usually small < 128
+            
+            if len(shape) == 1:
+                frames = shape[0]
+                num_quantizers = 1
+            elif len(shape) == 2:
+                if shape[0] > shape[1]: # (T, Q)
+                    frames = shape[0]
+                    num_quantizers = shape[1]
+                else: # (Q, T)
+                    frames = shape[1]
+                    num_quantizers = shape[0]
+            elif len(shape) == 3:
+                 # (B, Q, T) or (B, T, Q) -> assume B=1 from extract_unit usually returning squeezed
+                 # But extract_unit usually returns (Q, T) or (T)
+                 # Let's assume (Q, T) mostly
+                 frames = max(shape)
+                 num_quantizers = shape[0] * shape[1] * shape[2] / frames # Simple check
+            else:
+                frames = 0
+                num_quantizers = 0
+            
+            tps = frames / duration
+            
+            # Calculate BPS
+            # Depends on codebook size (bits per token)
+            # Most codecs use 1024 (10 bits) or 2048 (11 bits) or similar.
+            # However, exact bitrate is often defined as:
+            # Bitrate = FrameRate * NumQuantizers * BitsPerCode
+            # But "BitsPerCode" depends on the model.
+            
+            # ALTERNATIVE: Use the metric name to guess for some, but user wants calculation.
+            # We can't easily know the codebook size from just the unit tensor (it contains indices).
+            # But we can assume standard codebook sizes:
+            # Encodec: 1024 (10 bits)
+            # DAC: 1024 (10 bits)
+            # FunCodec: usually 1024?
+            
+            # Actually, calculating BPS from *tensor size* is tricky without knowing vocab size.
+            # But we can print TPS for sure.
+            # For BPS, checking the paper/config is safer if we can't inspect the model.
+            
+            # Let's print TPS first, and try to deduce BPS if possible.
+            # For Encodec, we know bits = n_q * 10.
+            # BPS (kbps) = TPS * n_q * 10 / 1000
+            
+            print(f"{metric_name:<40} | {'?':<10} | {tps:<10.2f} (Shape: {shape})")
+
+        except Exception as e:
+            print(f"{name:<40} | ERROR      | {e}")
+
+# Script entry point: print the metrics table only when this file is run
+# directly, not when it is imported.
+if __name__ == "__main__":
+    calculate_metrics()
@@ -0,0 +1,42 @@
+Codec                                    | BPS (kbps) | TPS       
+-----------------------------------------------------------------
+academicodec_hifi_16k_320d               | ?          | 50.00      (Shape: torch.Size([4, 50]))
+academicodec_hifi_16k_320d_large_uni     | ?          | 50.00      (Shape: torch.Size([4, 50]))
+academicodec_hifi_24k_320d               | ?          | 75.00      (Shape: torch.Size([4, 75]))
+Load tx_encoder: audiodec_autoencoder_24k_320d/checkpoint-500000steps.pkl
+Load rx_encoder: audiodec_autoencoder_24k_320d/checkpoint-500000steps.pkl
+Load decoder: audiodec_vocoder_24k_320d/checkpoint-500000steps.pkl
+audiodec_24k_320d                        | ERROR      | 'AudioDec' object has no attribute 'to'
+[AUV] Missing keys in checkpoint (using default values): 60 keys
+[AUV] Detected GELU->Snake conversion, initializing Snake alpha parameters...
+auv                                      | ?          | 50.00      (Shape: torch.Size([1, 50]))
+bigcodec_1k                              | ERROR      | slow_conv2d_forward_mps: input(device='cpu') and weight(device=mps:0')  must be on the same device
+dac_16k                                  | ?          | 50.00      (Shape: torch.Size([50, 12]))
+dac_24k                                  | ?          | 75.00      (Shape: torch.Size([75, 32]))
+dac_44k                                  | ?          | 87.00      (Shape: torch.Size([87, 9]))
+encodec_24k_12bps                        | ?          | 75.00      (Shape: torch.Size([16, 75]))
+encodec_24k_1_5bps                       | ?          | 75.00      (Shape: torch.Size([2, 75]))
+encodec_24k_24bps                        | ?          | 75.00      (Shape: torch.Size([32, 75]))
+encodec_24k_3bps                         | ?          | 75.00      (Shape: torch.Size([4, 75]))
+encodec_24k_6bps                         | ?          | 75.00      (Shape: torch.Size([8, 75]))
+funcodec_en_libritts_16k_gr1nq32ds320    | ERROR      | stft input and window must be on the same device but got self on mps:0 and window on cpu
+funcodec_en_libritts_16k_gr8nq32ds320    | ERROR      | stft input and window must be on the same device but got self on mps:0 and window on cpu
+funcodec_en_libritts_16k_nq32ds320       | ERROR      | Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same
+funcodec_en_libritts_16k_nq32ds640       | ERROR      | Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same
+funcodec_zh_en_16k_nq32ds320             | ERROR      | Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same
+funcodec_zh_en_16k_nq32ds640             | ERROR      | Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same
+s3tokenizer_v1                           | ERROR      | slow_conv2d_forward_mps: input(device='cpu') and weight(device=mps:0')  must be on the same device
+speech_tokenizer_16k                     | ?          | 50.00      (Shape: torch.Size([8, 50]))
+sqcodec_16k_0k75bps                      | ?          | 200.00     (Shape: torch.Size([1, 200]))
+sqcodec_16k_12kbps                       | ?          | 800.00     (Shape: torch.Size([1, 800]))
+sqcodec_16k_1k5bps                       | ?          | 300.00     (Shape: torch.Size([1, 300]))
+sqcodec_16k_3kbps                        | ?          | 400.00     (Shape: torch.Size([1, 400]))
+sqcodec_16k_6kbps                        | ?          | 600.00     (Shape: torch.Size([1, 600]))
+sqcodec_24k_12kbps                       | ?          | 800.00     (Shape: torch.Size([1, 800]))
+sqcodec_24k_24kbps                       | ?          | 1800.00    (Shape: torch.Size([1, 1800]))
+making attention of type 'vanilla' with 768 in_channels
+unicodec_24k                             | ?          | 75.00      (Shape: torch.Size([1, 75]))
+wavtokenizer_24k_large_600_4096          | ?          | 40.00      (Shape: torch.Size([1, 40]))
+wavtokenizer_24k_large_speech_75token    | ?          | 75.00      (Shape: torch.Size([1, 75]))
+wavtokenizer_24k_medium_600_4096         | ?          | 75.00      (Shape: torch.Size([1, 75]))
+wavtokenizer_24k_small_600_4096          | ?          | 40.00      (Shape: torch.Size([1, 40]))
@@ -0,0 +1,42 @@
+Codec                                    | BPS (kbps) | TPS       
+-----------------------------------------------------------------
+academicodec_hifi_16k_320d               | ?          | 50.00      (Shape: torch.Size([4, 50]))
+academicodec_hifi_16k_320d_large_uni     | ?          | 50.00      (Shape: torch.Size([4, 50]))
+academicodec_hifi_24k_320d               | ?          | 75.00      (Shape: torch.Size([4, 75]))
+Load tx_encoder: audiodec_autoencoder_24k_320d/checkpoint-500000steps.pkl
+Load rx_encoder: audiodec_autoencoder_24k_320d/checkpoint-500000steps.pkl
+Load decoder: audiodec_vocoder_24k_320d/checkpoint-500000steps.pkl
+audiodec_24k_320d                        | ERROR      | 'AudioDec' object has no attribute 'to'
+[AUV] Missing keys in checkpoint (using default values): 60 keys
+[AUV] Detected GELU->Snake conversion, initializing Snake alpha parameters...
+auv                                      | ?          | 50.00      (Shape: torch.Size([1, 50]))
+bigcodec_1k                              | ?          | 80.00      (Shape: torch.Size([1, 80]))
+dac_16k                                  | ?          | 50.00      (Shape: torch.Size([50, 12]))
+dac_24k                                  | ?          | 75.00      (Shape: torch.Size([75, 32]))
+dac_44k                                  | ?          | 87.00      (Shape: torch.Size([87, 9]))
+encodec_24k_12bps                        | ?          | 75.00      (Shape: torch.Size([16, 75]))
+encodec_24k_1_5bps                       | ?          | 75.00      (Shape: torch.Size([2, 75]))
+encodec_24k_24bps                        | ?          | 75.00      (Shape: torch.Size([32, 75]))
+encodec_24k_3bps                         | ?          | 75.00      (Shape: torch.Size([4, 75]))
+encodec_24k_6bps                         | ?          | 75.00      (Shape: torch.Size([8, 75]))
+funcodec_en_libritts_16k_gr1nq32ds320    | ?          | 51.00      (Shape: torch.Size([32, 51]))
+funcodec_en_libritts_16k_gr8nq32ds320    | ?          | 51.00      (Shape: torch.Size([32, 51]))
+funcodec_en_libritts_16k_nq32ds320       | ?          | 50.00      (Shape: torch.Size([32, 50]))
+funcodec_en_libritts_16k_nq32ds640       | ?          | 32.00      (Shape: torch.Size([32, 25]))
+funcodec_zh_en_16k_nq32ds320             | ?          | 50.00      (Shape: torch.Size([32, 50]))
+funcodec_zh_en_16k_nq32ds640             | ?          | 32.00      (Shape: torch.Size([32, 25]))
+s3tokenizer_v1                           | ?          | 50.00      (Shape: torch.Size([50]))
+speech_tokenizer_16k                     | ?          | 50.00      (Shape: torch.Size([8, 50]))
+sqcodec_16k_0k75bps                      | ?          | 200.00     (Shape: torch.Size([1, 200]))
+sqcodec_16k_12kbps                       | ?          | 800.00     (Shape: torch.Size([1, 800]))
+sqcodec_16k_1k5bps                       | ?          | 300.00     (Shape: torch.Size([1, 300]))
+sqcodec_16k_3kbps                        | ?          | 400.00     (Shape: torch.Size([1, 400]))
+sqcodec_16k_6kbps                        | ?          | 600.00     (Shape: torch.Size([1, 600]))
+sqcodec_24k_12kbps                       | ?          | 800.00     (Shape: torch.Size([1, 800]))
+sqcodec_24k_24kbps                       | ?          | 1800.00    (Shape: torch.Size([1, 1800]))
+making attention of type 'vanilla' with 768 in_channels
+unicodec_24k                             | ?          | 75.00      (Shape: torch.Size([1, 75]))
+wavtokenizer_24k_large_600_4096          | ?          | 40.00      (Shape: torch.Size([1, 40]))
+wavtokenizer_24k_large_speech_75token    | ?          | 75.00      (Shape: torch.Size([1, 75]))
+wavtokenizer_24k_medium_600_4096         | ?          | 75.00      (Shape: torch.Size([1, 75]))
+wavtokenizer_24k_small_600_4096          | ?          | 40.00      (Shape: torch.Size([1, 40]))
@@ -0,0 +1,42 @@
+Codec                                    | BPS (kbps) | TPS       
+-----------------------------------------------------------------
+academicodec_hifi_16k_320d               | ?          | 50.00      (Shape: torch.Size([4, 50]))
+academicodec_hifi_16k_320d_large_uni     | ?          | 50.00      (Shape: torch.Size([4, 50]))
+academicodec_hifi_24k_320d               | ?          | 75.00      (Shape: torch.Size([4, 75]))
+Load tx_encoder: audiodec_autoencoder_24k_320d/checkpoint-500000steps.pkl
+Load rx_encoder: audiodec_autoencoder_24k_320d/checkpoint-500000steps.pkl
+Load decoder: audiodec_vocoder_24k_320d/checkpoint-500000steps.pkl
+audiodec_24k_320d                        | ERROR      | 'AudioDec' object has no attribute 'to'
+[AUV] Missing keys in checkpoint (using default values): 60 keys
+[AUV] Detected GELU->Snake conversion, initializing Snake alpha parameters...
+auv                                      | ?          | 50.00      (Shape: torch.Size([1, 50]))
+bigcodec_1k                              | ERROR      | slow_conv2d_forward_mps: input(device='cpu') and weight(device=mps:0')  must be on the same device
+dac_16k                                  | ?          | 50.00      (Shape: torch.Size([50, 12]))
+dac_24k                                  | ?          | 75.00      (Shape: torch.Size([75, 32]))
+dac_44k                                  | ?          | 87.00      (Shape: torch.Size([87, 9]))
+encodec_24k_12bps                        | ?          | 75.00      (Shape: torch.Size([16, 75]))
+encodec_24k_1_5bps                       | ?          | 75.00      (Shape: torch.Size([2, 75]))
+encodec_24k_24bps                        | ?          | 75.00      (Shape: torch.Size([32, 75]))
+encodec_24k_3bps                         | ?          | 75.00      (Shape: torch.Size([4, 75]))
+encodec_24k_6bps                         | ?          | 75.00      (Shape: torch.Size([8, 75]))
+funcodec_en_libritts_16k_gr1nq32ds320    | ERROR      | stft input and window must be on the same device but got self on mps:0 and window on cpu
+funcodec_en_libritts_16k_gr8nq32ds320    | ERROR      | stft input and window must be on the same device but got self on mps:0 and window on cpu
+funcodec_en_libritts_16k_nq32ds320       | ERROR      | Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same
+funcodec_en_libritts_16k_nq32ds640       | ERROR      | Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same
+funcodec_zh_en_16k_nq32ds320             | ERROR      | Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same
+funcodec_zh_en_16k_nq32ds640             | ERROR      | Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same
+s3tokenizer_v1                           | ERROR      | slow_conv2d_forward_mps: input(device='cpu') and weight(device=mps:0')  must be on the same device
+speech_tokenizer_16k                     | ?          | 50.00      (Shape: torch.Size([8, 50]))
+sqcodec_16k_0k75bps                      | ?          | 200.00     (Shape: torch.Size([1, 200]))
+sqcodec_16k_12kbps                       | ?          | 800.00     (Shape: torch.Size([1, 800]))
+sqcodec_16k_1k5bps                       | ?          | 300.00     (Shape: torch.Size([1, 300]))
+sqcodec_16k_3kbps                        | ?          | 400.00     (Shape: torch.Size([1, 400]))
+sqcodec_16k_6kbps                        | ?          | 600.00     (Shape: torch.Size([1, 600]))
+sqcodec_24k_12kbps                       | ?          | 800.00     (Shape: torch.Size([1, 800]))
+sqcodec_24k_24kbps                       | ?          | 1800.00    (Shape: torch.Size([1, 1800]))
+making attention of type 'vanilla' with 768 in_channels
+unicodec_24k                             | ?          | 75.00      (Shape: torch.Size([1, 75]))
+wavtokenizer_24k_large_600_4096          | ?          | 40.00      (Shape: torch.Size([1, 40]))
+wavtokenizer_24k_large_speech_75token    | ?          | 75.00      (Shape: torch.Size([1, 75]))
+wavtokenizer_24k_medium_600_4096         | ?          | 75.00      (Shape: torch.Size([1, 75]))
+wavtokenizer_24k_small_600_4096          | ?          | 40.00      (Shape: torch.Size([1, 40]))