Skip to content

HuggingFace audio dataset

Albert Zeyer edited this page Sep 29, 2025 · 8 revisions

(Generic) HuggingFace dataset in RETURNN: https://github.com/rwth-i6/returnn/issues/1257

The underlying storage format is Arrow. Parquet is another supported binary format, but it is not optimized for reading speed.

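HuggingFace audio datasets have a special structure: as the examples below show, each record usually contains an `audio` entry holding the decoded waveform (`array`) and its `sampling_rate`, often together with a `path`, while the remaining fields (e.g. the transcript key `text` / `transcript` / `sentence`, IDs, metadata) differ per dataset. See: https://huggingface.co/docs/hub/en/datasets-audio and https://huggingface.co/docs/datasets/en/about_dataset_features#audio-feature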

Also see: https://huggingface.co/spaces/hf-audio/open_asr_leaderboard

Examples: https://huggingface.co/datasets/openslr/librispeech_asr

{'chapter_id': 141231,
 'file': '/home/albert/.cache/.../dev_clean/1272/141231/1272-141231-0000.flac',
 'audio': {
    'array': array([-0.00048828, -0.00018311, -0.00137329, ...,  0.00079346, 0.00091553,  0.00085449], dtype=float32),
    'sampling_rate': 16000
 },
 'id': '1272-141231-0000',
 'speaker_id': 1272,
 'text': 'A MAN SAID TO THE UNIVERSE SIR I EXIST'}

https://huggingface.co/datasets/speechcolab/gigaspeech

{
    'segment_id': 'YOU0000000315_S0000660', 
    'speaker': 'N/A', 
    'text': "AS THEY'RE LEAVING <COMMA> CAN KASH PULL ZAHRA ASIDE REALLY QUICKLY <QUESTIONMARK>", 
    'audio': 
        {
            # in streaming mode 'path' will be 'xs_chunks_0000/YOU0000000315_S0000660.wav'
            'path': '/home/user/.cache/.../YOU0000000315_S0000660.wav', 
            'array': array([0.0005188 , 0.00085449, 0.00012207, ..., 0.00125122, 0.00076294, 0.00036621], dtype=float32), 
            'sampling_rate': 16000
        }, 
    'begin_time': 2941.889892578125, 
    'end_time': 2945.070068359375, 
    'audio_id': 'YOU0000000315', 
    'title': 'Return to Vasselheim | Critical Role: VOX MACHINA | Episode 43', 
    'url': 'https://www.youtube.com/watch?v=zr2n1fLVasU', 
    'source': 2, 
    'category': 24,
    'original_full_path': 'audio/youtube/P0004/YOU0000000315.opus'
}

https://huggingface.co/datasets/kensho/spgispeech

{
    'wav_filename': '32bcf9c9dc707fb61a04290e296f31eb/99.wav',
    'audio': {
        'path': '/home/user/.cache/.../99.wav',
        'array': array([-0.00039673, -0.00057983, -0.00057983, ..., -0.0007019 , -0.00027466,  0.00021362], dtype=float32),
        'sampling_rate': 16000
    },
    'wav_filesize': 292844,
    'transcript': 'This is proving to be true, and through ... in 2017. As a reminder,'
}

https://huggingface.co/datasets/hf-audio/esb-datasets-test-only-sorted

{
  'dataset': 'librispeech', 
  'audio': {'path': '/home/sanchit-gandhi/.cache/.../374-180298-0000.flac',
      'array': array([ 7.01904297e-04,  7.32421875e-04,  7.32421875e-04, ..., -2.74658203e-04, -1.83105469e-04, -3.05175781e-05]),
      'sampling_rate': 16000},
  'text': 'chapter sixteen i might have told you of the beginning of this liaison ...',
  'id': '374-180298-0000'
}

https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0

{
  'client_id': 'd59478fbc1ee646a28a3c652a1...6ae9c7afc5', 
  'path': 'et/clips/common_voice_et_18318995.mp3', 
  'audio': {
    'path': 'et/clips/common_voice_et_18318995.mp3', 
    'array': array([-0.00048828, -0.00018311, -0.00137329, ...,  0.00079346, 0.00091553,  0.00085449], dtype=float32), 
    'sampling_rate': 48000
  }, 
  'sentence': 'Tasub kokku saada inimestega, keda tunned juba ammust ajast saati.', 
  'up_votes': 2, 
  'down_votes': 0, 
  'age': 'twenties', 
  'gender': 'male', 
  'accent': '', 
  'locale': 'et', 
  'segment': ''
}

https://huggingface.co/datasets/espnet/yodas

{'id': '9774',
 'utt_id': 'YoRjzEnRcqu-00000-00000716-00000819',
 'audio': {
   'path': None,
   'array': array([-0.009552  , -0.01086426, -0.012146  , ..., -0.01992798, -0.01885986, -0.01074219]),
   'sampling_rate': 16000},
 'text': 'There is a saying'}

https://huggingface.co/datasets/MLCommons/peoples_speech

{"id": "gov_DOT_uscourts_DOT_scotus_..._mp3_00002.flac",
 "audio": {
   "path": "gov_DOT_..._mp3_00002.flac",
   "array": array([-6.10351562e-05, ...]),
   "sampling_rate": 16000
 },
 "duration_ms": 14490,
 "text": "contends that the suspension clause requires a [...]"}

https://huggingface.co/datasets/speechbrain/LoquaciousSet

{
    "ID": "20091124-0900-PLENARY-18-en_20091124-22:35:55_9",
    "duration": 17.120001,
    "wav": <{"decode": false, "_type": "Audio"}>,
    "spk_id": "1309",
    "sex": "male",
    "text": "AND WHAT ABOUT INTEROPERABILITY ... OF INTEROPERABILITY"
}
Clone this wiki locally