Added HfHubDownloadYodas2Data, HfHubDownload, GetGranarysYodas2

ssh-meister · ssh-meister · commit c9e72fdd85d5 · 2025-05-19T03:25:37.000-07:00
Signed-off-by: Sasha Meister <ameister@nvidia.com>
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
@@ -81,7 +81,7 @@
 from sdp.processors.manage_files.convert_to_tarred_audio_dataset import ConvertToTarredAudioDataset
 
 from sdp.processors.huggingface.create_initial_manifest import CreateInitialManifestHuggingFace
-from sdp.processors.huggingface.huggingface_hub import ListRepoFiles, SnapshotDownload
+from sdp.processors.huggingface.huggingface_hub import ListRepoFiles, SnapshotDownload, HfHubDownload
 
 from sdp.processors.inference.asr.nemo.asr_inference import ASRInference
 from sdp.processors.inference.asr.transformers.speech_recognition import ASRTransformers
@@ -159,6 +159,8 @@
 )
 from sdp.processors.datasets.yodas2.create_initial_manifest import(
     ListYodas2Data,
-    DownloadYodas2Data,
+    SnapshotDownloadYodas2Data,
+    HfHubDownloadYodas2Data,
     CreateInitialManifestYodas2,
-)
+)
+from sdp.processors.datasets.yodas2.granary import GetGranarysYodas2
diff --git a/sdp/processors/datasets/yodas2/create_initial_manifest.py b/sdp/processors/datasets/yodas2/create_initial_manifest.py
@@ -20,7 +20,7 @@
 import importlib.util
 
 from sdp.processors import ListToEntries
-from sdp.processors.huggingface.huggingface_hub import ListRepoFiles, SnapshotDownload
+from sdp.processors.huggingface.huggingface_hub import ListRepoFiles, SnapshotDownload, HfHubDownload
 from sdp.logging import logger
 
 
@@ -137,7 +137,7 @@ def process(self):
         logger.info("Metadata successfully saved!")
         
 
-class DownloadYodas2Data(SnapshotDownload):
+class SnapshotDownloadYodas2Data(SnapshotDownload):
     """
     A specialized processor for downloading the YODAS2 dataset from Hugging Face
     and updating the input manifest with local file paths to the downloaded files.
@@ -194,7 +194,12 @@ class DownloadYodas2Data(SnapshotDownload):
 
     def __init__(self, **kwargs):
         # Hardcoded to download the espnet/yodas2 dataset from Hugging Face
-        super().__init__(repo_id="espnet/yodas2", repo_type="dataset", **kwargs)
+        # Work on a copy so a dict supplied by the caller is never mutated;
+        # a missing or None value is treated as "no extra arguments".
+        snapshot_download_args = dict(kwargs.pop('snapshot_download_args', None) or {})
+        snapshot_download_args['repo_id'] = 'espnet/yodas2'
+        snapshot_download_args['repo_type'] = 'dataset'
+
+        super().__init__(snapshot_download_args=snapshot_download_args, **kwargs)
 
     def write_output_manifest_file(self):
         """
@@ -271,6 +276,18 @@ def process(self):
         self.write_output_manifest_file()
 
 
+class HfHubDownloadYodas2Data(HfHubDownload):
+    """
+    A specialized :class:`HfHubDownload` that always targets the ``espnet/yodas2``
+    dataset repository on Hugging Face.
+
+    Args:
+        filename_field (str): Manifest key whose value is passed to
+            ``hf_hub_download`` as ``filename``. Defaults to ``'audio_key'``.
+        output_filepath_field (str): Manifest key under which the path returned by
+            ``hf_hub_download`` is stored. Defaults to ``'local_audio'``.
+        **kwargs: Forwarded to :class:`HfHubDownload`. If ``hf_hub_download_args``
+            is given, its ``repo_id`` and ``repo_type`` entries are overridden.
+    """
+
+    def __init__(self, filename_field: str = 'audio_key', output_filepath_field: str = 'local_audio', **kwargs):
+        # Work on a copy so a dict supplied by the caller is never mutated;
+        # a missing or None value is treated as "no extra arguments".
+        hf_hub_download_args = dict(kwargs.pop('hf_hub_download_args', None) or {})
+        hf_hub_download_args['repo_id'] = 'espnet/yodas2'
+        hf_hub_download_args['repo_type'] = 'dataset'
+
+        super().__init__(
+            filename_field=filename_field,
+            output_filepath_field=output_filepath_field,
+            hf_hub_download_args=hf_hub_download_args,
+            **kwargs,
+        )
+
 class CreateInitialManifestYodas2(ListToEntries):
     """
     A dataset processor specialized for the YODAS2 dataset.
diff --git a/sdp/processors/datasets/yodas2/granary.py b/sdp/processors/datasets/yodas2/granary.py
@@ -0,0 +1,62 @@
+import os
+import json
+from glob import glob
+from tqdm import tqdm
+import tempfile
+
+from sdp.processors.huggingface.huggingface_hub import SnapshotDownload
+from sdp.logging import logger
+
+class GetGranarysYodas2(SnapshotDownload):
+    """
+    Download the Granary manifests for one YODAS2 language from the
+    ``YODASEnj/YDS`` Hugging Face dataset repo and merge them into a single
+    output manifest.
+
+    The source manifests are downloaded into a temporary directory that is
+    removed once processing finishes; only the converted manifest is kept.
+
+    Args:
+        lang (str): Language code; must be one of ``AVAILABLE_LANGS``.
+        translation (bool): If True, read the manifests under
+            ``Translation/{lang}_/`` and emit the ``translation_en`` field as
+            ``answer`` with ``target_lang`` set to ``"en"``. Forced to False
+            for ``lang == "en"``.
+        **kwargs: Forwarded to :class:`SnapshotDownload`. If
+            ``snapshot_download_args`` is given, its ``repo_id`` and
+            ``repo_type`` entries are overridden.
+
+    Raises:
+        ValueError: If ``lang`` is not in ``AVAILABLE_LANGS``.
+    """
+    AVAILABLE_LANGS = ["bg", "cs", "da", "de", "el",
+                       "en", "es", "et", "fi", "fr",
+                       "hr", "hu", "it", "lt", "lv",
+                       "nl", "pl", "pt", "ro", "ru",
+                       "sk", "sv", "uk"]
+
+    def __init__(self, lang: str, translation: bool = False, **kwargs):
+        if lang not in self.AVAILABLE_LANGS:
+            raise ValueError(
+                f"Unsupported language `{lang}`. "
+                f"Available languages: {', '.join(self.AVAILABLE_LANGS)}."
+            )
+
+        # SnapshotDownload takes the Hub arguments as a single dict, so the
+        # repository is injected there. A copy is used so that a dict supplied
+        # by the caller is never mutated.
+        snapshot_download_args = dict(kwargs.pop('snapshot_download_args', None) or {})
+        snapshot_download_args['repo_id'] = "YODASEnj/YDS"
+        snapshot_download_args['repo_type'] = "dataset"
+        super().__init__(snapshot_download_args=snapshot_download_args, **kwargs)
+
+        self.lang = lang
+        self.translation = translation
+        if self.lang == "en" and self.translation:
+            logger.warning('There are no translations for `en` language.')
+            self.translation = False
+
+    def process(self):
+        """
+        Download the matching source manifests and write the converted entries
+        to ``self.output_manifest_file`` (one JSON object per line).
+        """
+        from huggingface_hub import snapshot_download
+
+        pattern = f"{self.lang}/{self.lang}*.json"
+        if self.translation:
+            pattern = f"Translation/{self.lang}_/{self.lang}*.jsonl"
+
+        # dirname is '' for a bare filename, and os.makedirs('') raises.
+        output_dir = os.path.dirname(self.output_manifest_file)
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+
+        with open(self.output_manifest_file, 'w', encoding='utf8') as fout:
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                # Per-call copy: the temporary directory must not leak into
+                # the arguments stored on the instance.
+                download_args = dict(self.snapshot_download_args)
+                download_args['allow_patterns'] = pattern
+                download_args['local_dir'] = tmp_dir
+                snapshot_download(**download_args)
+
+                for manifest_filepath in sorted(glob(f"{tmp_dir}/{pattern}")):
+                    with open(manifest_filepath, 'r', encoding='utf8') as fin:
+                        for line in tqdm(fin, desc=f'Processing {os.path.basename(manifest_filepath)}'):
+                            sample = json.loads(line)
+                            new_sample = dict(source_lang=self.lang,
+                                              target_lang=self.lang,
+                                              yodas_id=sample['wav_id'],
+                                              offset=sample['start_time'],
+                                              duration=sample['duration'],
+                                              text=sample['text'],
+                                              answer=sample['text'],
+                                              decodercontext="",
+                                              emotion="<|emo:undefined|>",
+                                              pnc="pnc",
+                                              itn="itn",
+                                              timestamp="notimestamp",
+                                              diarize="nodiarize")
+
+                            if self.translation:
+                                new_sample['target_lang'] = "en"
+                                new_sample['answer'] = sample['translation_en']
+
+                            fout.write(json.dumps(new_sample) + '\n')
diff --git a/sdp/processors/huggingface/huggingface_hub.py b/sdp/processors/huggingface/huggingface_hub.py
@@ -13,9 +13,16 @@
 # limitations under the License.
 
 import json
+import os
+from typing import Dict
 
-from sdp.processors.base_processor import BaseProcessor
+from tqdm.contrib.concurrent import process_map
 
+from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor
+
+def _hf_hub_download(kwargs):
+    """
+    Call ``huggingface_hub.hf_hub_download`` with the given keyword arguments.
+
+    Takes the arguments as a single dict so that one list item maps to one call
+    when this function is handed to ``process_map``. The Hub client is imported
+    lazily, inside the function.
+
+    Args:
+        kwargs (dict): Keyword arguments for ``hf_hub_download``.
+
+    Returns:
+        Whatever ``hf_hub_download`` returns (the local path of the file).
+    """
+    from huggingface_hub import hf_hub_download as download_file
+
+    local_path = download_file(**kwargs)
+    return local_path
 
 class ListRepoFiles(BaseProcessor):
     """
@@ -93,34 +100,70 @@ class SnapshotDownload(BaseProcessor):
 
     def __init__(
         self,
-        output_manifest_file: str,
-        input_manifest_file: str = None,
-        **snapshot_download_kwargs,
+        output_filepath_field: str = "downloaded",
+        snapshot_download_args: dict = None,
+        **kwargs,
     ):
-        super().__init__(
-            output_manifest_file=output_manifest_file,
-            input_manifest_file=input_manifest_file,
-        )
-        self.snapshot_download_kwargs = snapshot_download_kwargs
+        """
+        Args:
+            output_filepath_field (str): Key under which the path of the downloaded
+                snapshot is written to the output manifest. Defaults to "downloaded".
+            snapshot_download_args (dict): Keyword arguments passed to
+                ``huggingface_hub.snapshot_download``. ``None`` means no arguments.
+            **kwargs: Forwarded to the base processor.
+        """
+        super().__init__(**kwargs)
+        self.output_filepath_field = output_filepath_field
+        # Copy instead of storing the argument: a `{}` default would be shared
+        # by every instance, and the caller's dict must not be aliased.
+        self.snapshot_download_args = dict(snapshot_download_args or {})
 
-    def download(self):
+    def process(self):
         """
-        Download the repository snapshot to a local folder.
+        Main processing entrypoint: download repo and write path to manifest.
+
+        Calls ``huggingface_hub.snapshot_download`` with ``self.snapshot_download_args``,
+        stores the returned path in ``self.local_dir`` and writes one JSON line
+        ``{output_filepath_field: local_dir}`` to ``self.output_manifest_file``.
         """
         from huggingface_hub import snapshot_download
 
-        self.local_dir = snapshot_download(**self.snapshot_download_kwargs)
-
-    def write_output_manifest_file(self):
-        """
-        Write the path of the downloaded snapshot folder to the output manifest.
-        """
+        self.local_dir = snapshot_download(**self.snapshot_download_args)
+
+        # dirname is '' for a bare filename, and os.makedirs('') raises.
+        output_dir = os.path.dirname(self.output_manifest_file)
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+
         with open(self.output_manifest_file, 'w', encoding='utf8') as fout:
-            fout.writelines(json.dumps({"destination_dir": self.local_dir}))
+            # write() rather than writelines(): the payload is a single string.
+            # The line is newline-terminated like the other manifest writers.
+            fout.write(json.dumps({self.output_filepath_field: self.local_dir}) + '\n')
+
+
+class HfHubDownload(BaseParallelProcessor):
+    """
+    Download one file per manifest entry from the Hugging Face Hub.
+
+    For every entry, ``hf_hub_download`` is called with ``hf_hub_download_args``
+    plus ``filename=entry[filename_field]``; the returned local path is stored
+    in the entry under ``output_filepath_field``.
+
+    Args:
+        filename_field (str): Manifest key holding the ``filename`` argument
+            for ``hf_hub_download``.
+        output_filepath_field (str): Manifest key under which the returned
+            local path is stored. Defaults to "downloaded".
+        hf_hub_download_args (Dict): Keyword arguments shared by every
+            ``hf_hub_download`` call (e.g. ``repo_id``, ``repo_type``).
+            ``None`` means no shared arguments.
+        **kwargs: Forwarded to :class:`BaseParallelProcessor`.
+    """
+
+    def __init__(
+        self,
+        filename_field: str,
+        output_filepath_field: str = "downloaded",
+        hf_hub_download_args: Dict = None,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.filename_field = filename_field
+        self.output_filepath_field = output_filepath_field
+        # Copy instead of storing the argument: a `{}` default would be shared
+        # by every instance, and the caller's dict must not be aliased.
+        self.hf_hub_download_args = dict(hf_hub_download_args or {})
 
     def process(self):
-        """
-        Main processing entrypoint: download repo and write path to manifest.
-        """
-        self.download()
-        self.write_output_manifest_file()
+        """
+        Download the file referenced by each manifest entry and write the
+        entries, extended with the local file path, to the output manifest.
+        """
+        self.prepare()
+
+        # dirname is '' for a bare filename, and os.makedirs('') raises.
+        output_dir = os.path.dirname(self.output_manifest_file)
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+
+        with open(self.output_manifest_file, "wt", encoding="utf8") as fout:
+            for manifest_chunk in self._chunk_manifest():
+                # Build the list of download tasks: shared arguments plus the
+                # per-entry filename.
+                download_tasks = [
+                    {
+                        **self.hf_hub_download_args,
+                        "filename": entry[self.filename_field]
+                    }
+                    for entry in manifest_chunk
+                ]
+
+                # Parallel download honouring max_workers and chunksize.
+                results = process_map(
+                    _hf_hub_download,
+                    download_tasks,
+                    max_workers=self.max_workers,
+                    chunksize=self.chunksize,
+                )
+
+                # Match the results back to their input entries (same order).
+                for entry, local_path in zip(manifest_chunk, results):
+                    entry[self.output_filepath_field] = local_path
+                    json.dump(entry, fout, ensure_ascii=False)
+                    fout.write("\n")
+                    self.number_of_entries += 1
+
+        self.finalize(self.test_cases)
+
+    def process_dataset_entry(self, data_entry):
+        """Not used: ``process`` is overridden and handles the entries itself."""
+        pass