Commit 9afa326

docs + hf
Signed-off-by: Alexan <[email protected]>
1 parent: b3362a4 · commit: 9afa326

File tree

7 files changed: +128 −73 lines changed

dataset_configs/armenian/toloka/pipeline_get_final_res.yaml

Lines changed: 1 addition & 0 deletions

@@ -6,6 +6,7 @@ documentation: |
 It processes all accepted results from the Toloka pool and prepares the data for training by refining and resampling audio files and ensuring text formatting consistency.

 **Stage Overview**:
+
 This stage includes the following steps:
 1. Downloading all the ACCEPTED results from the Toloka platform.
 2. Filtering out damaged audio files.

dataset_configs/armenian/toloka/pipeline_start.yaml

Lines changed: 1 addition & 0 deletions

@@ -6,6 +6,7 @@ documentation: |
 It sets up the foundation for creating structured tasks by initializing a new Toloka project, preparing pools, and processing textual data to generate a clean and organized corpus.

 **Stage Overview**:
+
 This stage focuses on preparing and refining the dataset through the following steps:
 1. Creating a new Toloka project.
 2. Creating a new pool for the project.

dataset_configs/armenian/toloka/pipeline_validate_answers.yaml

Lines changed: 1 addition & 0 deletions

@@ -7,6 +7,7 @@ documentation: |

 **Stage Overview**:
 This stage includes the following steps:
+
 1. Downloading results of completed tasks from Toloka.
 2. Validating the audio files and filtering out corrupted files.
 3. Transcribing Armenian audio to text using a HuggingFace model.

sdp/processors/datasets/coraa/create_initial_manifest.py

Lines changed: 5 additions & 2 deletions

@@ -2,7 +2,6 @@
 import os
 from pathlib import Path
 from typing import List
-from huggingface_hub import snapshot_download
 import pandas as pd

 import rarfile #Needs to be installed
@@ -64,7 +63,11 @@ def prepare(self):
         os.makedirs(self.resampled_audio_dir, exist_ok=True)
         os.makedirs(self.extract_archive_dir, exist_ok=True)
         if not self.already_downloaded:
-            snapshot_download(repo_id="gabrielrstan/CORAA-v1.1", repo_type='dataset', local_dir=self.raw_data_dir)
+            try:
+                from huggingface_hub import snapshot_download
+                snapshot_download(repo_id="gabrielrstan/CORAA-v1.1", repo_type='dataset', local_dir=self.raw_data_dir)
+            except ImportError:
+                raise ImportError("huggingface_hub is required to download the dataset. Please install it with pip install huggingface_hub")
         if not self.already_extracted:

         if self.data_split == 'train':
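
The hunk above defers the huggingface_hub import so the module stays importable when the optional dependency is absent. A minimal standalone sketch of the same pattern (download_coraa is a hypothetical helper for illustration, not part of the commit):

    # Optional-dependency pattern: import at call time, fail with a clear message.
    def download_coraa(raw_data_dir: str) -> None:
        try:
            from huggingface_hub import snapshot_download
        except ImportError:
            raise ImportError(
                "huggingface_hub is required to download the dataset. "
                "Please install it with pip install huggingface_hub"
            )
        # Mirror of the call the commit wraps: fetch the CORAA v1.1 dataset snapshot.
        snapshot_download(repo_id="gabrielrstan/CORAA-v1.1", repo_type='dataset', local_dir=raw_data_dir)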

sdp/processors/huggingface/speech_recognition.py

Lines changed: 19 additions & 13 deletions

@@ -23,21 +23,27 @@
 from typing import Optional

 class ASRTransformers(BaseProcessor):
-    """
-    Processor to transcribe using ASR Transformers model from HuggingFace.
+    """This processor transcribes audio files using HuggingFace ASR Transformer models.
+
+    It processes audio files from the manifest and adds transcriptions using the specified
+    pre-trained model from HuggingFace.

     Args:
-        pretrained_model (str): name of pretrained model on HuggingFace.
-        output_text_key (str): Key to save transcription result.
-        input_audio_key (str): Key to read audio file. Defaults to "audio_filepath".
-        input_duration_key (str): Audio duration key. Defaults to "duration".
-        device (str): Inference device.
-        batch_size (int): Inference batch size. Defaults to 1.
-        chunk_length_s (int): Length of the chunks (in seconds) into which the input audio should be divided.
-            Note: Some models perform the chunking on their own (for instance, Whisper chunks into 30s segments also by maintaining the context of the previous chunks).
-        torch_dtype (str): Tensor data type. Default to "float32"
-        max_new_tokens (Optional[int]): The maximum number of new tokens to generate.
-            If not specified, there is no hard limit on the number of tokens generated, other than model-specific constraints.
+        pretrained_model (str): Name of pretrained model on HuggingFace.
+        output_text_key (str): Key to save transcription result in the manifest.
+        input_audio_key (str): Key to read audio file paths from the manifest. Default: "audio_filepath".
+        input_duration_key (str): Key for audio duration in the manifest. Default: "duration".
+        device (str, optional): Inference device (e.g., "cuda", "cpu"). Default: None.
+        batch_size (int): Inference batch size. Default: 1.
+        chunk_length_s (int): Length of audio chunks in seconds. Default: 0.
+        torch_dtype (str): Tensor data type for model inference. Default: "float32".
+        generate_task (str): Task type for generation. Default: "transcribe".
+        generate_language (str): Language for generation. Default: "english".
+        max_new_tokens (int, optional): Maximum number of new tokens to generate. Default: None.
+
+    Returns:
+        A manifest with transcribed text added to each entry under the specified output_text_key.
+
     """

     def __init__(
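
For orientation, a hedged sketch of how these documented arguments typically map onto a HuggingFace ASR pipeline; the model name and audio path are placeholders, and this is not the processor's actual implementation:

    import torch
    from transformers import pipeline

    # pretrained_model, device, torch_dtype, chunk_length_s and batch_size
    # correspond to the Args documented above.
    asr = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-small",
        device="cuda:0",
        torch_dtype=torch.float16,
        chunk_length_s=30,
        batch_size=4,
    )
    # generate_task, generate_language and max_new_tokens are passed at generation time.
    result = asr(
        "sample.wav",
        generate_kwargs={"task": "transcribe", "language": "english", "max_new_tokens": 128},
    )
    print(result["text"])  # stored under output_text_key in the manifest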

sdp/processors/modify_manifest/data_to_data.py

Lines changed: 75 additions & 27 deletions

@@ -739,8 +739,30 @@ def process_dataset_entry(self, data_entry):
         return [DataEntry(data=data_entry)]


-
 class CopyManifestData(BaseParallelProcessor):
+    """This processor copies files specified in the manifest to a new location.
+
+    It is useful for creating a consolidated dataset by gathering files from different sources
+    into a single directory.
+
+    Args:
+        copy_path (str): The destination directory where files will be copied.
+        source_filepath (str): The key in the manifest that contains the path to
+            the file to be copied. Default: "audio_path".
+
+    Returns:
+        The same data as in the input manifest, but the files referenced in the manifest
+        will have been copied to the specified destination directory.
+
+    Example:
+        .. code-block:: yaml
+
+            - _target_: sdp.processors.modify_manifest.data_to_data.CopyManifestData
+              input_manifest_file: ${workspace_dir}/dataset.json
+              output_manifest_file: ${workspace_dir}/dataset_copied.json
+              copy_path: ${workspace_dir}/consolidated_data
+              source_filepath: "audio_filepath"
+    """
     def __init__(
         self,
         copy_path: str,
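
As an aside, an illustrative sketch of the copy step the new docstring describes (copy_entry_file is a hypothetical helper, not the processor's code):

    import os
    import shutil

    def copy_entry_file(entry: dict, copy_path: str, source_filepath: str = "audio_path") -> dict:
        # Gather the referenced file into the consolidated destination directory.
        os.makedirs(copy_path, exist_ok=True)
        src = entry[source_filepath]
        shutil.copy(src, os.path.join(copy_path, os.path.basename(src)))
        # Per the docstring, the manifest data itself is returned unchanged.
        return entry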
@@ -906,6 +928,18 @@ def process_dataset_entry(self, data_entry) -> List:


 class GetWER(BaseParallelProcessor):
+    """This processor calculates Word Error Rate (WER) between predicted text and ground truth text.
+
+    It computes the WER for each entry in the manifest and adds the result as a new field.
+
+    Args:
+        text_key (str): Key for the ground truth text field in the manifest. Default: "text".
+        pred_text_key (str): Key for the predicted text field in the manifest. Default: "pred_text".
+
+    Returns:
+        The same data as in the input manifest with an additional 'wer' field containing
+        the calculated Word Error Rate between the specified text fields.
+    """
     def __init__(
         self,
         text_key: str = "text",
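
For reference, an illustrative word-level WER computation matching the definition the docstring implies (edit distance over words divided by reference length, as a percentage); a sketch, not necessarily the processor's exact implementation:

    def word_error_rate(text: str, pred_text: str) -> float:
        ref, hyp = text.split(), pred_text.split()
        # Dynamic-programming edit distance over word sequences.
        d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
        for i in range(len(ref) + 1):
            d[i][0] = i
        for j in range(len(hyp) + 1):
            d[0][j] = j
        for i in range(1, len(ref) + 1):
            for j in range(1, len(hyp) + 1):
                cost = 0 if ref[i - 1] == hyp[j - 1] else 1
                d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost)
        return 100.0 * d[len(ref)][len(hyp)] / max(len(ref), 1)

    entry = {"text": "hello world", "pred_text": "hello word"}
    entry["wer"] = word_error_rate(entry["text"], entry["pred_text"])  # 50.0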
@@ -922,11 +956,33 @@ def process_dataset_entry(self, data_entry) -> List:


 class MakeSentence(BaseParallelProcessor):
-    """
-    Processes a text string by capitalizing its first character (if enabled) and appending
-    an end_symbol if the text does not already end with punctuation.
-    """
+    """This processor formats text strings into proper sentences.
+
+    It capitalizes the first character of the text (if enabled) and appends
+    an end symbol if the text does not already end with punctuation.
+
+    Args:
+        text_key (str): The key in the manifest containing the text to be processed.
+            Default: "text".
+        end_symbol (str): The punctuation symbol to add at the end of the text if it
+            doesn't already have one. Default: ":".
+        make_uppercase (bool): Whether to capitalize the first character of the text.
+            Default: True.

+    Returns:
+        The same data as in the input manifest with the text field modified to have
+        proper sentence formatting.
+
+    Example:
+        .. code-block:: yaml
+
+            - _target_: sdp.processors.modify_manifest.data_to_data.MakeSentence
+              input_manifest_file: ${workspace_dir}/dataset.json
+              output_manifest_file: ${workspace_dir}/dataset_formatted.json
+              text_key: "transcript"
+              end_symbol: "."
+              make_uppercase: true
+    """
     def __init__(
         self,
         text_key: str = "text",
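
An illustrative sketch of the formatting rule described above (a standalone function, not the processor's exact code):

    import string

    def make_sentence(text: str, end_symbol: str = ":", make_uppercase: bool = True) -> str:
        if make_uppercase and text:
            text = text[0].upper() + text[1:]   # capitalize the first character
        if text and text[-1] not in string.punctuation:
            text = text + end_symbol            # append end symbol if none present
        return text

    assert make_sentence("hello world", end_symbol=".") == "Hello world."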
@@ -949,30 +1005,22 @@ def process_dataset_entry(self, data_entry) -> List:
         return [DataEntry(data=data_entry)]


-
 class ASRFileCheck(BaseProcessor):
-    """
-    ASRFileCheck is a class for validating audio files listed in a manifest file.
-    This class checks if each audio file can be successfully loaded with the `torchaudio` library, marking
-    and moving corrupted files to a specified directory.
+    """This processor validates audio files in the manifest and identifies corrupted files.
+
+    It attempts to load each audio file using the torchaudio library and moves corrupted
+    files to a specified directory.
+
+    Args:
+        audio_filepath_key (str): The key in the manifest that contains the path to
+            the audio file. Default: "audio_filepath".
+        corrupted_audio_dir (str): The directory where corrupted audio files will be moved.
+        workspace_dir (str, optional): The base directory for resolving relative paths.
+            Default: None.
+
+    Returns:
+        A manifest with corrupted audio files removed.

-    Attributes:
-    ----------
-    audio_filepath_key : str, optional
-        The key in the manifest entries used to retrieve the path to the audio file. Defaults to 'audio_filepath'.
-    corrupted_audio_dir : str
-        The directory where corrupted audio files will be moved. This is a required parameter.
-    workspace_dir : str, optional
-        The base directory where audio files are stored. If provided, audio file paths will be resolved
-        relative to this directory. Defaults to None.
-    failed_files : list
-        A list of file paths for audio files that failed to load.
-
-    Methods:
-    -------
-    process()
-        Checks each file listed in the manifest to ensure it can be loaded with torchaudio.
-        Moves corrupted files and outputs a new manifest with only valid entries.
     """
     def __init__(self, audio_filepath_key: str = "audio_filepath", corrupted_audio_dir: str = None, workspace_dir: str = None, **kwargs):
         """

sdp/processors/toloka/accept_if.py

Lines changed: 26 additions & 31 deletions

@@ -32,37 +32,32 @@


 class AcceptIfWERLess(BaseParallelProcessor):
-    """
-    AcceptIfWERLess is a class for accepting Toloka assignments if the Word Error Rate (WER) is below a specified threshold.
-    This class uses Toloka's API to evaluate the WER of assignments and accept them if they meet the criteria.
-
-    Attributes:
-    ----------
-    input_data_file : str
-        The path to the input data file containing API configurations.
-    input_pool_file : str
-        The path to the input pool file containing pool configurations.
-    threshold : float, optional
-        The WER threshold below which assignments are accepted. Defaults to 75.
-    config_file : str, optional
-        The path to the configuration file. Defaults to None.
-    API_KEY : str, optional
-        The API key used to authenticate with Toloka's API. Defaults to None, in which case it tries to
-        load the key from environment variables or config file.
-    platform : str, optional
-        Specifies the Toloka environment (e.g., 'PRODUCTION', 'SANDBOX'). Defaults to None, meaning it will
-        try to load from environment variables or the config file.
-    pool_id : str, optional
-        The ID of the pool from which assignments will be retrieved. Defaults to None.
-
-    Methods:
-    -------
-    load_config()
-        Loads configuration data from a config file to populate API_KEY, platform, and pool_id attributes.
-    prepare()
-        Prepares the class by loading API configuration, pool configuration, and initializing Toloka client.
-    process()
-        Accepts Toloka assignments if their Word Error Rate (WER) is below the specified threshold.
+    """This processor accepts Toloka assignments if the Word Error Rate (WER) is below a threshold.
+
+    It evaluates the WER between ground truth and predicted text for each assignment
+    and accepts those that meet the specified threshold criteria.
+
+    Args:
+        input_data_file (str): Path to the input data file containing API configurations.
+        input_pool_file (str): Path to the input pool file containing pool configurations.
+        threshold (float): The WER threshold below which assignments are accepted. Default: 75.
+        config_file (str, optional): Path to the configuration file. Default: None.
+        API_KEY (str, optional): The API key for authenticating with Toloka's API. Default: None.
+        platform (str, optional): The Toloka platform to use. Default: None.
+        pool_id (str, optional): The ID of the Toloka pool. Default: None.
+
+    Returns:
+        A manifest with accepted assignments from Toloka based on the WER threshold.
+
+    Example:
+        .. code-block:: yaml
+
+            - _target_: sdp.processors.toloka.accept_if.AcceptIfWERLess
+              input_manifest_file: ${workspace_dir}/result_manifest_pred_clean.json
+              output_manifest_file: ${workspace_dir}/result_manifest_pred_review.json
+              input_data_file: ${workspace_dir}/data_file.json
+              input_pool_file: ${workspace_dir}/taskpool.json
+              threshold: 50
     """
     def __init__(
         self,
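
Finally, a hedged sketch of the acceptance decision itself; the toloka-kit call shown follows that library's documented client API, while the manifest fields (wer, assignment_id) are assumptions for illustration, not confirmed by this commit:

    import toloka.client as toloka

    def review_assignments(entries: list, client: toloka.TolokaClient, threshold: float = 75.0) -> None:
        for entry in entries:
            # Accept the crowd assignment only when its WER beats the threshold.
            if entry["wer"] < threshold:
                client.accept_assignment(entry["assignment_id"], "Well done!")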
