diff --git a/dataset_configs/english/coraal/config.yaml b/dataset_configs/english/coraal/config.yaml
index d5b570a2..7b3fb240 100644
--- a/dataset_configs/english/coraal/config.yaml
+++ b/dataset_configs/english/coraal/config.yaml
@@ -93,8 +93,8 @@ processors:
       - {"pattern": '\baksing\b', "repl": "asking"}
       - {"pattern": '\baksed\b', "repl": "asked"}
       # removing unintelligible/redacted flags
-      - {"pattern": '/(?i)unintelligible/', "repl": ""}
-      - {"pattern": '/(?i)inaudible/', "repl": ""}
+      - {"pattern": '(?i)unintelligible/', "repl": ""}
+      - {"pattern": '(?i)inaudible/', "repl": ""}
       - {"pattern": '/RD(.*?)/', "repl": ""}
       - {"pattern": '/(\?)\1*/', "repl": ""}
       # removing non-linguistic markers
diff --git a/dataset_configs/multilingual/babel/config.yaml b/dataset_configs/multilingual/babel/config.yaml
new file mode 100644
index 00000000..ade44a4e
--- /dev/null
+++ b/dataset_configs/multilingual/babel/config.yaml
@@ -0,0 +1,53 @@
+documentation: |
+  IARPA Babel Dataset
+  ###################
+
+  This config is designed for the languages of the IARPA Babel Dataset available at https://catalog.ldc.upenn.edu.
+
+  It creates initial manifest for the specified data type and data split. 
+  Further data processing steps should be performed based on the specific language.
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify path of the directory downloaded from LDC.
+  * **data_type**: should be "conversational" or "scripted".
+  * **resampled_audio_dir**: specify the directory path, where new processed audios should be located.
+  * **data_split**: should be "training", "untranscribed-training", "sub-train", "dev" or "eval".
+  * **final_manifest**: specify output manifest filepath.
+
+  **Output format**.
+
+  This config dumps the final manifest at ``${final_manifest}``.
+  The output manifest contains the following fields:
+
+  * **outputFn (str)**: initial audio filename.
+  * **sessID (str)**: session ID of the recording.
+  * **date (str)**: date of the recording.
+  * **time (str)**: time of the recording.
+  * **spkrCode (str)**: speaker ID.
+  * **lineType (str)**: type of the line (inline or outline).
+  * **dialect (str)**: dialect of the speaker.
+  * **gen (str)**: gender of the speaker.
+  * **envType (str)**: environment (i.e., home, office, etc.).
+  * **age (str)**: age of the speaker.
+  * **network (str)**: name of the telecommunications network.
+  * **phoneModel (str)**: model of the phone.
+  * **sampleCount (str)**: count of the sample.
+  * **sampleRate (str)**: original sample rate of the recording.
+  * **audio_filepath (str)**: path to the processed audio file.
+  * **duration (float)**: duration of the audio in seconds.
+
+processors_to_run: all
+workspace_dir: ???
+data_type: scripted
+resampled_audio_dir: ${workspace_dir}/processed/${data_type}/${data_split}
+data_split: training
+final_manifest: ${workspace_dir}/processed/${data_type}/${data_split}_manifest.json
+
+processors:
+  - _target_: sdp.processors.CreateInitialManifestBabel
+    raw_data_dir: ${workspace_dir}
+    data_type: ${data_type}
+    data_split: ${data_split}
+    resampled_audio_dir: ${resampled_audio_dir}
+    output_manifest_file: ${final_manifest}
\ No newline at end of file
diff --git a/dataset_configs/multilingual/voxpopuli/config_un.yaml b/dataset_configs/multilingual/voxpopuli/config_un.yaml
new file mode 100644
index 00000000..4480ad6e
--- /dev/null
+++ b/dataset_configs/multilingual/voxpopuli/config_un.yaml
@@ -0,0 +1,39 @@
+documentation: |
+  Voxpopuli unlabelled subset
+  ###########################
+
+  This config can be used to prepare the
+  `Voxpopuli dataset unlabelled subset <https://github.com/facebookresearch/voxpopuli/>`_
+  in the NeMo format.
+
+  It creates initial manifest for the specified language. 
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify the directory where the downloaded data will be/is saved.
+  * **language_id**: specify the language of the data you wish to be downloaded and/or processed.
+  * **resampled_data_dir**: specify the directory path, where new processed audios should be located.
+  * **delete_raw_file**: specify if the initial raw audio files should be deleted or not.
+
+
+  **Output format**.
+
+  This config dumps the final manifest at ``${workspace_dir}/unlabelled_processed/${language_id}/manifest.json``.
+  The output manifest contains the following fields:
+
+  * **audio_filepath (str)**: path to the processed audio file.
+  * **duration (float)**: duration of the audio in seconds.
+
+processors_to_run: all
+workspace_dir: ???
+language_id: hu_v2
+resampled_data_dir: ${workspace_dir}/unlabelled_processed/
+final_manifest: ${workspace_dir}/unlabelled_processed/${language_id}/manifest.json
+
+processors:
+  - _target_: sdp.processors.CreateInitialManifestVoxpopuliUnlabelled
+    raw_data_dir: ${workspace_dir}
+    language_id: ${language_id}
+    resampled_data_dir: ${resampled_data_dir}
+    delete_raw_file: False
+    output_manifest_file: ${final_manifest}
diff --git a/dataset_configs/multilingual/yodas/config.yaml b/dataset_configs/multilingual/yodas/config.yaml
new file mode 100644
index 00000000..16fffefc
--- /dev/null
+++ b/dataset_configs/multilingual/yodas/config.yaml
@@ -0,0 +1,35 @@
+processors_to_run: all
+manifest: ???
+resampled_audio_dir: ???
+out_manifest: ???
+char_rate: 10
+min_duration: 1.5
+max_duration: 40.1
+
+processors:
+  - _target_: sdp.processors.GetAudioDuration
+    audio_filepath_key: audio_filepath
+    duration_key: duration
+    input_manifest_file: ${manifest}
+
+  - _target_: sdp.processors.RandomSegment
+    min_duration: ${min_duration}
+    max_duration: ${max_duration}
+    resampled_audio_dir: ${resampled_audio_dir}
+    audio_format: flac
+
+  - _target_: sdp.processors.ASRInference
+    pretrained_model: nvidia/parakeet-ctc-0.6b
+
+  - _target_: sdp.processors.DropHighLowCharrate
+    low_charrate_threshold: ${char_rate}
+    text_key: pred_text
+    high_charrate_threshold: 10000
+
+  - _target_: sdp.processors.KeepOnlySpecifiedFields
+    fields_to_keep:
+      - audio_filepath
+      - duration
+
+  - _target_: sdp.processors.DropCorrupted
+    output_manifest_file: ${out_manifest}
diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst
index 6d85e83d..496b6154 100644
--- a/docs/src/sdp/api.rst
+++ b/docs/src/sdp/api.rst
@@ -40,6 +40,9 @@ VoxPopuli
 .. autodata:: sdp.processors.CreateInitialManifestVoxpopuli
    :annotation:
 
+.. autodata:: sdp.processors.CreateInitialManifestVoxpopuliUnlabelled
+   :annotation:
+
 .. autodata:: sdp.processors.NormalizeFromNonPCTextVoxpopuli
    :annotation:
 
@@ -58,8 +61,13 @@ Librispeech
 
 .. autodata:: sdp.processors.CreateInitialManifestLibrispeech
    :annotation:
-   
 
+Babel
+'''''
+
+.. autodata:: sdp.processors.CreateInitialManifestBabel
+   :annotation:
+   
 SLR83
 '''''
 
@@ -158,6 +166,18 @@ Data modifications
 .. autodata:: sdp.processors.InverseNormalizeText
    :annotation:
 
+.. autodata:: sdp.processors.RandomSegment
+   :annotation:
+
+.. autodata:: sdp.processors.UntarAudios
+   :annotation:
+
+.. autodata:: sdp.processors.ExtractFilesFromTar
+   :annotation:
+
+.. autodata:: sdp.processors.RemoveEmojis
+   :annotation:
+
 Data filtering
 ''''''''''''''
 
@@ -237,6 +257,9 @@ Data filtering
 .. autodata:: sdp.processors.DropRepeatedFields
    :annotation:
 
+.. autodata:: sdp.processors.DropCorrupted
+   :annotation:
+
 
 Miscellaneous
 #############
diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst
index 32c52ec1..271a938c 100644
--- a/docs/src/sdp/existing_configs.rst
+++ b/docs/src/sdp/existing_configs.rst
@@ -92,12 +92,16 @@ VoxPopuli
 * **Spanish**:
   `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/spanish_pc/voxpopuli/config.yaml>`__ |
   :doc:`documentation <config-docs/spanish_pc/voxpopuli/config>`
+* **Multilingual**:
+  `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/multilingual/voxpopuli/config_un.yaml>`__ |
+  :doc:`documentation <config-docs/multilingual/voxpopuli/config_un>`
 
 .. toctree::
    :hidden:
 
    config-docs/italian/voxpopuli/config
    config-docs/spanish_pc/voxpopuli/config
+   config-docs/multilingual/voxpopuli/config_un
 
 Fisher
 ~~~~~~
@@ -237,6 +241,22 @@ MTEDx
 
    config-docs/portuguese/mtedx/config
 
+Babel
+~~~~~
+
+**Dataset link:** https://www.ldc.upenn.edu
+
+**Supported configs**.
+
+* **Multilingual**:
+  `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/multilingual/babel/config.yaml>`__ |
+  :doc:`documentation <config-docs/multilingual/babel/config>`
+
+.. toctree::
+   :hidden:
+
+   config-docs/multilingual/babel/config
+
 Kazakh Speech Dataset (SLR140)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/pytest.ini b/pytest.ini
index 2bed0f3a..ae4de828 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,2 +1,5 @@
 [pytest]
-addopts = --doctest-modules
\ No newline at end of file
+addopts = --doctest-modules
+markers =
+    dependency: mark a test as dependent on another named test.
+    slow: marks tests as slow (deselect with '-m "not slow"').
\ No newline at end of file
diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt
index e603c631..4f8696e1 100644
--- a/requirements/huggingface.txt
+++ b/requirements/huggingface.txt
@@ -1,3 +1,3 @@
-accelerate
-transformers>=0.2.1
+accelerate==0.34.2
+transformers==4.39
 huggingface_hub>=0.20.3,<0.24.0 # https://github.com/NVIDIA/NeMo/issues/9793
diff --git a/requirements/main.txt b/requirements/main.txt
index c39b2844..7cb1ee43 100644
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -4,7 +4,7 @@ ffmpeg
 hydra-core
 joblib
 librosa>=0.10.0 # specify >=0.10.0 so that librosa.get_duration(path=...) will work
-numpy
+numpy==1.26.4
 omegaconf
 pandas
 rarfile
@@ -13,7 +13,7 @@ sox
 tqdm
 webvtt-py
 wget
-
+pydub
 # for some processers, additionally https://github.com/NVIDIA/NeMo is required
 # for some processers, additionally nemo_text_processing is required
 # for mcv: apt-get update && apt-get upgrade -y && apt-get install -y sox libsox-fmt-all
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
index fdafb521..7f0c0d71 100644
--- a/sdp/processors/__init__.py
+++ b/sdp/processors/__init__.py
@@ -50,7 +50,9 @@
 )
 from sdp.processors.datasets.voxpopuli.create_initial_manifest import (
     CreateInitialManifestVoxpopuli,
+    CreateInitialManifestVoxpopuliUnlabelled,
 )
+from sdp.processors.datasets.babel.create_initial_manifest import CreateInitialManifestBabel
 from sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import (
     NormalizeFromNonPCTextVoxpopuli,
 )
@@ -80,6 +82,10 @@
     SubIfASRSubstitution,
     SubMakeLowercase,
     SubRegex,
+    ExtractFilesFromTar,
+    RandomSegment,
+    UntarAudios,
+    RemoveEmojis
 )
 from sdp.processors.modify_manifest.data_to_dropbool import (
     DropASRError,
@@ -97,6 +103,7 @@
     DropOnAttribute,
     PreserveByValue,
     DropRepeatedFields,
+    DropCorrupted,
 )
 from sdp.processors.modify_manifest.make_letters_uppercase_after_period import (
     MakeLettersUppercaseAfterPeriod,
diff --git a/sdp/processors/datasets/babel/create_initial_manifest.py b/sdp/processors/datasets/babel/create_initial_manifest.py
new file mode 100644
index 00000000..4bf0eaa6
--- /dev/null
+++ b/sdp/processors/datasets/babel/create_initial_manifest.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import pathlib
+from pathlib import Path
+
+from pydub import AudioSegment
+
+from sdp.logging import logger
+from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
+
+
+class CreateInitialManifestBabel(BaseParallelProcessor):
+    """Processor to create initial manifest for the Babel dataset.
+
+    Dataset is available for 25 underserved languages on https://catalog.ldc.upenn.edu
+
+    Segments the raw audio based on transcription files
+    (each segment contains an utterance from the transcription file for which start and end timestamps are provided)
+    and creates manifest for the resampled data.
+
+    .. note::
+        The dataset should be downloaded manually from LDC.
+
+    Args:
+        raw_data_dir (str): the directory where the downloaded data is saved.
+        data_type (str): "conversational" or "scripted".
+        data_split (str): "training", "untranscribed-training", "sub-train", "dev" or "eval".
+        resampled_audio_dir (str): the directory where the resampled audio
+            files will be stored.
+        audio_format (str): format in which new audio files will be stored. Defaults to "flac".
+        target_samplerate (int): sample rate (Hz) to use for resampling.
+            Defaults to 16000.
+        target_nchannels (int): number of channels to create during resampling process.
+            Defaults to 1.
+
+    Returns:
+        This processor generates an initial manifest file with the following fields::
+
+            {
+                "outputFn": <initial audio filename>,
+                "sessID": <session ID of the recording>,
+                "date": <date of the recording>,
+                "time": <time of the recording>,
+                "spkrCode": <speaker ID>,
+                "lineType": <type of the line (inline or outline)>,
+                "dialect": <dialect of the speaker>,
+                "gen": <gender of the speaker>,
+                "envType": <environment (i.e. home, office etc.)>,
+                "age": <age of the speaker>,
+                "network": <name of the telecommunications network>,
+                "phoneModel": <model of the phone>,
+                "sampleCount": <count of the sample>,
+                "sampleRate": <original sample rate of the recording>,
+                "audio_filepath": <path to the processed audio file>,
+                "text": <transcription of the utterance (only for transcribed recordings)>,
+                "duration": <duration of the audio in seconds>,
+            }
+    """
+
+    def __init__(
+        self,
+        raw_data_dir: str,
+        data_type: str,
+        data_split: str,
+        resampled_audio_dir: str,
+        audio_format: str = 'flac',
+        target_samplerate: int = 16000,
+        target_nchannels: int = 1,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.raw_data_dir = Path(raw_data_dir)
+        self.data_type = data_type
+        self.resampled_audio_dir = resampled_audio_dir
+        self.data_split = data_split
+        self.audio_format = audio_format
+        self.target_samplerate = target_samplerate
+        self.target_nchannels = target_nchannels
+
+        self.audios_dir = Path(self.raw_data_dir, self.data_type, self.data_split, 'audio')
+        self.transcriptions_dir = Path(self.raw_data_dir, self.data_type, self.data_split, 'transcription')
+        self.demographics_file = Path(self.raw_data_dir, self.data_type, 'reference_materials', 'demographics.tsv')
+        self.un_demographics_file = Path(
+            self.raw_data_dir, self.data_type, 'reference_materials', 'demographics.untranscribed-training.tsv'
+        )
+
+        os.makedirs(self.resampled_audio_dir, exist_ok=True)
+
+    def prepare(self):
+        """Load the per-recording demographics, keyed by ``outputFn``, from the reference TSV files."""
+        self.demographics = {}
+        self._read_demographics(self.demographics_file)
+        # The untranscribed-training table is only read when the file is present.
+        if self.un_demographics_file.exists():
+            self._read_demographics(self.un_demographics_file)
+
+    def _read_demographics(self, tsv_path):
+        """Add each row of a tab-separated file (first line is the header) to ``self.demographics``."""
+        with open(tsv_path, "rt", encoding="utf8") as fin:
+            header = fin.readline()
+            titles = [t.strip() for t in header.split('\t')]
+            for line in fin:
+                data_entry = dict(zip(titles, line.strip('\n').split('\t')))
+                self.demographics[data_entry["outputFn"]] = data_entry
+
+    def read_manifest(self):
+        """Return the ``.sph`` recordings of the selected split; each one becomes a dataset entry."""
+        return self.audios_dir.glob("*.sph")
+
+    def process_dataset_entry(self, data_entry: pathlib.PosixPath):
+        """Resample one ``.sph`` recording and emit one manifest entry per utterance.
+
+        Recordings without a transcription file are exported whole (no ``text`` field);
+        transcribed ones are cut at the bracketed timestamps, skipping ``<no-speech>``.
+        Entries produced before a failure are still returned (possibly an empty list).
+        """
+        transcription_path = Path(self.transcriptions_dir, data_entry.stem).with_suffix('.txt')
+        tgt_audio_path = Path(self.resampled_audio_dir, data_entry.stem).with_suffix(f".{self.audio_format}")
+        # Bound before ``try`` so the final ``return`` works even when loading the audio fails.
+        data_entries = []
+
+        try:
+            audio = AudioSegment.from_file(data_entry)
+            if audio.frame_rate != self.target_samplerate:
+                audio = audio.set_frame_rate(self.target_samplerate)
+            # Applied to untranscribed recordings too, so every output honours target_nchannels.
+            if audio.channels != self.target_nchannels:
+                audio = audio.set_channels(self.target_nchannels)
+
+            if not transcription_path.exists():
+                audio.export(tgt_audio_path.as_posix(), format=self.audio_format)
+                modified_entry = self.demographics[data_entry.name].copy()
+                modified_entry['audio_filepath'] = tgt_audio_path.as_posix()
+                modified_entry['duration'] = round(audio.duration_seconds, 2)
+                return [DataEntry(data=modified_entry)]
+
+            # Transcriptions alternate "[timestamp]" and text lines; same encoding as demographics.
+            with open(transcription_path, 'rt', encoding='utf8') as f:
+                data = f.readlines()
+            timestamps = data[::2]
+            texts = data[1::2]
+
+            for idx in range(len(timestamps) - 1):
+                text = texts[idx].strip('\n')
+                if text == "<no-speech>":
+                    continue
+                # Only the file name gets the index; directory names are left untouched.
+                new_audio_filepath = tgt_audio_path.with_name(
+                    f"{tgt_audio_path.stem}_{idx}{tgt_audio_path.suffix}"
+                ).as_posix()
+
+                start = float(timestamps[idx].strip('[]\n'))
+                end = float(timestamps[idx + 1].strip('[]\n'))
+                audio_segment = audio[start * 1000 : end * 1000]
+                audio_segment.export(new_audio_filepath, format=self.audio_format)
+                modified_entry = self.demographics[data_entry.name].copy()
+                modified_entry['audio_filepath'] = new_audio_filepath
+                modified_entry['text'] = text
+                modified_entry['duration'] = round(end - start, 2)
+                data_entries.append(DataEntry(data=modified_entry))
+
+        except Exception as e:
+            # Best-effort: log the failure (path rendered as text) and keep what was produced.
+            logger.warning(f"{e} file: {transcription_path.as_posix()}")
+
+        return data_entries
\ No newline at end of file
diff --git a/sdp/processors/datasets/coraal/create_initial_manifest.py b/sdp/processors/datasets/coraal/create_initial_manifest.py
index 1f67f730..16aa166a 100644
--- a/sdp/processors/datasets/coraal/create_initial_manifest.py
+++ b/sdp/processors/datasets/coraal/create_initial_manifest.py
@@ -47,7 +47,7 @@ def get_coraal_url_list():
 class CreateInitialManifestCORAAL(BaseParallelProcessor):
     """Processor to create initial manifest for the Corpus of Regional African American Language (CORAAL) dataset.
 
-    Dataset link: https://oraal.uoregon.edu/coraal/
+    Dataset link: https://oraal.github.io/coraal
 
     Will download all files, extract tars and split wav files based on the
     provided durations in the transcripts.
diff --git a/sdp/processors/datasets/voxpopuli/create_initial_manifest.py b/sdp/processors/datasets/voxpopuli/create_initial_manifest.py
index b7c47ba5..2f7985fe 100644
--- a/sdp/processors/datasets/voxpopuli/create_initial_manifest.py
+++ b/sdp/processors/datasets/voxpopuli/create_initial_manifest.py
@@ -14,10 +14,11 @@
 
 import os
 import subprocess
-from pathlib import Path
+from pathlib import Path, PosixPath
 
 import sox
 from sox import Transformer
+from pydub import AudioSegment
 
 from sdp.logging import logger
 from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
@@ -153,3 +154,130 @@ def process_dataset_entry(self, data_entry: str):
             "accent": accent,
         }
         return [DataEntry(data=data)]
+
+
+
+class CreateInitialManifestVoxpopuliUnlabelled(BaseParallelProcessor):
+    """Processor to create initial manifest for the VoxPopuli dataset unlabelled subset.
+
+    Dataset link: https://github.com/facebookresearch/voxpopuli/
+
+    If not already downloaded and segmented, downloads and segments raw VoxPopuli data for the specified language,
+    and creates an initial manifest with the reformatted audio file paths and their durations.
+
+    .. note::
+        This processor will install a couple of Python packages, including
+        PyTorch, so it might be a good idea to run it in an isolated Python
+        environment. As unlabelled data is huge in volumes, the downloading,
+        segmenting and processing might take a long time.
+
+    Args:
+        raw_data_dir (str): the directory where the downloaded data will be/is saved.
+        language_id (str): the language of the data you wish to be downloaded and/or processed.
+            E.g., "en", "es", "it", "it_v2" etc.
+        resampled_data_dir (str): the directory under which the resampled audio
+            files will be stored (in ``<language>/audios``).
+        audio_format (str): format in which new audio files will be stored.
+        target_samplerate (int): sample rate (Hz) to use for resampling.
+            Defaults to 16000.
+        target_nchannels (int): number of channels to create during resampling process.
+            Defaults to 1.
+        delete_raw_file (bool): whether initial .ogg files should be deleted or not.
+
+    Returns:
+        This processor generates an initial manifest file with the following fields::
+
+            {
+                "audio_filepath": <path to the audio file>,
+                "duration": <duration of the audio in seconds>,
+            }
+    """
+
+    def __init__(
+        self,
+        raw_data_dir: str,
+        language_id: str,
+        resampled_data_dir: str,
+        audio_format: str = 'flac',
+        target_samplerate: int = 16000,
+        target_nchannels: int = 1,
+        delete_raw_file: bool = False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.raw_data_dir = Path(raw_data_dir)
+        self.language_id = language_id
+        self.audio_format = audio_format
+        self.target_samplerate = target_samplerate
+        self.target_nchannels = target_nchannels
+        self.delete_raw_file = delete_raw_file
+
+        # Data of a "<lang>_v2" subset is looked up under the plain "<lang>" name, so outputs drop the suffix too.
+        self.resampled_audio_dir = Path(resampled_data_dir, self.language_id.replace('_v2', ''), 'audios')
+        self.output_manifest_file = self.output_manifest_file.replace('_v2', '')
+
+    def prepare(self):
+        """Download and segment the data (unless already done) and create the output directory.
+
+        Missing steps are detected from the directories already present under ``raw_data_dir``.
+        """
+        os.makedirs(self.raw_data_dir, exist_ok=True)
+        voxpopuli_dir = self.raw_data_dir / 'voxpopuli'
+        language = self.language_id.replace('_v2', '')
+
+        if not (self.raw_data_dir / "unlabelled_data" / language).exists():
+            # TODO: some kind of isolated environment?
+            # Commands are passed as argument lists (no shell) so paths with spaces keep working.
+            if not voxpopuli_dir.exists():
+                logger.info("Downloading voxpopuli and installing requirements")
+                subprocess.run(["git", "clone", VOXPOPULI_URL, str(voxpopuli_dir)], check=True)
+            subprocess.run(["pip", "install", "-r", str(voxpopuli_dir / 'requirements.txt')], check=True)
+            subprocess.run(["pip", "install", "torch==1.13", "torchaudio==0.13"], check=True)
+            subprocess.run(
+                ["pip", "uninstall", "torch-tensorrt", "torchdata", "torchvision", "-y"], check=True
+            )
+            dataset_args = ["--root", str(self.raw_data_dir), "--subset", self.language_id]
+            if not (self.raw_data_dir / 'raw_audios' / language).exists():
+                logger.info("Downloading raw audios")
+                subprocess.run(
+                    ["python", "-m", "voxpopuli.download_audios", *dataset_args], cwd=voxpopuli_dir, check=True
+                )
+
+            logger.info("Segmenting the data")
+            subprocess.run(
+                ["python", "-m", "voxpopuli.get_unlabelled_data", *dataset_args], cwd=voxpopuli_dir, check=True
+            )
+
+        self.resampled_audio_dir.mkdir(exist_ok=True, parents=True)
+
+    def read_manifest(self):
+        """Return an iterator over every segmented ``.ogg`` file of the selected language."""
+        return Path(self.raw_data_dir, 'unlabelled_data', self.language_id.replace('_v2', '')).rglob('*.ogg')
+
+    def process_dataset_entry(self, data_entry: PosixPath):
+        """Convert one ``.ogg`` segment to the target format and describe it in a manifest entry."""
+        tgt_audio_filepath = Path(self.resampled_audio_dir, data_entry.stem + f".{self.audio_format}")
+
+        try:
+            audio = AudioSegment.from_ogg(data_entry)
+            if audio.frame_rate != self.target_samplerate:
+                audio = audio.set_frame_rate(self.target_samplerate)
+            if audio.channels != self.target_nchannels:
+                audio = audio.set_channels(self.target_nchannels)
+
+            audio.export(tgt_audio_filepath, format=self.audio_format)
+            data = {
+                "audio_filepath": tgt_audio_filepath.as_posix(),
+                "duration": audio.duration_seconds,
+            }
+
+            # The source file is removed only after the converted copy has been written.
+            if self.delete_raw_file:
+                os.remove(data_entry)
+
+        except Exception as e:
+            logger.warning(str(e) + " file: " + data_entry.as_posix())
+            # NOTE(review): presumably a ``data=None`` entry is dropped by the base processor - confirm.
+            data = None
+
+        return [DataEntry(data=data)]
\ No newline at end of file
diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py
index 77096b3e..48674007 100644
--- a/sdp/processors/modify_manifest/create_manifest.py
+++ b/sdp/processors/modify_manifest/create_manifest.py
@@ -42,7 +42,7 @@ def __init__(
         self.extension = extension
 
     def read_manifest(self):
-        output_file = [str(self.raw_data_dir / file) for file in self.raw_data_dir.rglob('*.' + self.extension)]
+        output_file = [file.as_posix() for file in self.raw_data_dir.rglob('*.' + self.extension)]
         return output_file
 
     def process_dataset_entry(self, data_entry):
diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py
index 9328b081..f616f230 100644
--- a/sdp/processors/modify_manifest/data_to_data.py
+++ b/sdp/processors/modify_manifest/data_to_data.py
@@ -15,10 +15,18 @@
 import collections
 import os
 import re
+import json
+import random
+import itertools
+import tarfile
+from tqdm import tqdm
 from typing import Dict, List
+from pathlib import Path, PosixPath
 
 import soundfile
 from sox import Transformer
+from pydub import AudioSegment
+from tqdm.contrib.concurrent import process_map
 
 from sdp.logging import logger
 from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
@@ -690,3 +698,291 @@ def process_dataset_entry(self, data_entry):
             data_entry[self.input_text_key], verbose=self.verbose
         )
         return [DataEntry(data=data_entry)]
+
+
+class RandomSegment(BaseParallelProcessor):
+    """
+    Processor that cuts each input audio into consecutive segments of random duration.
+
+    The audio is first converted to ``target_samplerate`` and ``target_nchannels``. Segments are then
+    cut from the start of the remaining audio; every segment duration is drawn with
+    ``random.uniform(min_duration, min(max_duration, remaining_duration) - min_duration)``.
+    Audios shorter than ``2 * min_duration`` are exported as a single file without being cut.
+    New audios are saved as ``<resampled_audio_dir>/<audio_file_stem>_<segment_num>.<audio_format>``.
+
+    Args:
+        min_duration (float): minimum duration (in seconds) for the newly segmented audio.
+        max_duration (float): maximum duration (in seconds) for the newly segmented audio.
+        resampled_audio_dir (str): directory where the segmented audio files will be stored.
+        audio_format (str) (Optional): format of the output audio files. Defaults to None,
+            in which case the extension of the input audio file is used.
+        audio_filepath_key (str) (Optional): key to get audio filepath from data entry. Defaults to ``audio_filepath``.
+        save_other_part (bool) (Optional): whether to save the residual part of the audio after segmentation. Defaults to True.
+        random_seed (int) (Optional): seed passed to ``random.seed``. Defaults to 1000.
+        target_samplerate (int) (Optional): the target sample rate for the resampled audio. Defaults to 16000.
+        target_nchannels (int) (Optional): the target number of channels for the resampled audio. Defaults to 1.
+        **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`.
+
+    Returns:
+        One data entry per exported file. Each entry is a copy of the input entry with
+        ``<audio_filepath_key>`` pointing to the new file; entries produced by cutting also get
+        ``duration`` set to the segment duration rounded to two decimals.
+    """
+
+    def __init__(
+        self,
+        min_duration: float,
+        max_duration: float,
+        resampled_audio_dir: str,
+        audio_format: str = None,
+        audio_filepath_key: str = 'audio_filepath',
+        save_other_part: bool = True,
+        random_seed: int = 1000,
+        target_samplerate: int = 16000,
+        target_nchannels: int = 1,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.min_duration = min_duration
+        self.max_duration = max_duration
+        self.resampled_audio_dir = resampled_audio_dir
+        self.audio_format = audio_format
+        self.audio_filepath_key = audio_filepath_key
+        self.save_other_part = save_other_part
+        self.target_samplerate = target_samplerate
+        self.target_nchannels = target_nchannels
+        # Seeds the module-level ``random`` generator used by ``process_dataset_entry``.
+        random.seed(random_seed)
+
+    def _segment_path(self, source_path, segment_num: int, audio_format: str) -> str:
+        """Return ``<resampled_audio_dir>/<stem of source_path>_<segment_num>.<audio_format>``."""
+        stem_path = Path(self.resampled_audio_dir) / Path(source_path).stem
+        return stem_path.as_posix() + f'_{segment_num}.{audio_format}'
+
+    def process_dataset_entry(self, data_entry):
+        """Resample one audio, cut it into random-length segments and export them.
+
+        Args:
+            data_entry (dict): manifest entry holding the audio path under ``audio_filepath_key``.
+
+        Returns:
+            list of ``DataEntry``: one per exported audio file.
+        """
+        source_path = data_entry[self.audio_filepath_key]
+
+        audio = AudioSegment.from_file(source_path)
+        duration = audio.duration_seconds
+
+        if audio.frame_rate != self.target_samplerate:
+            audio = audio.set_frame_rate(self.target_samplerate)
+
+        if audio.channels != self.target_nchannels:
+            audio = audio.set_channels(self.target_nchannels)
+
+        # Fall back to the input file extension. The path is wrapped in ``Path`` because manifest
+        # values are plain strings, and the leading dot is stripped because the filename template
+        # and the export call both expect a bare format name (e.g. ``wav``).
+        if self.audio_format:
+            audio_format = self.audio_format
+        else:
+            audio_format = Path(source_path).suffix.lstrip('.')
+
+        Path(self.resampled_audio_dir).mkdir(parents=True, exist_ok=True)
+
+        segment_num = 0
+
+        # Too short to yield two parts of at least ``min_duration`` each: export it uncut.
+        if duration - self.min_duration < self.min_duration:
+            new_filename = self._segment_path(source_path, segment_num, audio_format)
+            audio.export(new_filename, format=audio_format)
+
+            new_data_entry = data_entry.copy()
+            new_data_entry[self.audio_filepath_key] = new_filename
+
+            return [DataEntry(data=new_data_entry)]
+
+        data_entries = []
+
+        while True:
+            # NOTE(review): the upper bound is ``min(max_duration, duration) - min_duration``, so a
+            # drawn duration never reaches ``max_duration`` itself - confirm this is intended.
+            rand_dur = random.uniform(self.min_duration, min(self.max_duration, duration) - self.min_duration)
+            cut_ms = int(rand_dur * 1000)  # pydub slices are expressed in milliseconds
+
+            new_filename = self._segment_path(source_path, segment_num, audio_format)
+            audio[:cut_ms].export(new_filename, format=audio_format)
+
+            new_data_entry = data_entry.copy()
+            new_data_entry[self.audio_filepath_key] = new_filename
+            new_data_entry['duration'] = round(rand_dur, 2)
+
+            data_entries.append(DataEntry(data=new_data_entry))
+            segment_num += 1
+
+            # The rest is still longer than ``max_duration``: keep cutting from it.
+            if (duration - rand_dur) > self.max_duration:
+                audio = audio[cut_ms:]
+                duration = duration - rand_dur
+                continue
+
+            # The rest fits into ``max_duration``: optionally keep it as the last segment.
+            if self.save_other_part:
+                new_filename = self._segment_path(source_path, segment_num, audio_format)
+                audio[cut_ms:].export(new_filename, format=audio_format)
+
+                new_data_entry = data_entry.copy()
+                new_data_entry[self.audio_filepath_key] = new_filename
+                new_data_entry['duration'] = round(duration - rand_dur, 2)
+                data_entries.append(DataEntry(data=new_data_entry))
+
+            break
+
+        return data_entries
+
+
+class UntarAudios(BaseParallelProcessor):
+    """Processor that extracts every ``*.tar`` archive found in ``tar_dir`` into ``resampled_audio_dir``.
+
+    Args:
+        tar_dir (str): directory that contains tarred files.
+        resampled_audio_dir (str): directory where extracted files will be located.
+        remove_tars (bool) (Optional): whether tarred file should be removed after files extraction.
+            Defaults to False.
+        **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`.
+
+    """
+
+    def __init__(
+        self,
+        tar_dir: str,
+        resampled_audio_dir: str,
+        remove_tars: bool = False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.tar_dir = Path(tar_dir)
+        self.resampled_audio_dir = resampled_audio_dir
+        self.remove_tars = remove_tars
+
+    def read_manifest(self):
+        """Yield the path of every ``*.tar`` file located directly inside ``tar_dir``."""
+        yield from self.tar_dir.glob('*.tar')
+
+    def process(self):
+        """Extract all archives, running ``process_dataset_entry`` in parallel per manifest chunk.
+
+        ``process_dataset_entry`` returns nothing, so the results of ``process_map`` are discarded;
+        this override itself does not collect or write any entries.
+        """
+        for manifest_chunk in self._chunk_manifest():
+            process_map(
+                self.process_dataset_entry,
+                manifest_chunk,
+                max_workers=self.max_workers,
+                chunksize=self.chunksize,
+            )
+
+    def process_dataset_entry(self, data_entry: Path):
+        """Extract one tar archive and optionally delete it afterwards.
+
+        Args:
+            data_entry (Path): path to the ``.tar`` file to extract.
+        """
+        with tarfile.open(data_entry, 'r') as tar:
+            # NOTE(review): ``extractall`` trusts the member paths stored in the archive. If the
+            # archives can come from an untrusted source, consider an extraction filter
+            # (``filter="data"``) on Python versions that support it.
+            tar.extractall(self.resampled_audio_dir)
+
+        if self.remove_tars:
+            os.remove(data_entry)
+
+
+class ExtractFilesFromTar(BaseParallelProcessor):
+    """Processor that extracts the files listed in ``input_manifest_file`` from tar archives to ``extract_to_dir``.
+
+    Every manifest entry must contain a ``shard_id`` and an ``audio_filepath`` field. Entries are
+    grouped by ``shard_id`` and extracted from ``<tar_dir>/audio_<shard_id>.tar``.
+
+    Args:
+        tar_dir (str): directory that contains tarred files.
+        extract_to_dir (str): directory where extracted files will be located.
+        **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`.
+
+    Returns:
+        The same entries as in the input manifest with ``audio_filepath`` replaced by the
+        path of the file inside ``extract_to_dir``.
+    """
+
+    def __init__(
+        self,
+        tar_dir: str,
+        extract_to_dir: str,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.tar_dir = Path(tar_dir)
+        self.extract_to_dir = extract_to_dir
+
+    def read_manifest(self):
+        """Read the input manifest and yield ``(shard_id, entries)`` tuples, one per shard.
+
+        Raises:
+            ValueError: if ``input_manifest_file`` is not set.
+        """
+        if self.input_manifest_file is None:
+            raise ValueError("Manifest with files should be provided!")
+
+        logger.info('Reading Manifest...')
+
+        # Group entries by shard so that each tar archive is opened only once.
+        tar_entries = collections.defaultdict(list)
+
+        with open(self.input_manifest_file, "rt", encoding="utf8") as fin:
+            for line in tqdm(fin):
+                entry = json.loads(line)
+                tar_entries[entry['shard_id']].append(entry)
+
+        for shard_id, entries in tar_entries.items():
+            yield (shard_id, entries)
+
+    def process_dataset_entry(self, data_entry):
+        """Extract all files of one shard that are not yet present in ``extract_to_dir``.
+
+        Args:
+            data_entry (tuple): ``(shard_id, entries)`` as produced by ``read_manifest``.
+
+        Returns:
+            list of ``DataEntry``: the shard entries with updated ``audio_filepath``.
+        """
+        shard_id, entries = data_entry
+
+        # The shard id has to go through a ``%s`` placeholder: logging %-formats the message
+        # with the extra arguments, and a message without a placeholder fails to format.
+        logger.info('Working on shard_id %s', shard_id)
+
+        tar_file = Path(self.tar_dir, f"audio_{shard_id}").with_suffix('.tar')
+
+        extracted_entries = []
+
+        with tarfile.open(tar_file, 'r') as tar:
+            for entry in tqdm(entries):
+                extracted_path = Path(self.extract_to_dir, entry['audio_filepath']).as_posix()
+
+                # Skip files that were already extracted.
+                if not os.path.exists(extracted_path):
+                    tar.extract(member=entry['audio_filepath'], path=self.extract_to_dir)
+
+                entry['audio_filepath'] = extracted_path
+                extracted_entries.append(DataEntry(data=entry))
+
+        return extracted_entries
+
+
+class RemoveEmojis(BaseParallelProcessor):
+    """Replaces emojis with empty string.
+
+    .. note:: Emoji patterns are predefined. There might be (new) emojis which are not included in the list.
+
+    Args:
+        text_key (str): a string indicating which key of the data entries
+            should be used to find the utterance transcript. Defaults to "text".
+
+    Returns:
+         The same data as in the input manifest with ``<text_key>`` field without detected emojis.
+    """
+
+    # One optional leading space followed by a run of emoji / symbol code points.
+    # The former ``\U000024C2-\U0001F251`` range is intentionally NOT used: it covers almost the
+    # whole Basic Multilingual Plane above U+24C2 (CJK ideographs, Kana, Hangul, ...) and
+    # therefore deleted regular text. Its two endpoints are kept as single characters, and the
+    # BMP emojis that only that range matched (U+303D, U+3297, U+3299) are listed explicitly.
+    EMOJI_PATTERN = re.compile(
+        r" ?[\U0001F600-\U0001F64F"  # emoticons
+        r"\U0001F300-\U0001F5FF"  # symbols & pictographs
+        r"\U0001F680-\U0001F6FF"  # transport & map symbols
+        r"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
+        r"\U00002500-\U00002BEF"  # box drawing, geometric shapes, misc symbols, dingbats, arrows
+        r"\U00002702-\U000027B0"  # dingbats
+        r"\U000024C2\U0001F251"  # circled M, circled ideograph accept
+        r"\U0001f926-\U0001f937"
+        r"\U00010000-\U0010ffff"  # every supplementary-plane code point
+        r"\u2640-\u2642"
+        r"\u2600-\u2B55"
+        r"\u200d"  # zero width joiner
+        r"\u23cf"
+        r"\u23e9"
+        r"\u231a"
+        r"\ufe0f"  # variation selector-16
+        r"\u3030"
+        r"\u303d\u3297\u3299"  # part alternation mark, circled ideographs
+        r"]+",
+        flags=re.UNICODE,
+    )
+
+    def __init__(
+        self,
+        text_key: str = "text",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.text_key = text_key
+
+    def process_dataset_entry(self, data_entry) -> List:
+        """Removes every ``EMOJI_PATTERN`` match from ``data_entry[text_key]``.
+
+        The text is passed through ``add_start_end_spaces`` before the substitution and through
+        ``remove_extra_spaces`` afterwards. The reported metric is 1 if the substitution changed
+        the text and 0 otherwise.
+        """
+        text_in = add_start_end_spaces(data_entry[self.text_key])
+        text_out = self.EMOJI_PATTERN.sub('', text_in)
+
+        replace_word_counter = int(text_in != text_out)
+
+        data_entry[self.text_key] = remove_extra_spaces(text_out)
+
+        return [DataEntry(data=data_entry, metrics=replace_word_counter)]
+
+    def finalize(self, metrics):
+        """Delegates to the base class; the per-entry metrics are not aggregated here."""
+        super().finalize(metrics)
+
+
diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py
index 3c91ba20..19e51667 100644
--- a/sdp/processors/modify_manifest/data_to_dropbool.py
+++ b/sdp/processors/modify_manifest/data_to_dropbool.py
@@ -16,6 +16,7 @@
 import re
 import os 
 import json
+import soundfile
 from operator import eq, ge, gt, le, lt, ne
 from typing import List, Union
 
@@ -863,3 +864,36 @@ def finalize(self, metrics: List):
             total_counter += counter
         logger.info("Dropped %d utterances", total_counter)
         super().finalize(metrics)
+
+
+class DropCorrupted(BaseParallelProcessor):
+    """Drops audios if they are corrupted or empty.
+
+    An entry is dropped when its audio file cannot be read with ``soundfile.read``, or when
+    the decoded audio has no samples or consists of zeros only.
+
+    Args:
+        audio_filepath_key (str) (Optional): which key to use for audio filepaths. Defaults to ``audio_filepath``
+
+    Returns:
+        The same data as in the input manifest without the dropped entries.
+    """
+
+    def __init__(
+        self,
+        audio_filepath_key: str = 'audio_filepath',
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.audio_filepath_key = audio_filepath_key
+
+    def process_dataset_entry(self, data_entry) -> List:
+        """Returns the entry unchanged, or a dropped entry (``data=None``, metric 1) for bad audio."""
+        try:
+            samples, _ = soundfile.read(data_entry[self.audio_filepath_key])
+        except Exception:
+            # Any read failure marks the file as corrupted. ``Exception`` (not a bare
+            # ``except``) keeps KeyboardInterrupt / SystemExit propagating.
+            return [DataEntry(data=None, metrics=1)]
+
+        # ``any()`` is False for an empty array and for all-zero audio, and it reduces over
+        # every axis, so multi-channel audio is handled as well.
+        if not samples.any():
+            return [DataEntry(data=None, metrics=1)]
+
+        return [DataEntry(data=data_entry, metrics=0)]
+
+    def finalize(self, metrics):
+        """Logs the total number of dropped utterances, then calls the base ``finalize``."""
+        total_counter = sum(metrics)
+        logger.info("Dropped %d utterances", total_counter)
+        super().finalize(metrics)
diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py
index 7db88f34..7f4e7ff9 100644
--- a/tests/test_cfg_end_to_end_tests.py
+++ b/tests/test_cfg_end_to_end_tests.py
@@ -15,6 +15,7 @@
 import os
 import shutil
 import tarfile
+import logging
 from functools import partial
 from pathlib import Path
 from typing import Callable, List, Tuple
@@ -44,15 +45,21 @@ def data_check_fn_generic(raw_data_dir: str, file_name: str, **kwargs) -> None:
 data_check_fn_ksc2 = partial(data_check_fn_generic, file_name="ksc2_kk.tar.gz")
 data_check_fn_librispeech = partial(data_check_fn_generic, file_name="dev-clean.tar.gz")
 data_check_fn_fleurs = partial(data_check_fn_generic, file_name="dev.tar.gz")
+data_check_fn_babel = partial(data_check_fn_generic, file_name="scripted")
 
-def data_check_fn_voxpopuli(raw_data_dir: str) -> None:
+def data_check_fn_voxpopuli(raw_data_dir: str, asr_data: bool = True) -> None:
     """Raises error if do not find expected data.
 
     Will also extract the archive as initial processor expects extracted data.
     """
-    if (Path(raw_data_dir) / "transcribed_data").exists():
+    if asr_data:
+        file_name = "transcribed_data"
+    else:
+        file_name = "unlabelled_data"
+
+    if (Path(raw_data_dir) / file_name).exists():
         return
-    expected_file = Path(raw_data_dir) / "transcribed_data.tar.gz"
+    expected_file = Path(raw_data_dir, file_name).with_suffix(".tar.gz")
     if not expected_file.exists():
         raise ValueError(f"No such file {str(expected_file)}")
     with tarfile.open(expected_file, 'r:gz') as tar:
@@ -87,7 +94,7 @@ def get_test_cases() -> List[Tuple[str, Callable]]:
     return [
         (f"{DATASET_CONFIGS_ROOT}/spanish/mls/config.yaml", partial(data_check_fn_mls, language="spanish")),
         (f"{DATASET_CONFIGS_ROOT}/spanish_pc/mcv12/config.yaml", partial(data_check_fn_mcv, archive_file_stem="cv-corpus-12.0-2022-12-07-es")),
-        (f"{DATASET_CONFIGS_ROOT}/italian/voxpopuli/config.yaml", data_check_fn_voxpopuli),
+        (f"{DATASET_CONFIGS_ROOT}/italian/voxpopuli/config.yaml", partial(data_check_fn_voxpopuli, asr_data=True)),
         (f"{DATASET_CONFIGS_ROOT}/italian/mls/config.yaml", partial(data_check_fn_mls, language="italian")),
         (f"{DATASET_CONFIGS_ROOT}/portuguese/mls/config.yaml", partial(data_check_fn_mls, language="portuguese")),
         (f"{DATASET_CONFIGS_ROOT}/portuguese/mcv/config.yaml", partial(data_check_fn_mcv, archive_file_stem="cv-corpus-15.0-2023-09-08-pt")),
@@ -103,8 +110,18 @@ def get_test_cases() -> List[Tuple[str, Callable]]:
         (f"{DATASET_CONFIGS_ROOT}/kazakh/slr140/config.yaml", data_check_fn_slr140),
         (f"{DATASET_CONFIGS_ROOT}/kazakh/slr102/config.yaml", data_check_fn_slr102),
         (f"{DATASET_CONFIGS_ROOT}/kazakh/ksc2/config.yaml", data_check_fn_ksc2),
+        (f"{DATASET_CONFIGS_ROOT}/multilingual/babel/config.yaml", data_check_fn_babel),
+        (f"{DATASET_CONFIGS_ROOT}/multilingual/voxpopuli/config_un.yaml", partial(data_check_fn_voxpopuli, asr_data=False)),
     ]
 
+def get_test_names():
+    """Returns one id per test case: the config directory relative to ``DATASET_CONFIGS_ROOT``."""
+    config_names = []
+    for test_case in get_test_cases():
+        # The first element of every test case is the path to its config file.
+        config_dir = Path(test_case[0]).parent
+        config_names.append(config_dir.relative_to(DATASET_CONFIGS_ROOT).as_posix())
+
+    return config_names
+
+
 def check_e2e_test_data() -> bool:
     """
     Checks if required environment variables are defined for e2e data.
@@ -124,7 +141,6 @@ def get_e2e_test_data_path(rel_path_from_root: str) -> str:
         return test_data_root
 
     import boto3
-    import logging
 
     s3_resource = boto3.resource(
         "s3",
@@ -145,27 +161,43 @@ def get_e2e_test_data_path(rel_path_from_root: str) -> str:
 
     return os.path.abspath("test_data")
 
-@pytest.mark.skipif(
-    not check_e2e_test_data(),
-    reason="Either TEST_DATA_ROOT needs to be defined or both AWS_SECRET_KEY "
-    "and AWS_ACCESS_KEY to run e2e config tests",
-)
-@pytest.mark.parametrize("config_path,data_check_fn", get_test_cases())
-def test_configs(config_path: str, data_check_fn: Callable, tmp_path: Path):
-    # we expect DATASET_CONFIGS_ROOT and TEST_DATA_ROOT
-    # to have the same structure (e.g. <lang>/<dataset>)
+@pytest.fixture(scope="module", params=get_test_cases(), ids=get_test_names())
+def setup_data(request):
+
+    if not check_e2e_test_data():
+        pytest.fail("Either TEST_DATA_ROOT needs to be defined or both AWS_SECRET_KEY "
+    "and AWS_ACCESS_KEY to run e2e config tests")
+        
+    config_path, data_check_fn  = request.param
+
     rel_path_from_root = Path(config_path).parent.relative_to(DATASET_CONFIGS_ROOT)
-    test_data_root = Path(get_e2e_test_data_path(str(rel_path_from_root)))
- 
-    # run data_check_fn - it will raise error if the expected test data is not found
+    test_data_root = get_e2e_test_data_path(str(rel_path_from_root))
+    data_dir = Path(test_data_root, rel_path_from_root)
+
+    yield config_path, data_check_fn, data_dir
+    shutil.rmtree(data_dir)
+
+
+def test_data_availability(setup_data):
+
+    _, data_check_fn, data_dir = setup_data
     try:
-        data_check_fn(raw_data_dir=str(test_data_root / rel_path_from_root))
+        data_check_fn(raw_data_dir=data_dir)
     except ValueError as e:
-        pytest.skip(f"Test data not available: {str(e)}")
+        pytest.fail(f"Test data not available: {str(e)}")
+
+    reference_manifest = Path(data_dir, "test_data_reference.json")
 
-    reference_manifest = test_data_root / rel_path_from_root / "test_data_reference.json"
     if not reference_manifest.exists():
-        pytest.skip(f"Reference manifest not found: {reference_manifest}")
+        pytest.fail(f"Reference manifest not found: {reference_manifest}")
+
+@pytest.mark.dependency(depends=['test_data_availability'])
+def test_configs(setup_data, tmp_path):
+    # we expect DATASET_CONFIGS_ROOT and TEST_DATA_ROOT
+    # to have the same structure (e.g. <lang>/<dataset>)
+
+    config_path, _, data_dir = setup_data
+    reference_manifest = data_dir / "test_data_reference.json"
 
     cfg = OmegaConf.load(config_path)
     assert "processors" in cfg
@@ -173,7 +205,7 @@ def test_configs(config_path: str, data_check_fn: Callable, tmp_path: Path):
     cfg.workspace_dir = str(tmp_path)
     cfg.final_manifest = str(tmp_path / "final_manifest.json")
     cfg.data_split = cfg.get("data_split", "train")
-    cfg.processors[0].raw_data_dir = str(test_data_root / rel_path_from_root)
+    cfg.processors[0].raw_data_dir = data_dir.as_posix()
 
     if "already_downloaded" in cfg["processors"][0]:
         cfg["processors"][0]["already_downloaded"] = True
@@ -195,7 +227,6 @@ def test_configs(config_path: str, data_check_fn: Callable, tmp_path: Path):
             generated_data.pop("audio_filepath", None)
             assert reference_data == generated_data
 
- # if CLEAN_UP_TMP_PATH is set to non-0 value, we will delete tmp_path
     if os.getenv("CLEAN_UP_TMP_PATH", "0") != "0":
         shutil.rmtree(tmp_path)
 
diff --git a/tests/test_data_to_data.py b/tests/test_data_to_data.py
index 5bd75f47..79c0b9d9 100644
--- a/tests/test_data_to_data.py
+++ b/tests/test_data_to_data.py
@@ -19,6 +19,7 @@
     SubIfASRSubstitution,
     SubMakeLowercase,
     SubRegex,
+    RemoveEmojis
 )
 
 test_params_list = []
@@ -90,6 +91,17 @@
     ]
 )
 
+test_params_list.extend(
+    [
+        (
+            RemoveEmojis,
+            {"text_key": "text"},
+            {"text": "The weather is perfect ☀️, and the trails are calling! Let's enjoy the beauty of nature and make some unforgettable memories 🌲🌿."},
+            {"text": "The weather is perfect, and the trails are calling! Let's enjoy the beauty of nature and make some unforgettable memories."},
+        ),
+    ]
+)
+
 
 @pytest.mark.parametrize("test_class,class_kwargs,test_input,expected_output", test_params_list, ids=str)
 def test_data_to_data(test_class, class_kwargs, test_input, expected_output):