Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions dataset_configs/portuguese/unlabeled/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,21 +75,21 @@ processors:
output_manifest_file: ${manifest_dir}/vad
input_manifest_arg: "manifest_filepath"
output_manifest_arg: "output_dir"
cmd: 'python sdp/processors/nemo/speech_to_text_with_vad.py audio_type=wav vad_model=vad_multilingual_frame_marblenet vad_config=sdp/processors/nemo/frame_vad_infer_postprocess.yaml'
cmd: 'python sdp/processors/inference/asr/nemo/utils/speech_to_text_with_vad.py audio_type=wav vad_model=vad_multilingual_frame_marblenet vad_config=sdp/processors/inference/asr/nemo/utils/frame_vad_infer_postprocess.yaml'

- _target_: sdp.processors.RenameFields
input_manifest_file: ${manifest_dir}/vad/temp_manifest_vad_rttm-onset0.3-offset0.3-pad_onset0.2-pad_offset0.2-min_duration_on0.2-min_duration_off0.2-filter_speech_firstTrue.json
output_manifest_file: ${manifest_dir}/manifest7.json
rename_fields: {"audio_filepath":"source_filepath"}

- _target_: sdp.processors.nemo.rttm.GetRttmSegments
- _target_: sdp.processors.GetRttmSegments
output_manifest_file: ${manifest_dir}/manifest8.json
rttm_key: rttm_file
output_file_key: audio_segments
duration_key: duration
duration_threshold: 20.0

- _target_: sdp.processors.nemo.rttm.SplitAudioFile
- _target_: sdp.processors.SplitAudioFile
output_manifest_file: ${manifest_dir}/manifest9.json
splited_audio_dir: ${workspace_dir}/splited_wavs/
segments_key: audio_segments
Expand Down
27 changes: 24 additions & 3 deletions docs/src/sdp/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,6 @@ used in the downstream processing for additional enhancement or filtering.
.. autodata:: sdp.processors.ASRTransformers
:annotation:

.. autodata:: sdp.processors.EstimateBandwidth
:annotation:

.. autodata:: sdp.processors.tts.pyannote.PyAnnoteDiarizationAndOverlapDetection
:annotation:

Expand All @@ -202,6 +199,15 @@ used in the downstream processing for additional enhancement or filtering.
.. autodata:: sdp.processors.tts.metrics.BandwidthEstimationProcessor
:annotation:

.. autodata:: sdp.processors.FasterWhisperInference
:annotation:

.. autodata:: sdp.processors.vLLMInference
:annotation:

.. autodata:: sdp.processors.AudioLid
:annotation:

Text-only processors
####################

Expand Down Expand Up @@ -246,6 +252,9 @@ Data modifications
.. autodata:: sdp.processors.ListToEntries
:annotation:

.. autodata:: sdp.processors.EstimateBandwidth
:annotation:

Data filtering
''''''''''''''

Expand Down Expand Up @@ -364,6 +373,18 @@ Data filtering
.. autodata:: sdp.processors.RejectIfBanned
:annotation:

.. autodata:: sdp.processors.DetectWhisperHallucinationFeatures
:annotation:

.. autodata:: sdp.processors.CleanQwenGeneration
:annotation:

.. autodata:: sdp.processors.GetRttmSegments
:annotation:

.. autodata:: sdp.processors.SplitAudioFile
:annotation:

Miscellaneous
#############

Expand Down
4 changes: 4 additions & 0 deletions requirements/main.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,7 @@ datasets>=2.14.0,<3.0.0
# for some processors, https://github.com/NVIDIA/NeMo is additionally required
# for some processors, nemo_text_processing is additionally required
# for mcv: apt-get update && apt-get upgrade -y && apt-get install -y sox libsox-fmt-all
# the FasterWhisperInference processor additionally requires:
# pip install pytorch-lightning nvidia-cublas-cu12 nvidia-cudnn-cu12==9.* faster_whisper
# export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`
# the vLLMInference processor additionally requires: pip install "optree>=0.13.0" vllm
17 changes: 12 additions & 5 deletions sdp/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@
from sdp.processors.huggingface.create_initial_manifest import (
CreateInitialManifestHuggingFace,
)
from sdp.processors.huggingface.speech_recognition import ASRTransformers
from sdp.processors.modify_manifest.common import (
AddConstantFields,
ApplyInnerJoin,
Expand Down Expand Up @@ -119,6 +118,7 @@
SubRegex,
ListToEntries,
LambdaExpression,
EstimateBandwidth,
)
from sdp.processors.modify_manifest.data_to_dropbool import (
DropASRError,
Expand All @@ -141,6 +141,16 @@
from sdp.processors.modify_manifest.make_letters_uppercase_after_period import (
MakeLettersUppercaseAfterPeriod,
)
from sdp.processors.inference.asr.nemo.asr_inference import ASRInference
from sdp.processors.inference.asr.nemo.lid_inference import AudioLid
from sdp.processors.inference.asr.faster_whisper.faster_whisper_inference import FasterWhisperInference
from sdp.processors.inference.asr.transformers.speech_recognition import ASRTransformers
from sdp.processors.inference.asr.utils.whisper_hallucinations import DetectWhisperHallucinationFeatures
from sdp.processors.inference.asr.utils.rttm import GetRttmSegments, SplitAudioFile
from sdp.processors.inference.nlp.nemo.pc_inference import PCInference
from sdp.processors.inference.llm.vllm.vllm import vLLMInference
from sdp.processors.inference.llm.utils.qwen_cleaning import CleanQwenGeneration

from sdp.processors.manage_files.convert_audio import (
FfmpegConvert,
SoxConvert,
Expand All @@ -151,10 +161,7 @@
from sdp.processors.manage_files.remove import (
RemoveFiles,
)
from sdp.processors.nemo.asr_inference import ASRInference
from sdp.processors.nemo.estimate_bandwidth import EstimateBandwidth
from sdp.processors.nemo.lid_inference import AudioLid
from sdp.processors.nemo.pc_inference import PCInference

from sdp.processors.toloka.accept_if import AcceptIfWERLess
from sdp.processors.toloka.create_pool import CreateTolokaPool
from sdp.processors.toloka.create_project import CreateTolokaProject
Expand Down
Loading
Loading