File tree Expand file tree Collapse file tree 7 files changed +26
-18
lines changed
dataset_configs/english/coraal
sdp/processors/datasets/coraal Expand file tree Collapse file tree 7 files changed +26
-18
lines changed Original file line number Diff line number Diff line change @@ -75,14 +75,20 @@ jobs:
7575 pip install nemo-toolkit[asr,nlp]==1.23.0
7676 pip install nemo_text_processing
7777 pip install -r requirements/huggingface.txt
78+ pip install certifi # this is needed to avoid certificate problems [CORAAL]
79+ export SSL_CERT_FILE=$(python -m certifi)
7880 python -m pip cache purge
81+
7982
8083 - name : Run all tests
8184 env :
8285 AWS_SECRET_KEY : ${{ secrets.AWS_SECRET_KEY }}
8386 AWS_ACCESS_KEY : ${{ secrets.AWS_ACCESS_KEY }}
8487 CLEAN_UP_TMP_PATH : 1
8588 run : |
89+ wget https://uit.stanford.edu/sites/default/files/2023/10/11/incommon-rsa-ca2.pem # download the certificate manually [for CORAAL]
90+ sudo cp incommon-rsa-ca2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # [cert for CORAAL]
91+ sudo update-ca-certificates # [cert for CORAAL]
8692 set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
8793 python -m pytest tests/ --junitxml=pytest.xml --ignore=tests/test_tts_sdp_end_to_end.py --cov-report=term-missing:skip-covered --cov=sdp --durations=30 -rs | tee pytest-coverage.txt
8894
Original file line number Diff line number Diff line change @@ -18,7 +18,7 @@ documentation: |
1818 This config performs the following data processing.
1919
2020 1. Downloads CORAAL data based on the
21- `official file list <http ://lingtools.uoregon.edu/coraal/coraal_download_list.txt>`_.
21+ `official file list <https ://lingtools.uoregon.edu/coraal/coraal_download_list.txt>`_ (official mirror link).
2222 There are a couple of errors in the links there, which are fixed in our code.
2323 2. Drops all utterances which contain only pauses. Set ``drop_pauses=False`` to undo.
2424 3. Groups all consecutive segments from the same speaker until 20 seconds duration
Original file line number Diff line number Diff line change 4545 "numpy" ,
4646 "tqdm" ,
4747 "soundfile" ,
48- "ndjson" ,
4948 "boto3" ,
5049 "webvtt_py" ,
5150 "python_docx" ,
@@ -189,3 +188,8 @@ def setup(app):
189188]
190189# nitpick_ignore_regex = [('py:class', '*')]
191190
191+ # temporarily added specifically for CORAAL
192+ linkcheck_ignore = [
193+ r'https://lingtools\.uoregon\.edu/coraal/coraal_download_list\.txt' ,
194+ ]
195+ # https://lingtools.uoregon.edu/coraal/coraal_download_list.txt
Original file line number Diff line number Diff line change 44hydra-core
55joblib
66librosa>=0.10.0 # specify >=0.10.0 so that librosa.get_duration(path=...) will work
7- numpy== 1.26
7+ numpy>= 1.26, <2.0 # the code was written against numpy 1.x and may crash on 2.x
88omegaconf
99pandas
1010rarfile
@@ -18,7 +18,7 @@ python-docx
1818pydub
1919dask
2020distributed
21-
21+ jiwer>=3.1.0,<4.0.0
2222# toloka-kit # Temporarily disabled due to Toloka's technical pause; keep as reference for past and future API support
2323# for some processors, additionally https://github.com/NVIDIA/NeMo is required
2424# for some processors, additionally nemo_text_processing is required
Original file line number Diff line number Diff line change 1- ndjson
21transformers
32accelerate
43torchaudio
Original file line number Diff line number Diff line change @@ -31,15 +31,15 @@ def get_coraal_url_list():
3131 There are a few mistakes in the official url list that are fixed here.
3232 Can be overridden by tests to select a subset of urls.
3333 """
34- dataset_url = "http ://lingtools.uoregon.edu/coraal/coraal_download_list.txt"
34+ dataset_url = "https ://lingtools.uoregon.edu/coraal/coraal_download_list.txt"
3535 urls = []
3636 for file_url in urllib .request .urlopen (dataset_url ):
3737 file_url = file_url .decode ('utf-8' ).strip ()
3838 # fixing known errors in the urls
39- if file_url == 'http ://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2018.10.06.txt' :
40- file_url = 'http ://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2021.07.txt'
41- if file_url == 'http ://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2018.10.06.txt' :
42- file_url = 'http ://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2021.07.txt'
39+ if file_url == 'https ://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2018.10.06.txt' :
40+ file_url = 'https ://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2021.07.txt'
41+ if file_url == 'https ://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2018.10.06.txt' :
42+ file_url = 'https ://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2021.07.txt'
4343 urls .append (file_url )
4444 return urls
4545
Original file line number Diff line number Diff line change 88from omegaconf import OmegaConf
99
1010from sdp .run_processors import run_processors
11+ from sdp .utils .common import load_manifest
1112
1213DATASET_CONFIGS_ROOT = Path (__file__ ).parents [1 ] / "dataset_configs"
1314
@@ -69,16 +70,14 @@ def test_tts_sdp_end_to_end(get_tts_ytc_data):
6970
7071 assert os .path .exists (cfg .final_manifest )
7172 output_file_data = {}
72- with open (cfg .final_manifest , "r" ) as f :
73- output_data = ndjson .load (f )
74- for item in output_data :
75- output_file_data [item ["audio_item_id" ]] = item
73+ output_data = load_manifest (cfg .final_manifest , encoding = "utf8" )
74+ for item in output_data :
75+ output_file_data [item ["audio_item_id" ]] = item
7676
7777 reference_file_data = {}
78- with open (reference_manifest_file , "r" ) as f :
79- reference_data = ndjson .load (f )
80- for item in reference_data :
81- reference_file_data [item ["audio_item_id" ]] = item
78+ reference_data = load_manifest (reference_manifest_file , encoding = "utf8" )
79+ for item in reference_data :
80+ reference_file_data [item ["audio_item_id" ]] = item
8281
8382 assert len (output_file_data ) == len (reference_file_data )
8483 assert len (output_file_data ) == 2
You can’t perform that action at this time.
0 commit comments