Skip to content

Commit 4266da1

Browse files
committed
merge
Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
2 parents 846a633 + c53be5e commit 4266da1

File tree

7 files changed

+26
-18
lines changed

7 files changed

+26
-18
lines changed

.github/workflows/tests.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,14 +75,20 @@ jobs:
7575
pip install nemo-toolkit[asr,nlp]==1.23.0
7676
pip install nemo_text_processing
7777
pip install -r requirements/huggingface.txt
78+
pip install certifi # this is needed to avoid problems with certificates [CORAAL]
79+
export SSL_CERT_FILE=$(python -m certifi)
7880
python -m pip cache purge
81+
7982
8083
- name: Run all tests
8184
env:
8285
AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }}
8386
AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }}
8487
CLEAN_UP_TMP_PATH: 1
8588
run: |
89+
wget https://uit.stanford.edu/sites/default/files/2023/10/11/incommon-rsa-ca2.pem # downloading cert manually [for CORAAL]
90+
sudo cp incommon-rsa-ca2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # [cert for CORAAL]
91+
sudo update-ca-certificates # [cert for CORAAL]
8692
set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
8793
python -m pytest tests/ --junitxml=pytest.xml --ignore=tests/test_tts_sdp_end_to_end.py --cov-report=term-missing:skip-covered --cov=sdp --durations=30 -rs | tee pytest-coverage.txt
8894

dataset_configs/english/coraal/config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ documentation: |
1818
This config performs the following data processing.
1919
2020
1. Downloads CORAAL data based on the
21-
`official file list <http://lingtools.uoregon.edu/coraal/coraal_download_list.txt>`_.
21+
`official file list <https://lingtools.uoregon.edu/coraal/coraal_download_list.txt>`_ (official mirror link).
2222
There are a couple of errors in the links there, which are fixed in our code.
2323
2. Drops all utterances which contain only pauses. Set ``drop_pauses=False`` to undo.
2424
3. Groups all consecutive segments from the same speaker until 20 seconds duration

docs/src/conf.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@
4545
"numpy",
4646
"tqdm",
4747
"soundfile",
48-
"ndjson",
4948
"boto3",
5049
"webvtt_py",
5150
"python_docx",
@@ -189,3 +188,8 @@ def setup(app):
189188
]
190189
# nitpick_ignore_regex = [('py:class', '*')]
191190

191+
# Temporary: skip the CORAAL download list URL during linkcheck.
192+
linkcheck_ignore = [
193+
r'https://lingtools\.uoregon\.edu/coraal/coraal_download_list\.txt',
194+
]
195+
# https://lingtools.uoregon.edu/coraal/coraal_download_list.txt

requirements/main.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ ffmpeg
44
hydra-core
55
joblib
66
librosa>=0.10.0 # specify >=0.10.0 so that librosa.get_duration(path=...) will work
7-
numpy==1.26
7+
numpy>=1.26, <2.0 # the code was written against numpy 1.x and may crash on 2.x
88
omegaconf
99
pandas
1010
rarfile
@@ -18,7 +18,7 @@ python-docx
1818
pydub
1919
dask
2020
distributed
21-
21+
jiwer>=3.1.0,<4.0.0
2222
# toloka-kit # Temporarily disabled due to Toloka's technical pause; keep as reference for past and future API support
2323
# for some processors, additionally https://github.com/NVIDIA/NeMo is required
2424
# for some processors, additionally nemo_text_processing is required

requirements/tts.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
ndjson
21
transformers
32
accelerate
43
torchaudio

sdp/processors/datasets/coraal/create_initial_manifest.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,15 @@ def get_coraal_url_list():
3131
There are a few mistakes in the official url list that are fixed here.
3232
Can be overridden by tests to select a subset of urls.
3333
"""
34-
dataset_url = "http://lingtools.uoregon.edu/coraal/coraal_download_list.txt"
34+
dataset_url = "https://lingtools.uoregon.edu/coraal/coraal_download_list.txt"
3535
urls = []
3636
for file_url in urllib.request.urlopen(dataset_url):
3737
file_url = file_url.decode('utf-8').strip()
3838
# fixing known errors in the urls
39-
if file_url == 'http://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2018.10.06.txt':
40-
file_url = 'http://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2021.07.txt'
41-
if file_url == 'http://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2018.10.06.txt':
42-
file_url = 'http://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2021.07.txt'
39+
if file_url == 'https://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2018.10.06.txt':
40+
file_url = 'https://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2021.07.txt'
41+
if file_url == 'https://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2018.10.06.txt':
42+
file_url = 'https://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2021.07.txt'
4343
urls.append(file_url)
4444
return urls
4545

tests/test_tts_sdp_end_to_end.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from omegaconf import OmegaConf
99

1010
from sdp.run_processors import run_processors
11+
from sdp.utils.common import load_manifest
1112

1213
DATASET_CONFIGS_ROOT = Path(__file__).parents[1] / "dataset_configs"
1314

@@ -69,16 +70,14 @@ def test_tts_sdp_end_to_end(get_tts_ytc_data):
6970

7071
assert os.path.exists(cfg.final_manifest)
7172
output_file_data = {}
72-
with open(cfg.final_manifest, "r") as f:
73-
output_data = ndjson.load(f)
74-
for item in output_data:
75-
output_file_data[item["audio_item_id"]] = item
73+
output_data = load_manifest(cfg.final_manifest, encoding="utf8")
74+
for item in output_data:
75+
output_file_data[item["audio_item_id"]] = item
7676

7777
reference_file_data = {}
78-
with open(reference_manifest_file, "r") as f:
79-
reference_data = ndjson.load(f)
80-
for item in reference_data:
81-
reference_file_data[item["audio_item_id"]] = item
78+
reference_data = load_manifest(reference_manifest_file, encoding="utf8")
79+
for item in reference_data:
80+
reference_file_data[item["audio_item_id"]] = item
8281

8382
assert len(output_file_data) == len(reference_file_data)
8483
assert len(output_file_data) == 2

0 commit comments

Comments
 (0)