Skip to content

Commit 7c16f21

Browse files
Uzbek processors added (#91)
Uzbek processors added Signed-off-by: Rima Shahbazyan <[email protected]>
1 parent 39539d9 commit 7c16f21

File tree

10 files changed

+626
-3
lines changed

10 files changed

+626
-3
lines changed

.github/workflows/tests.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ jobs:
7575
pip install Cython wheel # need to pre-install to avoid error in nemo installation
7676
pip install "nemo_toolkit[asr,nlp]"
7777
python -m pip cache purge
78+
7879
- name: Run all tests
7980
env:
8081
AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }}
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
documentation: |
2+
FLEURS
3+
######
4+
This config can be used to prepare
5+
`FLEURS <https://huggingface.co/datasets/google/fleurs>`_
6+
dataset in the NeMo format.
7+
It produces a manifest for the Uzbek language.
8+
This config performs the following data processing.
9+
10+
1. Downloads FLEURS data
11+
2. Calculates the length of wav files
12+
3. Adjusts the text by removing punctuation marks and replacing some inconsistent characters.
13+
14+
15+
**Required arguments**.
16+
17+
* **workspace_dir**: specify the workspace folder where all audio files will be stored.
18+
* **data_split**: should be "train", "dev" or "test".
19+
20+
Note that you can customize any part of this config either directly or from command-line.
21+
22+
**Output format**
23+
24+
This config generates output manifest files:
25+
26+
* ``${final_manifest}`` (by default ``${workspace_dir}/${data_split}_manifest.json``) - the selected ``data_split`` subset of the data.
27+
28+
Output manifest contains the following keys:
29+
30+
* **audio_filepath (str)**: relative path to the audio files.
31+
* **text (str)**: transcription, including punctuation ".,-?" and capitalization.
32+
* **duration (float)**: audio duration in seconds.
33+
processors_to_run: '0:'
34+
workspace_dir: ???
35+
data_split: dev
36+
save_dir: ${workspace_dir}
37+
final_manifest: ${workspace_dir}/${data_split}_manifest.json
38+
39+
processors:
40+
# create the initial manifest for the selected Uzbek data split (dev by default)
41+
- _target_: sdp.processors.CreateInitialManifestFleurs
42+
lang: "uz_uz"
43+
split: ${data_split}
44+
raw_data_dir: ${workspace_dir}/raw_data
45+
46+
- _target_: sdp.processors.GetAudioDuration
47+
audio_filepath_key: audio_filepath
48+
duration_key: duration
49+
50+
- _target_: sdp.processors.SubRegex
51+
text_key: text
52+
53+
regex_params_list:
54+
- {"pattern": ":", "repl": ''}
55+
56+
# replace all the inconsistent apostrophe characters for oʻ and gʻ with ʻ
57+
- {"pattern": "(?<=o|g|O|G)‘", "repl": "ʻ"}
58+
- {"pattern": "(?<=o|g|O|G)’", "repl": "ʻ"}
59+
- {"pattern": "(?<=o|g|O|G)`", "repl": "ʻ"}
60+
- {"pattern": "(?<=o|g|O|G)'", "repl": "ʻ"}
61+
- {"pattern": '(?<=o|g|O|G)ʼ', "repl": "ʻ"}
62+
63+
# replace all the inconsistent apostrophe characters besides those in oʻ and gʻ with ’
64+
- {"pattern": "‘", "repl": "’"}
65+
- {"pattern": "`", "repl": "’"}
66+
- {"pattern": "'", "repl": "’"}
67+
- {"pattern": 'ʼ', "repl": "’"}
68+
- {"pattern": '(?<!o|g|O|G)ʻ', "repl": "’"}
69+
70+
test_cases:
71+
- { input: { text: "Bir sig’ir ka'tta qashshoqlikni yopadi." }, output: { text: "Bir sigʻir ka’tta qashshoqlikni yopadi." }}
72+
- { input: { text: "O‘shanda yapon universiteti ta’lim grantini yutib olgandim." }, output: { text: "Oʻshanda yapon universiteti ta’lim grantini yutib olgandim." }}
73+
74+
75+
- _target_: sdp.processors.SubRegex
76+
text_key: text
77+
78+
regex_params_list:
79+
- {"pattern": ":", "repl": ""}
80+
81+
- {"pattern": "!", "repl": "."}
82+
- {"pattern": "\r", "repl": ""}
83+
84+
- {"pattern": '―', "repl": "-"}
85+
- {"pattern": '—', "repl": "-"}
86+
- {"pattern": '⁻', "repl": "-"}
87+
- {"pattern": '‑', "repl": "-"}
88+
- {"pattern": '–', "repl": "-"}
89+
90+
- {"pattern": '"', "repl": ""}
91+
- {"pattern": '“', "repl": ""}
92+
- {"pattern": '”', "repl": ""}
93+
- {"pattern": '„', "repl": ""}
94+
- {"pattern": '‟', "repl": ""}
95+
- {"pattern": ';', "repl": ","}
96+
- {"pattern": '…', "repl": "."}
97+
- {"pattern": '\.\.\.', "repl": "."}
98+
99+
# for Ŏ ŏ Ó ó Ō ō Õ õ
100+
- {"pattern": "Ŏ", "repl": "Oʻ"}
101+
- {"pattern": "ŏ", "repl": "oʻ"}
102+
- {"pattern": "Ó", "repl": "Oʻ"}
103+
- {"pattern": "ó", "repl": "oʻ"}
104+
- {"pattern": "Ō", "repl": "Oʻ"}
105+
- {"pattern": "ō", "repl": "oʻ"}
106+
- {"pattern": "Õ", "repl": "Oʻ"}
107+
- {"pattern": "õ", "repl": "oʻ"}
108+
109+
#for Ğ ğ Ǵ ǵ Ḡ ḡ Ğ ğ
110+
- {"pattern": "Ğ", "repl": "Gʻ"}
111+
- {"pattern": "ğ", "repl": "gʻ"}
112+
- {"pattern": "Ǵ", "repl": "Gʻ"}
113+
- {"pattern": "ǵ", "repl": "gʻ"}
114+
- {"pattern": "Ḡ", "repl": "Gʻ"}
115+
- {"pattern": "ḡ", "repl": "gʻ"}
116+
- {"pattern": "Ğ", "repl": "Gʻ"}
117+
- {"pattern": "ğ", "repl": "gʻ"}
118+
119+
#for Ş ş Ç ç Ñ ñ
120+
- {"pattern": "Ş", "repl": "Sh"}
121+
- {"pattern": "ş", "repl": "sh"}
122+
- {"pattern": "Ç", "repl": "Ch"}
123+
- {"pattern": "ç", "repl": "ch"}
124+
- {"pattern": "Ñ", "repl": "Ng"}
125+
- {"pattern": "ñ", "repl": "ng"}
126+
127+
test_cases:
128+
- { input: { text: "Bir siḡir katta; qashshoqlikni yopadi." }, output: { text: "Bir sigʻir katta, qashshoqlikni yopadi." }}
129+
130+
131+
- _target_: sdp.processors.DropIfNoneOfRegexMatch
132+
regex_patterns: ["^( [A-Z])(.)+"]
133+
test_cases:
134+
- { input: { text: "one One" }, output: null }
135+
- { input: { text: "One one" }, output: { text: "One one" } }
136+
137+
138+
- _target_: sdp.processors.DropNonAlphabet
139+
alphabet: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZʻ’.,-? "
140+
test_cases:
141+
- { input: { text: "• Bir sigir katta qashshoqlikni yopadi." }, output: null }
142+
- { input: { text: "Bir sigir 2 katta qashshoqlikni yopadi" }, output: null }
143+
144+
- { input: { text: "Bu vaqtga qadar u oʻzi yoqtirmagan kasbda faoliyat yuritgan." }, output: { text: "Bu vaqtga qadar u oʻzi yoqtirmagan kasbda faoliyat yuritgan." } }
145+
output_manifest_file: ${final_manifest}
146+
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
documentation: |
2+
MCV Uzbek
3+
###########
4+
5+
This config is designed for the
6+
`Mozilla Common Voice (MCV) <https://commonvoice.mozilla.org/>`_ dataset
7+
17.0 release, but should work for any subsequent releases as well.
8+
9+
It performs the following data processing.
10+
11+
1. Extracts and converts all data to the specified manifest format.
12+
2. Gets audio durations and then keeps only instances with the duration greater than 0.
13+
3. Adjusts the text by removing punctuation marks and replacing some inconsistent characters.
14+
15+
16+
**Required arguments**.
17+
18+
* **workspace_dir**: specify the workspace folder where all audio files will be stored.
19+
You need to manually place the downloaded .tar data files inside the
20+
``<workspace dir>`` folder.
21+
* **data_split**: should be "train", "dev" or "test".
22+
23+
Note that you can customize any part of this config either directly or from command-line.
24+
Here are some common customizations to consider:
25+
26+
* **remove_pc**: set to True if punctuation and capitalization (P&C) are not needed. Defaults to False.
27+
* **remove_hyphen**: set to True if hyphens are not needed. Defaults to True.
28+
29+
**Output format**.
30+
31+
This config dumps the final manifest at ``${workspace_dir}/${data_split}_manifest.json``.
32+
The output manifest contains the following fields:
33+
34+
* **audio_filepath (str)**: relative path to the audio files.
35+
* **text (str)**: transcription, including punctuation ".,?" and capitalization.
36+
* **duration (float)**: audio duration in seconds.
37+
38+
processors_to_run: '0:'
39+
workspace_dir: ???
40+
data_split: ???
41+
final_manifest: ${workspace_dir}/${data_split}_manifest.json
42+
save_dir: ${workspace_dir}
43+
remove_pc: False
44+
45+
processors:
46+
- _target_: sdp.processors.CreateInitialManifestMCV
47+
language_id: uz
48+
extract_archive_dir: ${workspace_dir}/raw_data
49+
resampled_audio_dir: ${workspace_dir}/${data_split}/audio/
50+
data_split: ${data_split}
51+
raw_data_dir: ${workspace_dir}
52+
output_manifest_file: ${save_dir}/${data_split}_manifest_1.json
53+
54+
- _target_: sdp.processors.SubRegex
55+
text_key: text
56+
output_manifest_file: ${save_dir}/${data_split}_manifest_2.json
57+
58+
regex_params_list:
59+
- {"pattern": ":", "repl": ''}
60+
61+
# replace all the inconsistent apostrophe characters for oʻ and gʻ with ʻ
62+
- {"pattern": "(?<=o|g|O|G)‘", "repl": "ʻ"}
63+
- {"pattern": "(?<=o|g|O|G)’", "repl": "ʻ"}
64+
- {"pattern": "(?<=o|g|O|G)`", "repl": "ʻ"}
65+
- {"pattern": "(?<=o|g|O|G)'", "repl": "ʻ"}
66+
- {"pattern": '(?<=o|g|O|G)ʼ', "repl": "ʻ"}
67+
68+
# replace all the inconsistent apostrophe characters besides those in oʻ and gʻ with ’
69+
- {"pattern": "‘", "repl": "’"}
70+
- {"pattern": "`", "repl": "’"}
71+
- {"pattern": "'", "repl": "’"}
72+
- {"pattern": 'ʼ', "repl": "’"}
73+
- {"pattern": '(?<!o|g|O|G)ʻ', "repl": "’"}
74+
75+
test_cases:
76+
- { input: { text: "Bir sig’ir ka'tta qashshoqlikni yopadi." }, output: { text: "Bir sigʻir ka’tta qashshoqlikni yopadi." }}
77+
- { input: { text: "O‘shanda yapon universiteti ta’lim grantini yutib olgandim." }, output: { text: "Oʻshanda yapon universiteti ta’lim grantini yutib olgandim." }}
78+
79+
80+
- _target_: sdp.processors.SubRegex
81+
text_key: text
82+
output_manifest_file: ${save_dir}/${data_split}_manifest_3.json
83+
84+
regex_params_list:
85+
- {"pattern": ":", "repl": ""}
86+
87+
- {"pattern": "!", "repl": "."}
88+
- {"pattern": "\r", "repl": ""}
89+
90+
- {"pattern": '―', "repl": "-"}
91+
- {"pattern": '—', "repl": "-"}
92+
- {"pattern": '⁻', "repl": "-"}
93+
- {"pattern": '‑', "repl": "-"}
94+
- {"pattern": '–', "repl": "-"}
95+
96+
- {"pattern": '"', "repl": ""}
97+
- {"pattern": '“', "repl": ""}
98+
- {"pattern": '”', "repl": ""}
99+
- {"pattern": '„', "repl": ""}
100+
- {"pattern": '‟', "repl": ""}
101+
- {"pattern": ';', "repl": ","}
102+
- {"pattern": '…', "repl": "."}
103+
- {"pattern": '\.\.\.', "repl": "."}
104+
105+
# for Ŏ ŏ Ó ó Ō ō Õ õ
106+
- {"pattern": "Ŏ", "repl": "Oʻ"}
107+
- {"pattern": "ŏ", "repl": "oʻ"}
108+
- {"pattern": "Ó", "repl": "Oʻ"}
109+
- {"pattern": "ó", "repl": "oʻ"}
110+
- {"pattern": "Ō", "repl": "Oʻ"}
111+
- {"pattern": "ō", "repl": "oʻ"}
112+
- {"pattern": "Õ", "repl": "Oʻ"}
113+
- {"pattern": "õ", "repl": "oʻ"}
114+
115+
#for Ğ ğ Ǵ ǵ Ḡ ḡ Ğ ğ
116+
- {"pattern": "Ğ", "repl": "Gʻ"}
117+
- {"pattern": "ğ", "repl": "gʻ"}
118+
- {"pattern": "Ǵ", "repl": "Gʻ"}
119+
- {"pattern": "ǵ", "repl": "gʻ"}
120+
- {"pattern": "Ḡ", "repl": "Gʻ"}
121+
- {"pattern": "ḡ", "repl": "gʻ"}
122+
- {"pattern": "Ğ", "repl": "Gʻ"}
123+
- {"pattern": "ğ", "repl": "gʻ"}
124+
125+
#for Ş ş Ç ç Ñ ñ
126+
- {"pattern": "Ş", "repl": "Sh"}
127+
- {"pattern": "ş", "repl": "sh"}
128+
- {"pattern": "Ç", "repl": "Ch"}
129+
- {"pattern": "ç", "repl": "ch"}
130+
- {"pattern": "Ñ", "repl": "Ng"}
131+
- {"pattern": "ñ", "repl": "ng"}
132+
133+
test_cases:
134+
- { input: { text: "Bir siḡir katta; qashshoqlikni yopadi." }, output: { text: "Bir sigʻir katta, qashshoqlikni yopadi." }}
135+
136+
137+
- _target_: sdp.processors.DropIfNoneOfRegexMatch
138+
regex_patterns: ["^( [A-Z])(.)+"]
139+
test_cases:
140+
- { input: { text: "one One" }, output: null }
141+
- { input: { text: "One one" }, output: { text: "One one" } }
142+
143+
144+
- _target_: sdp.processors.DropNonAlphabet
145+
alphabet: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZʻ’.,-? "
146+
test_cases:
147+
- { input: { text: "• Bir sigir katta qashshoqlikni yopadi." }, output: null }
148+
- { input: { text: "Bir sigir 2 katta qashshoqlikni yopadi" }, output: null }
149+
150+
- { input: { text: "Bu vaqtga qadar u oʻzi yoqtirmagan kasbda faoliyat yuritgan." }, output: { text: "Bu vaqtga qadar u oʻzi yoqtirmagan kasbda faoliyat yuritgan." } }
151+
output_manifest_file: ${final_manifest}

0 commit comments

Comments
 (0)