|
| 1 | +documentation: | |
| 2 | + MCV Uzbek |
| 3 | + ########### |
| 4 | +
|
| 5 | + This config is designed for the |
| 6 | + `Mozilla Common Voice (MCV) <https://commonvoice.mozilla.org/>`_ dataset |
| 7 | + 17.0 release, but should work for any subsequent releases as well. |
| 8 | +
|
| 9 | + It performs the following data processing. |
| 10 | +
|
| 11 | + 1. Extracts and converts all data to the specified manifest format. |
| 12 | + 2. Gets audio durations and then keeps only instances with the duration greater than 0. |
| 13 | + 3. Adjusts the text by removing punctuation marks and replacing some inconsistent characters. |
| 14 | +
|
| 15 | +
|
| 16 | + **Required arguments**. |
| 17 | +
|
| 18 | + * **workspace_dir**: specify the workspace folder where all audio files will be stored. |
| 19 | + You need to manually place the downloaded .tar files inside the |
| 20 | + ``<workspace dir>`` folder. |
| 21 | + * **data_split**: should be "train", "dev" or "test". |
| 22 | +
|
| 23 | + Note that you can customize any part of this config either directly or from command-line. |
| 24 | + Here are some common customizations to consider: |
| 25 | +
|
| 26 | + * **remove_pc**: set to True if P&C is not needed. Defaults to False. |
| 27 | + * **remove_hyphen**: set to True if hyphens are not needed. Defaults to True. |
| 28 | +
|
| 29 | + **Output format**. |
| 30 | +
|
| 31 | + This config dumps the final manifest at ``${workspace_dir}/${data_split}_manifest.json``. |
| 32 | + The output manifest contains the following fields: |
| 33 | +
|
| 34 | + * **audio_filepath (str)**: relative path to the audio files. |
| 35 | + * **text (str)**: transcription, including punctuation ".,?" and capitalization. |
| 36 | + * **duration (float)**: audio duration in seconds. |
| 37 | +
|
| 38 | +processors_to_run: '0:' |
| 39 | +workspace_dir: ??? |
| 40 | +data_split: ??? |
| 41 | +final_manifest: ${workspace_dir}/${data_split}_manifest.json |
| 42 | +save_dir: ${workspace_dir} |
| 43 | +remove_pc: False |
| 44 | + |
| 45 | +processors: |
| 46 | + - _target_: sdp.processors.CreateInitialManifestMCV |
| 47 | + language_id: uz |
| 48 | + extract_archive_dir: ${workspace_dir}/raw_data |
| 49 | + resampled_audio_dir: ${workspace_dir}/${data_split}/audio/ |
| 50 | + data_split: ${data_split} |
| 51 | + raw_data_dir: ${workspace_dir} |
| 52 | + output_manifest_file: ${save_dir}/${data_split}_manifest_1.json |
| 53 | + |
| 54 | + - _target_: sdp.processors.SubRegex |
| 55 | + text_key: text |
| 56 | + output_manifest_file: ${save_dir}/${data_split}_manifest_2.json |
| 57 | + |
| 58 | + regex_params_list: |
| 59 | + - {"pattern": ":", "repl": ''} |
| 60 | + |
| 61 | + # replace all the inconsistent apostrophe characters for oʻ and gʻ with ʻ |
| 62 | + - {"pattern": "(?<=o|g|O|G)‘", "repl": "ʻ"} |
| 63 | + - {"pattern": "(?<=o|g|O|G)’", "repl": "ʻ"} |
| 64 | + - {"pattern": "(?<=o|g|O|G)`", "repl": "ʻ"} |
| 65 | + - {"pattern": "(?<=o|g|O|G)'", "repl": "ʻ"} |
| 66 | + - {"pattern": '(?<=o|g|O|G)ʼ', "repl": "ʻ"} |
| 67 | + |
| 68 | + # replace all the inconsistent apostrophe characters besides oʻ and gʻ with ’ |
| 69 | + - {"pattern": "‘", "repl": "’"} |
| 70 | + - {"pattern": "`", "repl": "’"} |
| 71 | + - {"pattern": "'", "repl": "’"} |
| 72 | + - {"pattern": 'ʼ', "repl": "’"} |
| 73 | + - {"pattern": '(?<!o|g|O|G)ʻ', "repl": "’"} |
| 74 | + |
| 75 | + test_cases: |
| 76 | + - { input: { text: "Bir sig’ir ka'tta qashshoqlikni yopadi." }, output: { text: "Bir sigʻir ka’tta qashshoqlikni yopadi." }} |
| 77 | + - { input: { text: "O‘shanda yapon universiteti ta’lim grantini yutib olgandim." }, output: { text: "Oʻshanda yapon universiteti ta’lim grantini yutib olgandim." }} |
| 78 | + |
| 79 | + |
| 80 | + - _target_: sdp.processors.SubRegex |
| 81 | + text_key: text |
| 82 | + output_manifest_file: ${save_dir}/${data_split}_manifest_3.json |
| 83 | + |
| 84 | + regex_params_list: |
| 85 | + - {"pattern": ":", "repl": ""} |
| 86 | + |
| 87 | + - {"pattern": "!", "repl": "."} |
| 88 | + - {"pattern": "\r", "repl": ""} |
| 89 | + |
| 90 | + - {"pattern": '―', "repl": "-"} |
| 91 | + - {"pattern": '—', "repl": "-"} |
| 92 | + - {"pattern": '⁻', "repl": "-"} |
| 93 | + - {"pattern": '‑', "repl": "-"} |
| 94 | + - {"pattern": '–', "repl": "-"} |
| 95 | + |
| 96 | + - {"pattern": '"', "repl": ""} |
| 97 | + - {"pattern": '“', "repl": ""} |
| 98 | + - {"pattern": '”', "repl": ""} |
| 99 | + - {"pattern": '„', "repl": ""} |
| 100 | + - {"pattern": '‟', "repl": ""} |
| 101 | + - {"pattern": ';', "repl": ","} |
| 102 | + - {"pattern": '…', "repl": "."} |
| 103 | + - {"pattern": '\.\.\.', "repl": "."} |
| 104 | + |
| 105 | + # for Ŏ ŏ Ó ó Ō ō Õ õ |
| 106 | + - {"pattern": "Ŏ", "repl": "Oʻ"} |
| 107 | + - {"pattern": "ŏ", "repl": "oʻ"} |
| 108 | + - {"pattern": "Ó", "repl": "Oʻ"} |
| 109 | + - {"pattern": "ó", "repl": "oʻ"} |
| 110 | + - {"pattern": "Ō", "repl": "Oʻ"} |
| 111 | + - {"pattern": "ō", "repl": "oʻ"} |
| 112 | + - {"pattern": "Õ", "repl": "Oʻ"} |
| 113 | + - {"pattern": "õ", "repl": "oʻ"} |
| 114 | + |
| 115 | + #for Ğ ğ Ǵ ǵ Ḡ ḡ Ğ ğ |
| 116 | + - {"pattern": "Ğ", "repl": "Gʻ"} |
| 117 | + - {"pattern": "ğ", "repl": "gʻ"} |
| 118 | + - {"pattern": "Ǵ", "repl": "Gʻ"} |
| 119 | + - {"pattern": "ǵ", "repl": "gʻ"} |
| 120 | + - {"pattern": "Ḡ", "repl": "Gʻ"} |
| 121 | + - {"pattern": "ḡ", "repl": "gʻ"} |
| 122 | + - {"pattern": "Ğ", "repl": "Gʻ"} |
| 123 | + - {"pattern": "ğ", "repl": "gʻ"} |
| 124 | + |
| 125 | + #for Ş ş Ç ç Ñ ñ |
| 126 | + - {"pattern": "Ş", "repl": "Sh"} |
| 127 | + - {"pattern": "ş", "repl": "sh"} |
| 128 | + - {"pattern": "Ç", "repl": "Ch"} |
| 129 | + - {"pattern": "ç", "repl": "ch"} |
| 130 | + - {"pattern": "Ñ", "repl": "Ng"} |
| 131 | + - {"pattern": "ñ", "repl": "ng"} |
| 132 | + |
| 133 | + test_cases: |
| 134 | + - { input: { text: "Bir siḡir katta; qashshoqlikni yopadi." }, output: { text: "Bir sigʻir katta, qashshoqlikni yopadi." }} |
| 135 | + |
| 136 | + |
| 137 | + - _target_: sdp.processors.DropIfNoneOfRegexMatch |
| 138 | + regex_patterns: ["^( [A-Z])(.)+"] |
| 139 | + test_cases: |
| 140 | + - { input: { text: "one One" }, output: null } |
| 141 | + - { input: { text: "One one" }, output: { text: "One one" } } |
| 142 | + |
| 143 | + |
| 144 | + - _target_: sdp.processors.DropNonAlphabet |
| 145 | + alphabet: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZʻ’.,-? " |
| 146 | + test_cases: |
| 147 | + - { input: { text: "• Bir sigir katta qashshoqlikni yopadi." }, output: null } |
| 148 | + - { input: { text: "Bir sigir 2 katta qashshoqlikni yopadi" }, output: null } |
| 149 | + |
| 150 | + - { input: { text: "Bu vaqtga qadar u oʻzi yoqtirmagan kasbda faoliyat yuritgan." }, output: { text: "Bu vaqtga qadar u oʻzi yoqtirmagan kasbda faoliyat yuritgan." } } |
| 151 | + output_manifest_file: ${final_manifest} |
0 commit comments