Skip to content

Commit 84d19f6

Browse files
authored
[egs] In formosa recipe, remove \u3000 and \u00a0 to prevent failure in validate_text.pl (#3913)
Remove U+3000 (ideographic space, UTF-8 bytes \xE3 \x80 \x80) and U+00A0 (no-break space, UTF-8 bytes \xC2 \xA0) with sed when preparing the data files utt2spk, wav.scp and text, to prevent a failure in the mfcc stage. The failing call chain is: mfcc => steps/make_mfcc_pitch.sh => utils/validate_data_dir.sh => utils/validate_text.pl, which reports "The line for utterance $utt_id contains disallowed Unicode whitespaces".
1 parent 4225a2a commit 84d19f6

File tree

1 file changed

+6
-6
lines changed

1 file changed

+6
-6
lines changed

egs/formosa/s5/local/prepare_data.sh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@ mkdir -p data/all data/train data/test data/eval data/local/train
3232

3333

3434
# make utt2spk, wav.scp and text
35-
find -L $train_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $y' \; | dos2unix > data/all/utt2spk
36-
find -L $train_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $x' \; | dos2unix > data/all/wav.scp
37-
find -L $train_dir -name *.txt -exec sh -c 'x={}; y=$(basename -s .txt $x); printf "%s " $y; cat $x' \; | dos2unix > data/all/text
35+
# utt2spk: one "<utt-id> <utt-id>" line per .wav found under $train_dir; the
# file's basename without ".wav" is written in both columns.
# The -name pattern is quoted so the shell cannot expand it against the
# current directory, and the path reaches the inner shell as "$1" instead of
# being spliced into its program text through {}.
# sed deletes the UTF-8 byte sequences of U+3000 (e3 80 80) and U+00A0 (c2 a0).
find -L "$train_dir" -name '*.wav' \
  -exec sh -c 'y=$(basename -s .wav "$1"); printf "%s %s\n" "$y" "$y"' sh {} \; \
  | sed 's/\xe3\x80\x80\|\xc2\xa0//g' | dos2unix > data/all/utt2spk
36+
# wav.scp: one "<utt-id> <path>" line per .wav found under $train_dir; the
# id is the file's basename without ".wav", the path is what find reported.
# The -name pattern is quoted so the shell cannot expand it against the
# current directory, and the path reaches the inner shell as "$1" instead of
# being spliced into its program text through {}.
# sed deletes the UTF-8 byte sequences of U+3000 (e3 80 80) and U+00A0 (c2 a0).
# NOTE(review): sed filters the whole line, so these bytes are also removed
# from the path column; if a file name on disk ever contains them, the path
# written here would no longer match that file. Confirm this cannot happen.
find -L "$train_dir" -name '*.wav' \
  -exec sh -c 'y=$(basename -s .wav "$1"); printf "%s %s\n" "$y" "$1"' sh {} \; \
  | sed 's/\xe3\x80\x80\|\xc2\xa0//g' | dos2unix > data/all/wav.scp
37+
# text: for every .txt found under $train_dir, emit the file's basename
# without ".txt" followed by a space and then the file's contents.
# The -name pattern is quoted so the shell cannot expand it against the
# current directory, and the path reaches the inner shell as "$1" instead of
# being spliced into its program text through {}.
# sed deletes the UTF-8 byte sequences of U+3000 (e3 80 80) and U+00A0 (c2 a0).
find -L "$train_dir" -name '*.txt' \
  -exec sh -c 'y=$(basename -s .txt "$1"); printf "%s " "$y"; cat "$1"' sh {} \; \
  | sed 's/\xe3\x80\x80\|\xc2\xa0//g' | dos2unix > data/all/text
3838

3939
# fix_data_dir.sh fixes common mistakes (unsorted entries in wav.scp,
4040
# duplicate entries and so on). Also, it regenerates the spk2utt from
@@ -51,9 +51,9 @@ echo "cp data/train/text data/local/train/text for language model training"
5151
# Strip the first column from data/train/text: the first awk blanks
# field 1, the second reassigns $1 so the record is rebuilt without the
# leading separator left behind. awk reads the file directly; no cat needed.
awk '{$1=""}1;' data/train/text \
  | awk '{$1=$1}1;' > data/local/train/text
5252

5353
# preparing EVAL set.
54-
find -L $eval_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $y' \; | dos2unix > data/eval/utt2spk
55-
find -L $eval_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $x' \; | dos2unix > data/eval/wav.scp
56-
find -L $eval_key_dir -name *.txt -exec sh -c 'x={}; y=$(basename -s .txt $x); printf "%s " $y; cat $x' \; | dos2unix > data/eval/text
54+
# utt2spk for the eval set: one "<utt-id> <utt-id>" line per .wav found under
# $eval_dir; the file's basename without ".wav" is written in both columns.
# The -name pattern is quoted so the shell cannot expand it against the
# current directory, and the path reaches the inner shell as "$1" instead of
# being spliced into its program text through {}.
# sed deletes the UTF-8 byte sequences of U+3000 (e3 80 80) and U+00A0 (c2 a0).
find -L "$eval_dir" -name '*.wav' \
  -exec sh -c 'y=$(basename -s .wav "$1"); printf "%s %s\n" "$y" "$y"' sh {} \; \
  | sed 's/\xe3\x80\x80\|\xc2\xa0//g' | dos2unix > data/eval/utt2spk
55+
# wav.scp for the eval set: one "<utt-id> <path>" line per .wav found under
# $eval_dir; the id is the file's basename without ".wav", the path is what
# find reported.
# The -name pattern is quoted so the shell cannot expand it against the
# current directory, and the path reaches the inner shell as "$1" instead of
# being spliced into its program text through {}.
# sed deletes the UTF-8 byte sequences of U+3000 (e3 80 80) and U+00A0 (c2 a0).
# NOTE(review): sed filters the whole line, so these bytes are also removed
# from the path column; if a file name on disk ever contains them, the path
# written here would no longer match that file. Confirm this cannot happen.
find -L "$eval_dir" -name '*.wav' \
  -exec sh -c 'y=$(basename -s .wav "$1"); printf "%s %s\n" "$y" "$1"' sh {} \; \
  | sed 's/\xe3\x80\x80\|\xc2\xa0//g' | dos2unix > data/eval/wav.scp
56+
# text for the eval set: for every .txt found under $eval_key_dir, emit the
# file's basename without ".txt" followed by a space and then the file's
# contents.
# The -name pattern is quoted so the shell cannot expand it against the
# current directory, and the path reaches the inner shell as "$1" instead of
# being spliced into its program text through {}.
# sed deletes the UTF-8 byte sequences of U+3000 (e3 80 80) and U+00A0 (c2 a0).
find -L "$eval_key_dir" -name '*.txt' \
  -exec sh -c 'y=$(basename -s .txt "$1"); printf "%s " "$y"; cat "$1"' sh {} \; \
  | sed 's/\xe3\x80\x80\|\xc2\xa0//g' | dos2unix > data/eval/text
5757
utils/fix_data_dir.sh data/eval
5858

5959
echo "Data preparation completed."

0 commit comments

Comments
 (0)