Skip to content

Commit 139efff

Browse files
authored
Merge changes from master to pybind11 (#3858)
Sync pybind11 with master
2 parents 9aff362 + 7b0f8d9 commit 139efff

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+2716
-12
lines changed

egs/cmu_cslu_kids/s5/run.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ if [ $stage -le 5 ]; then
105105
utils/mkgraph.sh data/lang_test_tgmed exp/tri1 exp/tri1/graph
106106
steps/decode.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/tri1/graph data/test exp/tri1/decode
107107
# Align - make graph - decode again
108-
steps/align_si.sh --nj 20 --cmd "queue.pl" --use-graphs true data/train data/lang_test_tgmed exp/tri1 exp/tri1_ali
108+
steps/align_si.sh --nj 20 --cmd "$train_cmd" --use-graphs true data/train data/lang_test_tgmed exp/tri1 exp/tri1_ali
109109
utils/mkgraph.sh data/lang_test_tgmed exp/tri1_ali exp/tri1_ali/graph
110110
steps/decode.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/tri1_ali/graph data/test exp/tri1_ali/decode
111111
fi

egs/gale_arabic/s5d/cmd.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# you can change cmd.sh depending on what type of queue you are using.
2+
# If you have no queueing system and want to run on a local machine, you
3+
# can change all instances of 'queue.pl' to run.pl (but be careful and run
4+
# commands one by one: most recipes will exhaust the memory on your
5+
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
6+
# with slurm. Different queues are configured differently, with different
7+
# queue names and different ways of specifying things like memory;
8+
# to account for these differences you can create and edit the file
9+
# conf/queue.conf to match your queue's configuration. Search for
10+
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
11+
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
12+
13+
export train_cmd="retry.pl queue.pl"
14+
export decode_cmd="retry.pl queue.pl"
15+
export mkgraph_cmd="retry.pl queue.pl --mem 8G"
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
link decode_dnn.config

egs/gale_arabic/s5d/conf/mfcc.conf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
--use-energy=false # only non-default option.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# config for high-resolution MFCC features, intended for neural network training.
2+
# Note: we keep all cepstra, so it has the same info as filterbank features,
3+
# but MFCC is more easily compressible (because less correlated) which is why
4+
# we prefer this method.
5+
--use-energy=false # use average of log energy, not energy.
6+
--sample-frequency=16000
7+
--num-mel-bins=40
8+
--num-ceps=40
9+
--low-freq=40 # low cutoff frequency for mel bins
10+
--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
#!/usr/bin/env perl
2+
3+
# Copyright 2014 QCRI (author: Ahmed Ali)
4+
# Apache 2.0
5+
6+
use warnings;
7+
use strict;
8+
use Encode;
9+
use utf8;
10+
11+
12+
13+
# Require exactly two arguments: the input file and the output file.
if (@ARGV != 2) {
    print "usage: $0 <inFile> <onlyArabicFile>\n";
    exit(1);
}

my $inFile = shift(@ARGV);
my $ouFile = shift(@ARGV);

# Three-argument open with the low-precedence "or": with "||" the die would
# bind to the filename string (always true) and a failed open would go
# unnoticed. Both handles decode/encode UTF-8.
open(my $in_fh, '<:encoding(utf8)', $inFile)
    or die "unable to open the input file $inFile\n";
open(my $out_fh, '>:encoding(utf8)', $ouFile)
    or die "unable to open the output mlf file $ouFile\n";

# Transliterate every input line to Buckwalter and write it out.
while (my $line = <$in_fh>) {
    my $BW = convertUTF8ToBuckwalter($line);
    # NOTE(review): the line is not chomped, so it keeps its own newline and
    # a second "\n" is appended here; this matches the existing output format.
    print {$out_fh} "$BW" . "\n";
}

close($in_fh);
# Buffered write errors only surface when the handle is closed.
close($out_fh) or die "unable to write the output mlf file $ouFile\n";
37+
38+
39+
40+
41+
# this function is copied from MADATools.pm: MADA Tools
42+
# Arabic code point -> Buckwalter symbol. The table is filled in a BEGIN block
# so that it is already populated when the sub is called from code that
# appears earlier in the file than this definition.
my %buckwalter_of;
my $arabic_char_re;
BEGIN {
    %buckwalter_of = (
        "\x{0621}" => "'",    ## HAMZA
        "\x{0622}" => '|',    ## ALEF WITH MADDA ABOVE
        "\x{0623}" => '>',    ## ALEF WITH HAMZA ABOVE
        "\x{0624}" => '&',    ## WAW WITH HAMZA ABOVE
        "\x{0625}" => '<',    ## ALEF WITH HAMZA BELOW
        "\x{0626}" => '}',    ## YEH WITH HAMZA ABOVE
        "\x{0627}" => 'A',    ## ALEF
        "\x{0628}" => 'b',    ## BEH
        "\x{0629}" => 'p',    ## TEH MARBUTA
        "\x{062A}" => 't',    ## TEH
        "\x{062B}" => 'v',    ## THEH
        "\x{062C}" => 'j',    ## JEEM
        "\x{062D}" => 'H',    ## HAH
        "\x{062E}" => 'x',    ## KHAH
        "\x{062F}" => 'd',    ## DAL
        "\x{0630}" => '*',    ## THAL
        "\x{0631}" => 'r',    ## REH
        "\x{0632}" => 'z',    ## ZAIN
        "\x{0633}" => 's',    ## SEEN
        "\x{0634}" => '$',    ## SHEEN
        "\x{0635}" => 'S',    ## SAD
        "\x{0636}" => 'D',    ## DAD
        "\x{0637}" => 'T',    ## TAH
        "\x{0638}" => 'Z',    ## ZAH
        "\x{0639}" => 'E',    ## AIN
        "\x{063A}" => 'g',    ## GHAIN
        "\x{0640}" => '_',    ## TATWEEL
        "\x{0641}" => 'f',    ## FEH
        "\x{0642}" => 'q',    ## QAF
        "\x{0643}" => 'k',    ## KAF
        "\x{0644}" => 'l',    ## LAM
        "\x{0645}" => 'm',    ## MEEM
        "\x{0646}" => 'n',    ## NOON
        "\x{0647}" => 'h',    ## HEH
        "\x{0648}" => 'w',    ## WAW
        "\x{0649}" => 'Y',    ## ALEF MAKSURA
        "\x{064A}" => 'y',    ## YEH

        ## Diacritics
        "\x{064B}" => 'F',    ## FATHATAN
        "\x{064C}" => 'N',    ## DAMMATAN
        "\x{064D}" => 'K',    ## KASRATAN
        "\x{064E}" => 'a',    ## FATHA
        "\x{064F}" => 'u',    ## DAMMA
        "\x{0650}" => 'i',    ## KASRA
        "\x{0651}" => '~',    ## SHADDA
        "\x{0652}" => 'o',    ## SUKUN
        "\x{0670}" => '`',    ## SUPERSCRIPT ALEF

        "\x{0671}" => '{',    ## ALEF WASLA
        "\x{067E}" => 'P',    ## PEH
        "\x{0686}" => 'J',    ## TCHEH
        "\x{06A4}" => 'V',    ## VEH
        "\x{06AF}" => 'G',    ## GAF
    );

    # One character class matching every mapped code point.
    my $class = join '', sort keys %buckwalter_of;
    $arabic_char_re = qr/[$class]/;
}

# Transliterate the Arabic characters of a string to Buckwalter symbols.
#
# Takes one (already decoded) string and returns a copy in which each mapped
# Arabic letter or diacritic is replaced by its single-character Buckwalter
# symbol. All other characters, including punctuation and the trailing
# newline, are returned unchanged.
sub convertUTF8ToBuckwalter {
    my ($line) = @_;

    # Single pass over the string; every replacement is an ASCII symbol, so
    # no replaced character can be matched again.
    $line =~ s/($arabic_char_re)/$buckwalter_of{$1}/g;

    ## Punctuation should really be handled by the utf8 cleaner or other method

    return $line;
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#!/usr/bin/python
2+
3+
# This script appends utterances dumped out from XML to a Kaldi datadir
4+
5+
import sys, re
6+
from xml.sax.saxutils import unescape
7+
8+
basename = sys.argv[1]
outdir = sys.argv[2]

# Optional third argument: maximum Matching Error Rate a segment may have.
# Without it, every matched segment is kept.
if len(sys.argv) > 3:
    mer_thresh = float(sys.argv[3])
else:
    mer_thresh = None

# Only the combined 'mgb2' listing is written (in append mode); the separate
# segments / utt2spk / text outputs are not produced by this version.
# The "with" block closes the file even if a malformed line raises.
with open(outdir + '/mgb2', 'a') as mgb2_file:
    for line in sys.stdin:
        m = re.match(r'\w+speaker(\d+)\w+\s+(.*)', line)
        if not m:
            continue

        spk = int(m.group(1))

        # Remaining fields: start time, end time, MER, then the words.
        t = m.group(2).split()
        start = float(t[0])
        end = float(t[1])
        mer = float(t[2])

        # Words come from XML, so undo entity escaping.
        words = ' '.join(unescape(w) for w in t[3:])

        segId = '%s_spk-%04d_seg-%07d:%07d' % (basename, spk, start*100, end*100)

        # only add segments where the Matching Error Rate is below the prescribed threshold
        if mer_thresh is None or mer <= mer_thresh:
            # write() instead of "print >> file" so the script runs under
            # both Python 2 and Python 3 with identical output.
            mgb2_file.write('%s %s %.3f %.3f %s\n' % (basename, segId, start, end, words))
53+
54+
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
#!/usr/bin/env python3
2+
3+
import sys
4+
5+
def hex_to_decimal(utf8_string):
    """Convert a three-digit upper-case hexadecimal string to an integer.

    Args:
        utf8_string: exactly three characters, each one of 0-9 or A-F.

    Returns:
        The integer value of the hexadecimal number.

    Raises:
        AssertionError: if the string is not three characters long.
        KeyError: if a character is not an upper-case hexadecimal digit.
    """
    assert len(utf8_string) == 3

    # Keep the strict digit check: int() alone would also accept lower-case
    # digits, underscores and surrounding whitespace.
    for digit in utf8_string:
        if digit not in "0123456789ABCDEF":
            raise KeyError(digit)

    return int(utf8_string, 16)
20+
21+
def get_unicode_dict():
    """Build the Arabic-to-Buckwalter transliteration table.

    Returns:
        A dict mapping the integer code point of each supported Arabic
        character to its single-character Buckwalter symbol.
    """
    # (hexadecimal code point, Buckwalter symbol)
    utf8_list = [("621", "'"), ("622", "|"), ("623", ">"),
                 ("624", "&"), ("625", "<"), ("626", "}"),
                 ("627", "A"), ("628", "b"), ("629", "p"),
                 ("62A", "t"), ("62B", "v"), ("62C", "j"),
                 ("62D", "H"), ("62E", "x"), ("62F", "d"),
                 ("630", "*"), ("631", "r"), ("632", "z"),
                 ("633", "s"), ("634", "$"), ("635", "S"),
                 ("636", "D"), ("637", "T"), ("638", "Z"),
                 ("639", "E"), ("63A", "g"), ("640", "_"),
                 ("641", "f"), ("642", "q"), ("643", "k"),
                 ("644", "l"), ("645", "m"), ("646", "n"),
                 ("647", "h"), ("648", "w"), ("649", "Y"),
                 ("64A", "y"), ("64B", "F"), ("64C", "N"),
                 ("64D", "K"), ("64E", "a"), ("64F", "u"),
                 ("650", "i"), ("651", "~"), ("652", "o"),
                 ("670", "`"), ("671", "{"), ("67E", "P"),
                 ("686", "J"), ("6A4", "V"), ("6AF", "G")]

    # Keys are integers so they can be compared directly with ord(char).
    return {int(code_point, 16): symbol for code_point, symbol in utf8_list}
47+
48+
49+
def convert(word, unicode_dict):
    """Transliterate a word using a code-point lookup table.

    Args:
        word: the string to transliterate.
        unicode_dict: dict mapping integer code points to replacement strings.

    Returns:
        The concatenated replacements of the characters found in
        unicode_dict; characters without an entry are dropped.
    """
    return "".join(unicode_dict[ord(char)]
                   for char in word
                   if ord(char) in unicode_dict)
57+
58+
def process_arabic_text(arabic_text, unicode_dict):
    """Print the sentences found inside <P> ... </P> paragraphs of a file.

    A paragraph starts at a line whose first token is "<P>" and ends at a
    line whose first token is "</P>". Inside a paragraph, words are
    collected across lines and printed as one space-joined sentence when a
    "." token or a word ending in "." is met, or when the paragraph ends.

    Args:
        arabic_text: path of the text file to read.
        unicode_dict: accepted for interface compatibility; not used here.
    """
    # Explicit UTF-8 so decoding does not depend on the process locale.
    with open(arabic_text, 'r', encoding='utf-8') as file:
        sentence_list = []
        is_sentence = False
        for line in file:
            tokens = line.split()
            if not tokens:
                continue
            first_token = tokens[0]

            if first_token == "<P>":
                is_sentence = True

            elif is_sentence and first_token != "</P>":
                for word in tokens:
                    if word == '.':
                        # A lone period ends the sentence; skip it when nothing
                        # has been collected (e.g. two periods in a row).
                        if sentence_list:
                            print(" ".join(sentence_list))
                            sentence_list = []
                    elif word[-1] == ".":
                        # Strip the trailing period and end the sentence.
                        sentence_list.append(word[:-1])
                        print(" ".join(sentence_list))
                        sentence_list = []
                    else:
                        sentence_list.append(word)

            if first_token == "</P>":
                is_sentence = False
                # Flush words that were not terminated by a period.
                if sentence_list:
                    print(" ".join(sentence_list))
                    sentence_list = []
92+
93+
94+
95+
def main():
    """Read the text file named on the command line and print its sentences."""
    # Fail with a readable message instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit("usage: {} <arabic_text>".format(sys.argv[0]))

    arabic_text = sys.argv[1]
    unicode_dict = get_unicode_dict()
    process_arabic_text(arabic_text, unicode_dict)


if __name__ == "__main__":
    main()
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
ARABIYA_FROMIRAQ_ARB_20070302_175801_2326286_2327450
2+
ARABIYA_BILARABI_ARB_20061005_201400_221375_223694
3+
LBC_NAHAR_ARB_20060911_142800_3683267_3685290
4+
LBC_NAHAR_ARB_20070303_145800_3249800_3251128
5+
LBC_NAHAR_ARB_20070303_145800_3623646_3624152
6+
LBC_NAHAR_ARB_20070305_035800_481003_484069
7+
ALAM_WITHEVENT_ARB_20070227_205800_3141876_3144152
8+
ALAM_NEWSRPT_ARB_20070130_015801_2875054_2876396
9+
ALJZ_TODHARV_ARB_20060914_155800_2947717_2949041
10+
ALJZ_TODHARV_ARB_20070107_145800_2417848_2419238

0 commit comments

Comments
 (0)