kaldi-asr
diff --git a/‎egs/cmu_cslu_kids/s5/run.sh‎
Lines changed: 1 addition & 1 deletion b/‎egs/cmu_cslu_kids/s5/run.sh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎egs/gale_arabic/s5d/cmd.sh‎
Lines changed: 15 additions & 0 deletions b/‎egs/gale_arabic/s5d/cmd.sh‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎egs/gale_arabic/s5d/conf/decode.config‎
Lines changed: 1 addition & 0 deletions b/‎egs/gale_arabic/s5d/conf/decode.config‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎egs/gale_arabic/s5d/conf/mfcc.conf‎
Lines changed: 1 addition & 0 deletions b/‎egs/gale_arabic/s5d/conf/mfcc.conf‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎egs/gale_arabic/s5d/conf/mfcc_hires.conf‎
Lines changed: 10 additions & 0 deletions b/‎egs/gale_arabic/s5d/conf/mfcc_hires.conf‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎egs/gale_arabic/s5d/conf/online_cmvn.conf‎
Lines changed: 1 addition & 0 deletions b/‎egs/gale_arabic/s5d/conf/online_cmvn.conf‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎egs/gale_arabic/s5d/local/Eng2Ara.sh‎
Lines changed: 109 additions & 0 deletions b/‎egs/gale_arabic/s5d/local/Eng2Ara.sh‎
Lines changed: 109 additions & 0 deletions
diff --git a/‎egs/gale_arabic/s5d/local/add_to_datadir.py‎
Lines changed: 54 additions & 0 deletions b/‎egs/gale_arabic/s5d/local/add_to_datadir.py‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎egs/gale_arabic/s5d/local/arabic_convert.py‎
Lines changed: 101 additions & 0 deletions b/‎egs/gale_arabic/s5d/local/arabic_convert.py‎
Lines changed: 101 additions & 0 deletions
diff --git a/‎egs/gale_arabic/s5d/local/bad_segments‎
Lines changed: 10 additions & 0 deletions b/‎egs/gale_arabic/s5d/local/bad_segments‎
Lines changed: 10 additions & 0 deletions
@@ -105,7 +105,7 @@ if [ $stage -le 5 ]; then
   utils/mkgraph.sh data/lang_test_tgmed exp/tri1 exp/tri1/graph 
   steps/decode.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/tri1/graph data/test exp/tri1/decode
   # Align - make graph - decode again   
-  steps/align_si.sh --nj 20 --cmd "queue.pl" --use-graphs true data/train data/lang_test_tgmed exp/tri1 exp/tri1_ali
+  steps/align_si.sh --nj 20 --cmd "$train_cmd" --use-graphs true data/train data/lang_test_tgmed exp/tri1 exp/tri1_ali
   utils/mkgraph.sh data/lang_test_tgmed exp/tri1_ali exp/tri1_ali/graph
   steps/decode.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/tri1_ali/graph data/test exp/tri1_ali/decode
 fi
 
@@ -0,0 +1,15 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd="retry.pl queue.pl"
+export decode_cmd="retry.pl queue.pl"
+export mkgraph_cmd="retry.pl queue.pl --mem 8G"
@@ -0,0 +1 @@
+link decode_dnn.config
@@ -0,0 +1 @@
+--use-energy=false   # only non-default option.
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training.
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why
+# we prefer this method.
+--use-energy=false   # use average of log energy, not energy.
+--sample-frequency=16000 
+--num-mel-bins=40    
+--num-ceps=40   
+--low-freq=40    # low cutoff frequency for mel bins
+--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600)
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh
@@ -0,0 +1,109 @@
+#!/usr/bin/env perl
+
+# Copyright 2014 QCRI (author: Ahmed Ali)
+# Apache 2.0
+
+use warnings;
+use strict;
+use Encode;
+use utf8;
+
+
+
+# Check usage: exactly two positional arguments are required.
+if (@ARGV != 2) {
+    print "usage: $0 <inFile> <onlyArabicFile>\n";
+    exit(1);
+}
+
+my $inFile = shift(@ARGV);
+my $ouFile = shift(@ARGV);
+
+# Three-argument open combined with the low-precedence 'or'.  The earlier
+# form `open INFILE, "<$inFile" || die ...` applied '||' to the mode string,
+# which is always true, so a failed open of the input was never reported.
+# File names are now taken literally (no two-argument-open interpretation
+# of the name), and the utf8 layer is applied directly in the open mode.
+open(my $in_fh, '<:encoding(utf8)', $inFile)
+    or die "unable to open the input file $inFile\n";
+
+open(my $out_fh, '>:encoding(utf8)', $ouFile)
+    or die "unable to open the output mlf file $ouFile\n";
+
+# Transliterate the input one line at a time.
+# NOTE(review): each input line still carries its own newline when it is
+# converted, and another "\n" is appended on output, so every record is
+# followed by an empty line -- confirm that consumers expect this.
+while (my $utf8_line = <$in_fh>) {
+    my $BW = convertUTF8ToBuckwalter($utf8_line);
+    print {$out_fh} "$BW"."\n";
+}
+close $in_fh;
+
+# Buffered write errors only surface when the handle is closed.
+close $out_fh or die "unable to close the output file $ouFile\n";
+
+
+
+
+# this function is copied from MADATools.pm: MADA Tools
+ sub convertUTF8ToBuckwalter {
+
+    # Transliterate the Arabic characters of one (already decoded) string
+    # into their Buckwalter ASCII symbols.  Takes a single string and
+    # returns the converted copy; characters that have no entry in the
+    # table below are passed through untouched.
+    my ($line)= (@_);
+
+    my %buckwalter_for = (
+        "\x{0621}" => "'",    ## HAMZA
+        "\x{0622}" => '|',    ## ALEF WITH MADDA ABOVE
+        "\x{0623}" => '>',    ## ALEF WITH HAMZA ABOVE
+        "\x{0624}" => '&',    ## WAW WITH HAMZA ABOVE
+        "\x{0625}" => '<',    ## ALEF WITH HAMZA BELOW
+        "\x{0626}" => '}',    ## YEH WITH HAMZA ABOVE
+        "\x{0627}" => 'A',    ## ALEF
+        "\x{0628}" => 'b',    ## BEH
+        "\x{0629}" => 'p',    ## TEH MARBUTA
+        "\x{062A}" => 't',    ## TEH
+        "\x{062B}" => 'v',    ## THEH
+        "\x{062C}" => 'j',    ## JEEM
+        "\x{062D}" => 'H',    ## HAH
+        "\x{062E}" => 'x',    ## KHAH
+        "\x{062F}" => 'd',    ## DAL
+        "\x{0630}" => '*',    ## THAL
+        "\x{0631}" => 'r',    ## REH
+        "\x{0632}" => 'z',    ## ZAIN
+        "\x{0633}" => 's',    ## SEEN
+        "\x{0634}" => '$',    ## SHEEN
+        "\x{0635}" => 'S',    ## SAD
+        "\x{0636}" => 'D',    ## DAD
+        "\x{0637}" => 'T',    ## TAH
+        "\x{0638}" => 'Z',    ## ZAH
+        "\x{0639}" => 'E',    ## AIN
+        "\x{063A}" => 'g',    ## GHAIN
+        "\x{0640}" => '_',    ## TATWEEL
+        "\x{0641}" => 'f',    ## FEH
+        "\x{0642}" => 'q',    ## QAF
+        "\x{0643}" => 'k',    ## KAF
+        "\x{0644}" => 'l',    ## LAM
+        "\x{0645}" => 'm',    ## MEEM
+        "\x{0646}" => 'n',    ## NOON
+        "\x{0647}" => 'h',    ## HEH
+        "\x{0648}" => 'w',    ## WAW
+        "\x{0649}" => 'Y',    ## ALEF MAKSURA
+        "\x{064A}" => 'y',    ## YEH
+
+        ## Diacritics
+        "\x{064B}" => 'F',    ## FATHATAN
+        "\x{064C}" => 'N',    ## DAMMATAN
+        "\x{064D}" => 'K',    ## KASRATAN
+        "\x{064E}" => 'a',    ## FATHA
+        "\x{064F}" => 'u',    ## DAMMA
+        "\x{0650}" => 'i',    ## KASRA
+        "\x{0651}" => '~',    ## SHADDA
+        "\x{0652}" => 'o',    ## SUKUN
+        "\x{0670}" => '`',    ## SUPERSCRIPT ALEF
+
+        "\x{0671}" => '{',    ## ALEF WASLA
+        "\x{067E}" => 'P',    ## PEH
+        "\x{0686}" => 'J',    ## TCHEH
+        "\x{06A4}" => 'V',    ## VEH
+        "\x{06AF}" => 'G',    ## GAF
+    );
+
+    ## Punctuation should really be handled by the utf8 cleaner or other method
+
+    # One pass over the string: none of the replacement symbols is itself a
+    # key of the table, so this matches applying the substitutions in turn.
+    $line =~ s/(.)/exists $buckwalter_for{$1} ? $buckwalter_for{$1} : $1/ge;
+
+    return $line;
+}
@@ -0,0 +1,54 @@
+#!/usr/bin/python
+
+# This script appends utterances dumped out from XML to a Kaldi datadir
+
+import sys, re
+from xml.sax.saxutils import unescape
+
+basename=sys.argv[1]
+outdir = sys.argv[2]
+
+if len(sys.argv) > 3:
+    mer_thresh=float(sys.argv[3])
+else:
+    mer_thresh = None
+
+# open the output files in append mode
+#segments_file = open(outdir + '/segments', 'a')
+#utt2spk_file = open(outdir + '/utt2spk', 'a')
+#text_file = open(outdir + '/text', 'a')
+mgb2_file = open(outdir + '/mgb2', 'a')
+
+for line in sys.stdin:
+
+    m = re.match(r'\w+speaker(\d+)\w+\s+(.*)', line)
+    #print line
+
+    if m:
+
+        spk = int(m.group(1))
+
+        t = m.group(2).split()
+        start = float(t[0])
+        end = float(t[1])
+        mer = float(t[2])
+        
+        s = [unescape(w) for w in t[3:]]       
+        words = ' '.join(s)
+
+        segId = '%s_spk-%04d_seg-%07d:%07d' % (basename, spk, start*100, end*100)
+        spkId = '%s_spk-%04d' % (basename, spk)
+
+        # only add segments where the Matching Error Rate is below the prescribed threshhold
+        if mer_thresh == None or mer <= mer_thresh:
+#print >> segments_file, '%s %s %.2f %.2f' % (segId, basename, start, end ) 
+#print >> text_file, '%s %s' % (segId, words)
+#print >> utt2spk_file, '%s %s' % (segId, spkId)
+            print >> mgb2_file, '%s %s %.3f %.3f %s' % (basename, segId, start, end, words)
+
+#segments_file.close()
+#utt2spk_file.close()
+#text_file.close()
+mgb2_file.close()
+ 
+            
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+
+import sys
+
+def hex_to_decimal(utf8_string):
+    assert(len(utf8_string) == 3)
+    hex_dict = {}
+    char_list = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "B", "C", "D", "E", "F"]
+    value_list = [0, 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+    for key, value in zip (char_list, value_list):
+        hex_dict[key] = value
+
+    result = 0
+    length = len(utf8_string)
+    for i in range(length):
+        digit = utf8_string[length - 1 - i]
+        result += hex_dict[digit] * (16 ** i)
+
+    return result
+
+def get_unicode_dict():
+    unicode_dict = {}
+    utf8_list = [("621", "'"), ("622", "|"),("623", ">"),
+                 ("624", "&"), ("625", "<"),("626", "}"),
+                 ("627", "A"), ("628", "b"),("629", "p"),
+                 ("62A", "t"), ("62B", "v"),("62C", "j"),
+                 ("62D", "H"), ("62E", "x"),("62F", "d"),
+                 ("630", "*"), ("631", "r"),("632", "z"),
+                 ("633", "s"), ("634", "$"),("635", "S"),
+                 ("636", "D"), ("637", "T"),("638", "Z"),
+                 ("639", "E"), ("63A", "g"),("640", "_"),
+                 ("641", "f"), ("642", "q"),("643", "k"),
+                 ("644", "l"), ("645", "m"),("646", "n"),
+                 ("647", "h"), ("648", "w"),("649", "Y"),
+                 ("64A", "y"), ("64B", "F"),("64C", "N"),
+                 ("64D", "K"), ("64E", "a"),("64F", "u"),
+                 ("650", "i"), ("651", "~"),("652", "o"),
+                 ("670", "`"), ("671", "{"),("67E", "P"),
+                 ("686", "J"), ("6A4", "V"),("6AF", "G")]
+
+    for word_pair in utf8_list:
+        utf8 = word_pair[0]
+        char = word_pair[1]
+        unicode_dict[hex_to_decimal(utf8)] = char
+
+    return unicode_dict
+    
+
+def convert(word, unicode_dict):
+    word_list = []
+    for char in word:
+        c_unicode = ord(char)
+        if c_unicode in unicode_dict:
+            word_list.append(unicode_dict[c_unicode])
+
+    return "".join(word_list)
+
+def process_arabic_text(arabic_text, unicode_dict):
+    with open(arabic_text, 'r') as file:
+        sentence_list = []
+        is_sentence = False
+        for line in file.readlines():
+#print(line.split()[0], is_sentence, line.split()[0] == "</P>")
+            if len(line.split()) > 0:
+                if line.split()[0] == "<P>":
+                    is_sentence = True
+
+                elif (is_sentence and line.split()[0] != "</P>"):
+                    for word in line.split():
+                        if word == '.':
+                            # when meet period ".", sentence_list should not be empty (do find sentence ending with two period)
+                            if (len(sentence_list) > 0):                
+                                sentence = " ".join(sentence_list)
+                                print(sentence)
+                            sentence_list = []
+                        elif word[-1] == ".":
+                            word = word[:-1]
+                            sentence_list.append(word)
+                            sentence = " ".join(sentence_list)
+                            print(sentence)
+                            sentence_list = []
+                        else:
+                            word = word
+                            if word != '':
+                                sentence_list.append(word)
+    
+                if line.split()[0] == "</P>":
+                    is_sentence = False
+                    if (len(sentence_list) > 0):
+                        print(" ".join(sentence_list)) 
+                        sentence_list = []
+                
+                
+
+def main():
+    arabic_text = sys.argv[1]
+    unicode_dict = get_unicode_dict()
+    process_arabic_text(arabic_text, unicode_dict)
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,10 @@
+ARABIYA_FROMIRAQ_ARB_20070302_175801_2326286_2327450
+ARABIYA_BILARABI_ARB_20061005_201400_221375_223694
+LBC_NAHAR_ARB_20060911_142800_3683267_3685290
+LBC_NAHAR_ARB_20070303_145800_3249800_3251128
+LBC_NAHAR_ARB_20070303_145800_3623646_3624152
+LBC_NAHAR_ARB_20070305_035800_481003_484069
+ALAM_WITHEVENT_ARB_20070227_205800_3141876_3144152
+ALAM_NEWSRPT_ARB_20070130_015801_2875054_2876396
+ALJZ_TODHARV_ARB_20060914_155800_2947717_2949041
+ALJZ_TODHARV_ARB_20070107_145800_2417848_2419238
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+--use-energy=false # only non-default option.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+# configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh`