@@ -10,11 +10,13 @@ import (
1010 "github.com/neurlang/classifier/layer/crossattention"
1111 //"github.com/neurlang/classifier/layer/majpool2d"
1212 //"github.com/neurlang/classifier/layer/parity"
13+ "compress/zlib"
1314 "github.com/neurlang/classifier/layer/sochastic"
1415 "github.com/neurlang/classifier/layer/sum"
1516 "github.com/neurlang/classifier/net/feedforward"
1617 "github.com/neurlang/goruut/helpers/log"
1718 "github.com/neurlang/goruut/repo/interfaces"
19+ "github.com/neurlang/noaregtransformer/go/noareg"
1820 "strings"
1921 "sync"
2022 "unicode"
@@ -38,7 +40,7 @@ type HashtronPhonemizerRepository struct {
3840 phoner * interfaces.Phonemizer
3941 nets * map [string ]* feedforward.FeedforwardNetwork
4042
41- aregnets * map [string ]* feedforward. FeedforwardNetwork
43+ tformers * map [string ]* noareg. NoaregTransformer
4244}
4345
4446func hashtronHash (str string ) uint32 {
@@ -64,6 +66,7 @@ type language struct {
6466 DropLast []string `json:"DropLast"`
6567 SrcDuplicate [][]string `json:"SrcDuplicate"`
6668 //Histogram []string `json:"Histogram"`
69+ mapTokenizer map [uint32 ]map [[2 ]uint32 ]string
6770 mapSrcMultiLen int
6871 mapSrcMultiSufLen int
6972 mapSrcMulti map [string ]struct {}
@@ -83,6 +86,7 @@ func mapize(arr []string) (out map[string]struct{}) {
8386}
8487
8588func (l * language ) mapize () {
89+ l .mapTokenizer = noareg .MakeDetokenizer (l .Mapping )
8690 l .mapSrcMulti = mapize (l .SrcMulti )
8791 l .mapDstMulti = mapize (l .DstMulti )
8892 l .mapSrcMultiSuffix = mapize (l .SrcMultiSuffix )
@@ -222,6 +226,17 @@ func (l *languages) IsLetter(isReverse bool, lang, run string) bool {
222226 _ , ok := (* l )[lang + reverse ].mapLetters [run ]
223227 return ok
224228}
229+ func (l * languages ) Detokenizer (isReverse bool , lang string ) map [uint32 ]map [[2 ]uint32 ]string {
230+ var reverse string
231+ if isReverse {
232+ reverse = "_reverse"
233+ }
234+ if (* l )[lang + reverse ] == nil {
235+ return nil
236+ }
237+ t := (* l )[lang + reverse ].mapTokenizer
238+ return t
239+ }
225240
226241func (l * languages ) SrcSlice (isReverse bool , language string , word []rune ) (o []string ) {
227242 var reverse string
@@ -294,6 +309,13 @@ func (r *HashtronPhonemizerRepository) LoadLanguage(isReverse bool, lang string)
294309 r .nets = & netss
295310 log .Now ().Debugf ("Language %s made map of nets" , lang )
296311 }
312+ tformers := r .tformers
313+ if tformers == nil {
314+ tformerss := make (map [string ]* noareg.NoaregTransformer )
315+ tformers = & tformerss
316+ r .tformers = & tformerss
317+ log .Now ().Debugf ("Language %s made map of tformers" , lang )
318+ }
297319 /*
298320 aregnets := r.aregnets
299321 if aregnets == nil {
@@ -307,6 +329,10 @@ func (r *HashtronPhonemizerRepository) LoadLanguage(isReverse bool, lang string)
307329 log .Now ().Debugf ("Language %s already loaded" , lang )
308330 return
309331 }
332+ if (* tformers )[lang + reverse ] != nil {
333+ log .Now ().Debugf ("Language %s already loaded" , lang )
334+ return
335+ }
310336
311337 var language_files = []string {"language" + reverse + ".json" }
312338 for _ , file := range language_files {
@@ -333,6 +359,40 @@ func (r *HashtronPhonemizerRepository) LoadLanguage(isReverse bool, lang string)
333359 r .phoner = & iface
334360 }
335361
362+ var noareg_files = []string {
363+ "weights8" + reverse + ".bin.zlib" ,
364+ }
365+ for i , file := range noareg_files {
366+ compressedData := log .Error1 ((* r .getter ).GetDict (lang , file ))
367+
368+ if compressedData == nil {
369+ continue
370+ }
371+ bytesReader := bytes .NewReader (compressedData )
372+ zlibReader := log .Error1 (zlib .NewReader (bytesReader ))
373+ if zlibReader == nil {
374+ continue
375+ }
376+ defer zlibReader .Close ()
377+
378+ switch i {
379+ case 0 :
380+ tensors := log .Error1 (noareg .ReadTensors (zlibReader ))
381+ if tensors == nil {
382+ break
383+ }
384+
385+ // Initialize transformer
386+ transformer := noareg .NewNoaregTransformer (32 , 16 , 100 , 4 )
387+
388+ noareg .LoadTransformerFile (transformer , tensors )
389+
390+ (* r .tformers )[lang + reverse ] = transformer
391+
392+ return
393+ }
394+ }
395+
336396 var files = []string {
337397 // "",
338398 "weights6" + reverse + ".json.zlib" ,
@@ -381,35 +441,7 @@ func (r *HashtronPhonemizerRepository) LoadLanguage(isReverse bool, lang string)
381441 return
382442 }*/
383443 }
384- /*
385- var aregfiles = []string{"weights3" + reverse + ".json.zlib"}
386-
387- for _, file := range aregfiles {
388- compressedData := log.Error1((*r.getter).GetDict(lang, file))
389-
390- if compressedData == nil {
391- continue
392- }
393- if (*r.getter).IsNewFormat(compressedData) {
394- bytesReader := bytes.NewReader(compressedData)
395-
396- const fanout1 = 5
397- var net feedforward.FeedforwardNetwork
398- //net.NewLayer(fanout1, 0)
399- //net.NewCombiner(sochastic.MustNew(fanout1, 32, 0))
400- net.NewLayer(fanout1, 0)
401- net.NewCombiner(parity.MustNew(fanout1))
402- net.NewLayer(1, 0)
403444
404- (*r.aregnets)[lang+reverse] = &net
405-
406- err := (*r.aregnets)[lang+reverse].ReadZlibWeights(bytesReader)
407- log.Error0(err)
408-
409- break
410- }
411- }
412- */
413445}
414446
415447func isCombining (r uint32 ) bool {
@@ -549,76 +581,6 @@ func copystrings(s []string) (r []string) {
549581 return
550582}
551583
552- /*
553- func (r *HashtronPhonemizerRepository) PhonemizeWord(isReverse bool, lang string, word string) (ret map[uint64]string) {
554- var reverse string
555- if isReverse {
556- reverse = "_reverse"
557- }
558- r.LoadLanguage(isReverse, lang)
559-
560- r.mut.RLock()
561- mapLangIsNil := r.lang.Slice(isReverse, lang) == nil
562- r.mut.RUnlock()
563- if mapLangIsNil {
564- m := make(map[uint64]string)
565- return m
566- }
567-
568- r.mut.RLock()
569- histogram := r.lang.Histogram(isReverse, lang)
570- net := (*r.aregnets)[lang+reverse]
571- r.mut.RUnlock()
572-
573- if net == nil {
574- m := make(map[uint64]string)
575- return m
576- }
577-
578- for i := 0; i < 32; i++ {
579- var input = phonemizer.AregSample{
580- Src: word,
581- Dst: fmt.Sprint(i),
582- }
583- r.mut.RLock()
584- pred := net.Infer2(&input) == 1
585- r.mut.RUnlock()
586-
587- if !pred {
588- m := make(map[uint64]string)
589- return m
590- }
591- }
592- pred := true
593- var out string
594- for pred && len(out) < len(word)*2 {
595- for _, val := range histogram {
596- var input2 = phonemizer.AregSample{
597- Src: word,
598- Dst: out + val,
599- }
600- r.mut.RLock()
601- pred = net.Infer2(&input2) == 1
602- r.mut.RUnlock()
603- //fmt.Println(word, out, val, pred)
604- if pred {
605- out += val
606- break
607- }
608- }
609- }
610- //fmt.Println(word, out)
611- m := make(map[uint64]string)
612- hsh := murmur3hash(word + "\x00" + out)
613- if hsh == 0 {
614- hsh++
615- }
616- m[hsh] = out
617- m[0] = word
618- return m
619- }
620- */
621-
622584func (r * HashtronPhonemizerRepository ) ExplainWord (isReverse bool , word1 , word2 , lang string ) (ret map [string ][]string ) {
623585 ret = make (map [string ][]string )
624586 r .LoadLanguage (isReverse , lang )
@@ -642,6 +604,38 @@ func (r *HashtronPhonemizerRepository) PhonemizeWords(isReverse bool, lang strin
642604 }
643605 r .LoadLanguage (isReverse , lang )
644606
607+ r .mut .RLock ()
608+ is_new := (* r .tformers )[lang + reverse ] != nil
609+ r .mut .RUnlock ()
610+
611+ if is_new {
612+
613+ r .mut .RLock ()
614+ tran_new := (* r .tformers )[lang + reverse ]
615+ detokenizer_new := r .lang .Detokenizer (isReverse , lang )
616+ is_ok := tran_new != nil && detokenizer_new != nil
617+ r .mut .RUnlock ()
618+
619+ if is_ok {
620+
621+ out , _ := log .Error2 (noareg .TransformerInferFull (tran_new , detokenizer_new , word ))
622+ //println(word, gbg[0], gbg[1], out)
623+
624+ src := word
625+ dst := out
626+
627+ m := make (map [string ]uint32 )
628+ hsh := hashtronHash (src + "\x00 " + dst )
629+ if hsh == 0 {
630+ hsh ++
631+ }
632+ m [dst ] = uint32 (hsh )
633+ m [src + " " ] = 0
634+ ret = append (ret , m )
635+ return
636+ }
637+ }
638+
645639 r .mut .RLock ()
646640 mapLangIsNil := r .lang .Slice (isReverse , lang ) == nil
647641 r .mut .RUnlock ()
0 commit comments