Skip to content

Commit ce779c4

Browse files
committed
Latest model
1 parent dd8d263 commit ce779c4

File tree

5 files changed

+125
-100
lines changed

5 files changed

+125
-100
lines changed

cmd/analysis2/train_language.sh

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,19 @@
11
#!/bin/bash
22

3+
# Capture language name from first argument
4+
lang_name="$1"
5+
shift # Remove $1 from arguments list, leaving only parameters for training
6+
7+
python3 -m venv .venv
8+
./.venv/bin/pip3 install numpy torch --index-url https://download.pytorch.org/whl/cpu
9+
./.venv/bin/python3 ../../../noaregtransformer/noareg_main.py --input-tsv-file "../../dicts/$lang_name/clean.tsv" --output-train-file "../../dicts/$lang_name/weights8.bin"
10+
11+
12+
13+
14+
exit 0
15+
16+
317
# Capture language name from first argument
418
lang_name="$1"
519
shift # Remove $1 from arguments list, leaving only parameters for training

cmd/analysis2/train_language_reverse.sh

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,19 @@
11
#!/bin/bash
22

3+
# Capture language name from first argument
4+
lang_name="$1"
5+
shift # Remove $1 from arguments list, leaving only parameters for training
6+
7+
python3 -m venv .venv
8+
./.venv/bin/pip3 install numpy torch --index-url https://download.pytorch.org/whl/cpu
9+
./.venv/bin/python3 ../../../noaregtransformer/noareg_main.py --input-tsv-file "../../dicts/$lang_name/clean_reverse.tsv" --output-train-file "../../dicts/$lang_name/weights8_reverse.bin"
10+
11+
12+
13+
14+
exit 0
15+
16+
317
# Capture language name from first argument
418
lang_name="$1"
519
shift # Remove $1 from arguments list, leaving only parameters for training

go.mod

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module github.com/neurlang/goruut
22

3-
go 1.22
3+
go 1.22.4
44

55
require (
66
github.com/gorilla/mux v1.8.1
@@ -20,6 +20,7 @@ require (
2020
github.com/dolthub/maphash v0.1.0 // indirect
2121
github.com/gammazero/deque v0.2.1 // indirect
2222
github.com/klauspost/cpuid/v2 v2.2.8 // indirect
23+
github.com/neurlang/noaregtransformer/go v0.0.0-20260210165246-8343b31cc031 // indirect
2324
github.com/neurlang/quaternary v0.2.4 // indirect
2425
golang.org/x/sys v0.5.0 // indirect
2526
)

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ github.com/neurlang/classifier v0.3.0 h1:8xGvMBAkAKjBLB7XwrGQA0p655tu0QhRBp+EQ+p
2222
github.com/neurlang/classifier v0.3.0/go.mod h1:eTbTsao5PkP5onvcdmZiRqq3eSnXVZGVjXmRFDaYfCU=
2323
github.com/neurlang/levenshtein v0.1.0 h1:353zUJ4YL4b3p6IUS3k0vsPXIUr2r1ddImDGqZuezj0=
2424
github.com/neurlang/levenshtein v0.1.0/go.mod h1:WEohFlzG+ZuOltuzlmOJnT8Yw+7WR9MuxaUfKbfA7vA=
25+
github.com/neurlang/noaregtransformer/go v0.0.0-20260210165246-8343b31cc031 h1:EzpocRqJ87+k3GdJQvlxUflNmq5XfONigZTU0Lc4EHs=
26+
github.com/neurlang/noaregtransformer/go v0.0.0-20260210165246-8343b31cc031/go.mod h1:8hrg7apZ4oL4Lvlxlzi3bAc4jtq7KQto7Nm2/c6YZZg=
2527
github.com/neurlang/quaternary v0.2.4 h1:ITmuGIZvwIpMm/ZZC6SfZlOaCJfS3YMg0ttdU3wOBNg=
2628
github.com/neurlang/quaternary v0.2.4/go.mod h1:5ljAzCe6Udiox2BieFnce/egIMH42tAZLdNZ0i1edmk=
2729
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=

repo/hashtron_phonemizer_repo.go

Lines changed: 93 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,13 @@ import (
1010
"github.com/neurlang/classifier/layer/crossattention"
1111
//"github.com/neurlang/classifier/layer/majpool2d"
1212
//"github.com/neurlang/classifier/layer/parity"
13+
"compress/zlib"
1314
"github.com/neurlang/classifier/layer/sochastic"
1415
"github.com/neurlang/classifier/layer/sum"
1516
"github.com/neurlang/classifier/net/feedforward"
1617
"github.com/neurlang/goruut/helpers/log"
1718
"github.com/neurlang/goruut/repo/interfaces"
19+
"github.com/neurlang/noaregtransformer/go/noareg"
1820
"strings"
1921
"sync"
2022
"unicode"
@@ -38,7 +40,7 @@ type HashtronPhonemizerRepository struct {
3840
phoner *interfaces.Phonemizer
3941
nets *map[string]*feedforward.FeedforwardNetwork
4042

41-
aregnets *map[string]*feedforward.FeedforwardNetwork
43+
tformers *map[string]*noareg.NoaregTransformer
4244
}
4345

4446
func hashtronHash(str string) uint32 {
@@ -64,6 +66,7 @@ type language struct {
6466
DropLast []string `json:"DropLast"`
6567
SrcDuplicate [][]string `json:"SrcDuplicate"`
6668
//Histogram []string `json:"Histogram"`
69+
mapTokenizer map[uint32]map[[2]uint32]string
6770
mapSrcMultiLen int
6871
mapSrcMultiSufLen int
6972
mapSrcMulti map[string]struct{}
@@ -83,6 +86,7 @@ func mapize(arr []string) (out map[string]struct{}) {
8386
}
8487

8588
func (l *language) mapize() {
89+
l.mapTokenizer = noareg.MakeDetokenizer(l.Mapping)
8690
l.mapSrcMulti = mapize(l.SrcMulti)
8791
l.mapDstMulti = mapize(l.DstMulti)
8892
l.mapSrcMultiSuffix = mapize(l.SrcMultiSuffix)
@@ -222,6 +226,17 @@ func (l *languages) IsLetter(isReverse bool, lang, run string) bool {
222226
_, ok := (*l)[lang+reverse].mapLetters[run]
223227
return ok
224228
}
229+
func (l *languages) Detokenizer(isReverse bool, lang string) map[uint32]map[[2]uint32]string {
230+
var reverse string
231+
if isReverse {
232+
reverse = "_reverse"
233+
}
234+
if (*l)[lang+reverse] == nil {
235+
return nil
236+
}
237+
t := (*l)[lang+reverse].mapTokenizer
238+
return t
239+
}
225240

226241
func (l *languages) SrcSlice(isReverse bool, language string, word []rune) (o []string) {
227242
var reverse string
@@ -294,6 +309,13 @@ func (r *HashtronPhonemizerRepository) LoadLanguage(isReverse bool, lang string)
294309
r.nets = &netss
295310
log.Now().Debugf("Language %s made map of nets", lang)
296311
}
312+
tformers := r.tformers
313+
if tformers == nil {
314+
tformerss := make(map[string]*noareg.NoaregTransformer)
315+
tformers = &tformerss
316+
r.tformers = &tformerss
317+
log.Now().Debugf("Language %s made map of tformers", lang)
318+
}
297319
/*
298320
aregnets := r.aregnets
299321
if aregnets == nil {
@@ -307,6 +329,10 @@ func (r *HashtronPhonemizerRepository) LoadLanguage(isReverse bool, lang string)
307329
log.Now().Debugf("Language %s already loaded", lang)
308330
return
309331
}
332+
if (*tformers)[lang+reverse] != nil {
333+
log.Now().Debugf("Language %s already loaded", lang)
334+
return
335+
}
310336

311337
var language_files = []string{"language" + reverse + ".json"}
312338
for _, file := range language_files {
@@ -333,6 +359,40 @@ func (r *HashtronPhonemizerRepository) LoadLanguage(isReverse bool, lang string)
333359
r.phoner = &iface
334360
}
335361

362+
var noareg_files = []string{
363+
"weights8" + reverse + ".bin.zlib",
364+
}
365+
for i, file := range noareg_files {
366+
compressedData := log.Error1((*r.getter).GetDict(lang, file))
367+
368+
if compressedData == nil {
369+
continue
370+
}
371+
bytesReader := bytes.NewReader(compressedData)
372+
zlibReader := log.Error1(zlib.NewReader(bytesReader))
373+
if zlibReader == nil {
374+
continue
375+
}
376+
defer zlibReader.Close()
377+
378+
switch i {
379+
case 0:
380+
tensors := log.Error1(noareg.ReadTensors(zlibReader))
381+
if tensors == nil {
382+
break
383+
}
384+
385+
// Initialize transformer
386+
transformer := noareg.NewNoaregTransformer(32, 16, 100, 4)
387+
388+
noareg.LoadTransformerFile(transformer, tensors)
389+
390+
(*r.tformers)[lang+reverse] = transformer
391+
392+
return
393+
}
394+
}
395+
336396
var files = []string{
337397
// "",
338398
"weights6" + reverse + ".json.zlib",
@@ -381,35 +441,7 @@ func (r *HashtronPhonemizerRepository) LoadLanguage(isReverse bool, lang string)
381441
return
382442
}*/
383443
}
384-
/*
385-
var aregfiles = []string{"weights3" + reverse + ".json.zlib"}
386-
387-
for _, file := range aregfiles {
388-
compressedData := log.Error1((*r.getter).GetDict(lang, file))
389-
390-
if compressedData == nil {
391-
continue
392-
}
393-
if (*r.getter).IsNewFormat(compressedData) {
394-
bytesReader := bytes.NewReader(compressedData)
395-
396-
const fanout1 = 5
397-
var net feedforward.FeedforwardNetwork
398-
//net.NewLayer(fanout1, 0)
399-
//net.NewCombiner(sochastic.MustNew(fanout1, 32, 0))
400-
net.NewLayer(fanout1, 0)
401-
net.NewCombiner(parity.MustNew(fanout1))
402-
net.NewLayer(1, 0)
403444

404-
(*r.aregnets)[lang+reverse] = &net
405-
406-
err := (*r.aregnets)[lang+reverse].ReadZlibWeights(bytesReader)
407-
log.Error0(err)
408-
409-
break
410-
}
411-
}
412-
*/
413445
}
414446

415447
func isCombining(r uint32) bool {
@@ -549,76 +581,6 @@ func copystrings(s []string) (r []string) {
549581
return
550582
}
551583

552-
/*
553-
func (r *HashtronPhonemizerRepository) PhonemizeWord(isReverse bool, lang string, word string) (ret map[uint64]string) {
554-
var reverse string
555-
if isReverse {
556-
reverse = "_reverse"
557-
}
558-
r.LoadLanguage(isReverse, lang)
559-
560-
r.mut.RLock()
561-
mapLangIsNil := r.lang.Slice(isReverse, lang) == nil
562-
r.mut.RUnlock()
563-
if mapLangIsNil {
564-
m := make(map[uint64]string)
565-
return m
566-
}
567-
568-
r.mut.RLock()
569-
histogram := r.lang.Histogram(isReverse, lang)
570-
net := (*r.aregnets)[lang+reverse]
571-
r.mut.RUnlock()
572-
573-
if net == nil {
574-
m := make(map[uint64]string)
575-
return m
576-
}
577-
578-
for i := 0; i < 32; i++ {
579-
var input = phonemizer.AregSample{
580-
Src: word,
581-
Dst: fmt.Sprint(i),
582-
}
583-
r.mut.RLock()
584-
pred := net.Infer2(&input) == 1
585-
r.mut.RUnlock()
586-
587-
if !pred {
588-
m := make(map[uint64]string)
589-
return m
590-
}
591-
}
592-
pred := true
593-
var out string
594-
for pred && len(out) < len(word)*2 {
595-
for _, val := range histogram {
596-
var input2 = phonemizer.AregSample{
597-
Src: word,
598-
Dst: out + val,
599-
}
600-
r.mut.RLock()
601-
pred = net.Infer2(&input2) == 1
602-
r.mut.RUnlock()
603-
//fmt.Println(word, out, val, pred)
604-
if pred {
605-
out += val
606-
break
607-
}
608-
}
609-
}
610-
//fmt.Println(word, out)
611-
m := make(map[uint64]string)
612-
hsh := murmur3hash(word + "\x00" + out)
613-
if hsh == 0 {
614-
hsh++
615-
}
616-
m[hsh] = out
617-
m[0] = word
618-
return m
619-
}
620-
*/
621-
622584
func (r *HashtronPhonemizerRepository) ExplainWord(isReverse bool, word1, word2, lang string) (ret map[string][]string) {
623585
ret = make(map[string][]string)
624586
r.LoadLanguage(isReverse, lang)
@@ -642,6 +604,38 @@ func (r *HashtronPhonemizerRepository) PhonemizeWords(isReverse bool, lang strin
642604
}
643605
r.LoadLanguage(isReverse, lang)
644606

607+
r.mut.RLock()
608+
is_new := (*r.tformers)[lang+reverse] != nil
609+
r.mut.RUnlock()
610+
611+
if is_new {
612+
613+
r.mut.RLock()
614+
tran_new := (*r.tformers)[lang+reverse]
615+
detokenizer_new := r.lang.Detokenizer(isReverse, lang)
616+
is_ok := tran_new != nil && detokenizer_new != nil
617+
r.mut.RUnlock()
618+
619+
if is_ok {
620+
621+
out, _ := log.Error2(noareg.TransformerInferFull(tran_new, detokenizer_new, word))
622+
//println(word, gbg[0], gbg[1], out)
623+
624+
src := word
625+
dst := out
626+
627+
m := make(map[string]uint32)
628+
hsh := hashtronHash(src + "\x00" + dst)
629+
if hsh == 0 {
630+
hsh++
631+
}
632+
m[dst] = uint32(hsh)
633+
m[src+" "] = 0
634+
ret = append(ret, m)
635+
return
636+
}
637+
}
638+
645639
r.mut.RLock()
646640
mapLangIsNil := r.lang.Slice(isReverse, lang) == nil
647641
r.mut.RUnlock()

0 commit comments

Comments
 (0)