Skip to content

Commit 9d8aa30

Browse files
committed
wip: quality
1 parent 771c159 commit 9d8aa30

File tree

3 files changed

+35
-30
lines changed

3 files changed

+35
-30
lines changed

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,5 @@ require (
2424
google.golang.org/grpc v1.50.1 // indirect
2525
google.golang.org/protobuf v1.28.1 // indirect
2626
)
27+
28+
replace github.com/ndabAP/assocentity/v12 => ../assocentity

go.sum

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,6 @@ github.com/googleapis/enterprise-certificate-proxy v0.2.0/go.mod h1:8C0jb7/mgJe/
5050
github.com/googleapis/gax-go/v2 v2.7.0 h1:IcsPKeInNvYi7eqSaDjiZqDDKu5rsmunY0Y1YupQSSQ=
5151
github.com/googleapis/gax-go/v2 v2.7.0/go.mod h1:TEop28CZZQ2y+c0VxMUmu1lV+fQx57QpBWsYpwqHJx8=
5252
github.com/joho/godotenv v1.3.0 h1:Zjp+RcGpHhGlrMbJzXTrZZPrWj+1vfm90La1wgB6Bhc=
53-
github.com/ndabAP/assocentity/v12 v12.0.0 h1:fCwj0S+jKlp2l4pYWAjl8dTey1j3RInx22HHGXlZ7FA=
54-
github.com/ndabAP/assocentity/v12 v12.0.0/go.mod h1:4TGqBbxSnNKX7odkQGGSYzdtIeJSKkz5ZSyI/Zf0QmI=
5553
github.com/ndabAP/assocentity/v12 v12.2.0 h1:SDATlGnfKvWR1Ei6WRxYOPk+mv1h0lIDb9xKXAGw9yA=
5654
github.com/ndabAP/assocentity/v12 v12.2.0/go.mod h1:4TGqBbxSnNKX7odkQGGSYzdtIeJSKkz5ZSyI/Zf0QmI=
5755
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=

main.go

Lines changed: 33 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ func main() {
5959
log.Printf("len(texts)=%d", len(texts))
6060

6161
// Get mean distance per entity
62-
log.Println("get meanN")
62+
log.Println("get mean")
6363
nlpTok := nlp.NewNLPTokenizer(*gogSvcLocF, nlp.AutoLang)
6464
var wg sync.WaitGroup
6565
for _, entities := range entities {
@@ -86,7 +86,7 @@ func scrape(texts, entities []string, tokenizer tokenize.Tokenizer) error {
8686

8787
l := log.New(os.Stderr, entity+":", 0)
8888

89-
// Ignore articles without entity
89+
// Ignore articles without entity. This is a plain substring pre-filter to spare the API
9090
temp := texts[:0]
9191
for _, text := range texts {
9292
if strings.Contains(text, entity) {
@@ -96,77 +96,82 @@ func scrape(texts, entities []string, tokenizer tokenize.Tokenizer) error {
9696
texts = temp
9797
l.Printf("len(texts)=%d", len(texts))
9898

99-
poS := tokenize.ADJ | tokenize.ADP | tokenize.ADV | tokenize.CONJ | tokenize.DET | tokenize.NOUN | tokenize.NUM | tokenize.PRON | tokenize.PRT | tokenize.VERB
100-
meanN, err := assocentity.MeanN(
99+
var (
100+
poS = tokenize.ADJ | tokenize.ADP | tokenize.ADV | tokenize.CONJ | tokenize.DET | tokenize.NOUN | tokenize.NUM | tokenize.PRON | tokenize.PRT | tokenize.VERB
101+
source = assocentity.NewSource(entities, texts)
102+
)
103+
dists, err := assocentity.Distances(
101104
context.Background(),
102105
tokenizer,
103106
poS,
104-
texts,
105-
entities,
107+
source,
106108
)
107109
if err != nil {
108110
l.Fatal(err)
109111
}
112+
assocentity.Normalize(dists)
113+
assocentity.Threshold(dists, 0.1)
114+
mean := assocentity.Mean(dists)
110115

111-
l.Printf("len(meanN)=%d", len(meanN))
116+
l.Printf("len(mean)=%d", len(mean))
112117

113-
if len(meanN) == 0 {
114-
l.Print("no meanN found, exiting")
118+
if len(mean) == 0 {
119+
l.Print("no mean found, exiting")
115120
os.Exit(0)
116121
}
117122

118123
// Convert to slice to make it sortable
119124
l.Println("convert to slice")
120-
type meanNVal struct {
125+
type meanVal struct {
121126
dist float64
122127
tok tokenize.Token
123128
}
124-
meanNVals := make([]meanNVal, 0)
125-
for tok, dist := range meanN {
129+
meanVals := make([]meanVal, 0)
130+
for tok, dist := range mean {
126131
// TODO: Whitelist: a-zA-Z0-9
127-
meanNVals = append(meanNVals, meanNVal{
132+
meanVals = append(meanVals, meanVal{
128133
dist: dist,
129134
tok: tok,
130135
})
131136
}
132137

133138
// Sort by part of speech first, then by closest distance
134139
l.Println("sort by pos and distance")
135-
sort.Slice(meanNVals, func(i, j int) bool {
136-
if meanNVals[i].tok.PoS != meanNVals[j].tok.PoS {
137-
return meanNVals[i].tok.PoS < meanNVals[j].tok.PoS
140+
sort.Slice(meanVals, func(i, j int) bool {
141+
if meanVals[i].tok.PoS != meanVals[j].tok.PoS {
142+
return meanVals[i].tok.PoS < meanVals[j].tok.PoS
138143
}
139-
return meanNVals[i].dist < meanNVals[j].dist
144+
return meanVals[i].dist < meanVals[j].dist
140145
})
141146

142147
// Top 10 per pos
143148
l.Println("limit top 10")
144-
type topMeanNVal struct {
149+
type topMeanVal struct {
145150
Dist float64 `json:"distance"`
146151
Pos string `json:"pos"`
147152
Text string `json:"text"`
148153
}
149-
topMeanNVals := make([]topMeanNVal, 0)
154+
topMeanVals := make([]topMeanVal, 0) // API result response
150155
poSCounter := make(map[tokenize.PoS]int)
151-
for _, meanNVal := range meanNVals {
156+
for _, meanVal := range meanVals {
152157
// Stop at 10 results per pos
153-
if poSCounter[meanNVal.tok.PoS] >= 10 {
158+
if poSCounter[meanVal.tok.PoS] >= 10 {
154159
continue
155160
}
156161

157-
topMeanNVals = append(topMeanNVals, topMeanNVal{
158-
Dist: meanNVal.dist,
159-
Pos: tokenize.PoSMapStr[meanNVal.tok.PoS],
160-
Text: meanNVal.tok.Text,
162+
topMeanVals = append(topMeanVals, topMeanVal{
163+
Dist: meanVal.dist,
164+
Pos: tokenize.PoSMapStr[meanVal.tok.PoS],
165+
Text: meanVal.tok.Text,
161166
})
162167

163-
poSCounter[meanNVal.tok.PoS] += 1
168+
poSCounter[meanVal.tok.PoS] += 1
164169
}
165-
l.Printf("len(topMeanNVals)=%d", len(topMeanNVals))
170+
l.Printf("len(topMeanVals)=%d", len(topMeanVals))
166171

167172
// Write top 10 to disk
168173
l.Println("write to disk")
169-
file, err := json.MarshalIndent(&topMeanNVals, "", " ")
174+
file, err := json.MarshalIndent(&topMeanVals, "", " ")
170175
if err != nil {
171176
l.Fatal(err)
172177
}

0 commit comments

Comments
 (0)