@@ -59,7 +59,7 @@ func main() {
59
59
log .Printf ("len(texts)=%d" , len (texts ))
60
60
61
61
// Get mean distance per entity
62
- log .Println ("get meanN " )
62
+ log .Println ("get mean " )
63
63
nlpTok := nlp .NewNLPTokenizer (* gogSvcLocF , nlp .AutoLang )
64
64
var wg sync.WaitGroup
65
65
for _ , entities := range entities {
@@ -86,7 +86,7 @@ func scrape(texts, entities []string, tokenizer tokenize.Tokenizer) error {
86
86
87
87
l := log .New (os .Stderr , entity + ":" , 0 )
88
88
89
- // Ignore articles without entity
89
+ // Ignore articles without the entity. This cheap substring check is a pre-filter to spare the API
90
90
temp := texts [:0 ]
91
91
for _ , text := range texts {
92
92
if strings .Contains (text , entity ) {
@@ -96,77 +96,82 @@ func scrape(texts, entities []string, tokenizer tokenize.Tokenizer) error {
96
96
texts = temp
97
97
l .Printf ("len(texts)=%d" , len (texts ))
98
98
99
- poS := tokenize .ADJ | tokenize .ADP | tokenize .ADV | tokenize .CONJ | tokenize .DET | tokenize .NOUN | tokenize .NUM | tokenize .PRON | tokenize .PRT | tokenize .VERB
100
- meanN , err := assocentity .MeanN (
99
+ var (
100
+ poS = tokenize .ADJ | tokenize .ADP | tokenize .ADV | tokenize .CONJ | tokenize .DET | tokenize .NOUN | tokenize .NUM | tokenize .PRON | tokenize .PRT | tokenize .VERB
101
+ source = assocentity .NewSource (entities , texts )
102
+ )
103
+ dists , err := assocentity .Distances (
101
104
context .Background (),
102
105
tokenizer ,
103
106
poS ,
104
- texts ,
105
- entities ,
107
+ source ,
106
108
)
107
109
if err != nil {
108
110
l .Fatal (err )
109
111
}
112
+ assocentity .Normalize (dists )
113
+ assocentity .Threshold (dists , 0.1 )
114
+ mean := assocentity .Mean (dists )
110
115
111
- l .Printf ("len(meanN )=%d" , len (meanN ))
116
+ l .Printf ("len(mean )=%d" , len (mean ))
112
117
113
- if len (meanN ) == 0 {
114
- l .Print ("no meanN found, exiting" )
118
+ if len (mean ) == 0 {
119
+ l .Print ("no mean found, exiting" )
115
120
os .Exit (0 )
116
121
}
117
122
118
123
// Convert to slice to make it sortable
119
124
l .Println ("convert to slice" )
120
- type meanNVal struct {
125
+ type meanVal struct {
121
126
dist float64
122
127
tok tokenize.Token
123
128
}
124
- meanNVals := make ([]meanNVal , 0 )
125
- for tok , dist := range meanN {
129
+ meanVals := make ([]meanVal , 0 )
130
+ for tok , dist := range mean {
126
131
// TODO: Whitelist: a-zA-Z0-9
127
- meanNVals = append (meanNVals , meanNVal {
132
+ meanVals = append (meanVals , meanVal {
128
133
dist : dist ,
129
134
tok : tok ,
130
135
})
131
136
}
132
137
133
138
// Sort by part of speech first, then by closest distance
134
139
l .Println ("sort by pos and distance" )
135
- sort .Slice (meanNVals , func (i , j int ) bool {
136
- if meanNVals [i ].tok .PoS != meanNVals [j ].tok .PoS {
137
- return meanNVals [i ].tok .PoS < meanNVals [j ].tok .PoS
140
+ sort .Slice (meanVals , func (i , j int ) bool {
141
+ if meanVals [i ].tok .PoS != meanVals [j ].tok .PoS {
142
+ return meanVals [i ].tok .PoS < meanVals [j ].tok .PoS
138
143
}
139
- return meanNVals [i ].dist < meanNVals [j ].dist
144
+ return meanVals [i ].dist < meanVals [j ].dist
140
145
})
141
146
142
147
// Top 10 per pos
143
148
l .Println ("limit top 10" )
144
- type topMeanNVal struct {
149
+ type topMeanVal struct {
145
150
Dist float64 `json:"distance"`
146
151
Pos string `json:"pos"`
147
152
Text string `json:"text"`
148
153
}
149
- topMeanNVals := make ([]topMeanNVal , 0 )
154
+ topMeanVals := make ([]topMeanVal , 0 ) // API result response
150
155
poSCounter := make (map [tokenize.PoS ]int )
151
- for _ , meanNVal := range meanNVals {
156
+ for _ , meanVal := range meanVals {
152
157
// Stop at 10 results per pos
153
- if poSCounter [meanNVal .tok .PoS ] >= 10 {
158
+ if poSCounter [meanVal .tok .PoS ] >= 10 {
154
159
continue
155
160
}
156
161
157
- topMeanNVals = append (topMeanNVals , topMeanNVal {
158
- Dist : meanNVal .dist ,
159
- Pos : tokenize .PoSMapStr [meanNVal .tok .PoS ],
160
- Text : meanNVal .tok .Text ,
162
+ topMeanVals = append (topMeanVals , topMeanVal {
163
+ Dist : meanVal .dist ,
164
+ Pos : tokenize .PoSMapStr [meanVal .tok .PoS ],
165
+ Text : meanVal .tok .Text ,
161
166
})
162
167
163
- poSCounter [meanNVal .tok .PoS ] += 1
168
+ poSCounter [meanVal .tok .PoS ] += 1
164
169
}
165
- l .Printf ("len(topMeanNVals )=%d" , len (topMeanNVals ))
170
+ l .Printf ("len(topMeanVals )=%d" , len (topMeanVals ))
166
171
167
172
// Write top 10 to disk
168
173
l .Println ("write to disk" )
169
- file , err := json .MarshalIndent (& topMeanNVals , "" , " " )
174
+ file , err := json .MarshalIndent (& topMeanVals , "" , " " )
170
175
if err != nil {
171
176
l .Fatal (err )
172
177
}
0 commit comments