Skip to content

Commit 11a7955

Browse files
committed
feat: add entities, improve logging
1 parent 8b9c0c7 commit 11a7955

15 files changed

+1764
-1697
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@ module github.com/ndabAP/entityscrape
22

33
go 1.18
44

5-
require github.com/ndabAP/assocentity/v12 v12.0.0
5+
require github.com/ndabAP/assocentity/v12 v12.2.0
66

77
require (
88
cloud.google.com/go/compute v1.12.1 // indirect

go.sum

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,8 @@ github.com/googleapis/gax-go/v2 v2.7.0/go.mod h1:TEop28CZZQ2y+c0VxMUmu1lV+fQx57Q
5252
github.com/joho/godotenv v1.3.0 h1:Zjp+RcGpHhGlrMbJzXTrZZPrWj+1vfm90La1wgB6Bhc=
5353
github.com/ndabAP/assocentity/v12 v12.0.0 h1:fCwj0S+jKlp2l4pYWAjl8dTey1j3RInx22HHGXlZ7FA=
5454
github.com/ndabAP/assocentity/v12 v12.0.0/go.mod h1:4TGqBbxSnNKX7odkQGGSYzdtIeJSKkz5ZSyI/Zf0QmI=
55+
github.com/ndabAP/assocentity/v12 v12.2.0 h1:SDATlGnfKvWR1Ei6WRxYOPk+mv1h0lIDb9xKXAGw9yA=
56+
github.com/ndabAP/assocentity/v12 v12.2.0/go.mod h1:4TGqBbxSnNKX7odkQGGSYzdtIeJSKkz5ZSyI/Zf0QmI=
5557
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
5658
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
5759
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=

main.go

Lines changed: 14 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -84,6 +84,8 @@ func scrape(texts, entities []string, tokenizer tokenize.Tokenizer) error {
8484
entity := entities[0]
8585
log.Printf("entity=%s", entity)
8686

87+
l := log.New(os.Stderr, entity+":", 0)
88+
8789
// Ignore articles without entity
8890
temp := texts[:0]
8991
for _, text := range texts {
@@ -92,9 +94,9 @@ func scrape(texts, entities []string, tokenizer tokenize.Tokenizer) error {
9294
}
9395
}
9496
texts = temp
95-
log.Printf("len(texts)=%d", len(texts))
97+
l.Printf("len(texts)=%d", len(texts))
9698

97-
poS := tokenize.ADJ | tokenize.ADP | tokenize.ADV | tokenize.CONJ | tokenize.DET | tokenize.NOUN | tokenize.NUM | tokenize.PRON | tokenize.VERB
99+
poS := tokenize.ADJ | tokenize.ADP | tokenize.ADV | tokenize.CONJ | tokenize.DET | tokenize.NOUN | tokenize.NUM | tokenize.PRON | tokenize.PRT | tokenize.VERB
98100
meanN, err := assocentity.MeanN(
99101
context.Background(),
100102
tokenizer,
@@ -103,18 +105,18 @@ func scrape(texts, entities []string, tokenizer tokenize.Tokenizer) error {
103105
entities,
104106
)
105107
if err != nil {
106-
log.Fatal(err)
108+
l.Fatal(err)
107109
}
108110

109-
log.Printf("len(meanN)=%d", len(meanN))
111+
l.Printf("len(meanN)=%d", len(meanN))
110112

111113
if len(meanN) == 0 {
112-
log.Print("no meanN found, exiting")
114+
l.Print("no meanN found, exiting")
113115
os.Exit(0)
114116
}
115117

116118
// Convert to slice to make it sortable
117-
log.Println("convert to slice")
119+
l.Println("convert to slice")
118120
type meanNVal struct {
119121
dist float64
120122
tok tokenize.Token
@@ -129,7 +131,7 @@ func scrape(texts, entities []string, tokenizer tokenize.Tokenizer) error {
129131
}
130132

131133
// Sort by closest distance
132-
log.Println("sort by pos and distance")
134+
l.Println("sort by pos and distance")
133135
sort.Slice(meanNVals, func(i, j int) bool {
134136
if meanNVals[i].tok.PoS != meanNVals[j].tok.PoS {
135137
return meanNVals[i].tok.PoS < meanNVals[j].tok.PoS
@@ -138,7 +140,7 @@ func scrape(texts, entities []string, tokenizer tokenize.Tokenizer) error {
138140
})
139141

140142
// Top 10 per pos
141-
log.Println("limit top 10")
143+
l.Println("limit top 10")
142144
type topMeanNVal struct {
143145
Dist float64 `json:"distance"`
144146
Pos string `json:"pos"`
@@ -160,18 +162,18 @@ func scrape(texts, entities []string, tokenizer tokenize.Tokenizer) error {
160162

161163
poSCounter[meanNVal.tok.PoS] += 1
162164
}
163-
log.Printf("len(topMeanNVals)=%d", len(topMeanNVals))
165+
l.Printf("len(topMeanNVals)=%d", len(topMeanNVals))
164166

165167
// Write top 10 to disk
166-
log.Println("write to disk")
168+
l.Println("write to disk")
167169
file, err := json.MarshalIndent(&topMeanNVals, "", " ")
168170
if err != nil {
169-
log.Fatal(err)
171+
l.Fatal(err)
170172
}
171173
name := url.QueryEscape(strings.ToLower(entity))
172174
path := filepath.Join("web/public", name+".json")
173175
if err := os.WriteFile(path, file, 0600); err != nil {
174-
log.Fatal(err)
176+
l.Fatal(err)
175177
}
176178

177179
return nil

source/entities.txt

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
1+
Barack Obama,Obama
2+
Brexit
13
Donald Trump,Trump
24
ISIS
3-
Barack Obama,Obama
5+
North Korea,North Korean,North Korea
46
Vladimir Putin,Putin
5-
Xi Jinping,Xi,Jinping
6-
North Korea,North Korean,North Korea
7+
Xi Jinping,Xi,Jinping

0 commit comments

Comments (0)