Skip to content

Commit e405394

Browse files
committed
chore: wip
1 parent 55d254c commit e405394

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

61 files changed

+851
-86054
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
.env
22
node_modules
3-
source/*.zip
3+
.vscode

.vscode/launch.json

Lines changed: 0 additions & 20 deletions
This file was deleted.

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (c) 2023 Julian Claus
3+
Copyright (c) 2025 Julian Claus
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

README.md

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,2 @@
11
# entityscrape
22

3-
This is a social experiment which shows the mean distance between parts of speech
4-
(e. g. adjectives or nouns) in news articles (like from NBC or CNN) and randomly
5-
selected entities (like Xi Jinping or ISIS). Go straight to the
6-
[website](https://ndabap.github.io/entityscrape/index.html)!
7-
8-
The Go package [assocentity](https://github.com/ndabAP/assocentity) was
9-
used for creating this experiment. You can create new ones by updating the
10-
`source/entities.txt` file and run the CLI with the provided Visual Studio Code
11-
debug configuration.
12-
13-
**Data source**: [dai, tianru, 2017, "News Articles", Harvard Dataverse, V1](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/GMFCTR).

cases/isob/case.go

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
// International sentiment of brands
2+
package isob
3+
4+
import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log/slog"
	"path"
	"slices"
	"sort"

	"cloud.google.com/go/language/apiv1/languagepb"
	"github.com/ndabAP/assocentity"
	"github.com/ndabAP/assocentity/dependency"
	"github.com/ndabAP/assocentity/tokenize"
	"github.com/ndabAP/entityscrape/cases"
	"github.com/ndabAP/entityscrape/parser"
	"golang.org/x/text/language"
)
21+
22+
var logger = slog.Default()
23+
24+
type (
25+
sample struct {
26+
*tokenize.Token
27+
from bool
28+
}
29+
aggregate struct {
30+
Word [2]string `json:"word"`
31+
PoS string `json:"pos"`
32+
N int `json:"n"`
33+
}
34+
aggregates []aggregate
35+
)
36+
37+
var (
38+
corpus = "gpsc"
39+
40+
collector = func(analyses assocentity.Analyses) []sample {
41+
var (
42+
entities = analyses.Forest().Entities()
43+
samples = make([]sample, 0)
44+
)
45+
walker := func(
46+
from,
47+
to *tokenize.Token,
48+
_ tokenize.DependencyEdgeLabel,
49+
tree dependency.Tree,
50+
) bool {
51+
switch {
52+
case slices.Contains(entities, to):
53+
switch from.PartOfSpeech.Tag {
54+
case tokenize.PartOfSpeechTagVerb, tokenize.PartOfSpeechTagNoun, tokenize.PartOfSpeechTagAdj:
55+
slog.Debug("adding sample", "word", from.Text.Content)
56+
samples = append(samples, sample{
57+
Token: from,
58+
})
59+
default:
60+
// Skip
61+
}
62+
63+
return true
64+
65+
case slices.Contains(entities, from):
66+
switch from.PartOfSpeech.Tag {
67+
case tokenize.PartOfSpeechTagVerb, tokenize.PartOfSpeechTagNoun, tokenize.PartOfSpeechTagAdj:
68+
slog.Debug("adding sample", "word", from.Text.Content)
69+
samples = append(samples, sample{
70+
Token: from,
71+
from: true,
72+
})
73+
default:
74+
}
75+
}
76+
77+
return true
78+
}
79+
analyses.Forest().Dependencies(walker)
80+
81+
return samples
82+
}
83+
aggregator = func(samples []sample) aggregates {
84+
aggregates := make(aggregates, 0, len(samples))
85+
for _, sample := range samples {
86+
w := sample.Lemma
87+
i := slices.IndexFunc(aggregates, func(aggregate aggregate) bool {
88+
return w == aggregate.Word[0]
89+
})
90+
switch i {
91+
case -1:
92+
var (
93+
word = [2]string{w}
94+
pos = languagepb.PartOfSpeech_Tag_name[int32(sample.PartOfSpeech.Tag)]
95+
n = 1
96+
)
97+
aggregates = append(aggregates, aggregate{
98+
Word: word,
99+
PoS: pos,
100+
N: n,
101+
})
102+
// Found
103+
default:
104+
aggregates[i].N++
105+
}
106+
}
107+
108+
// Top n sorted
109+
const limit = 10
110+
sort.Slice(aggregates, func(i, j int) bool {
111+
return aggregates[i].N > aggregates[j].N
112+
})
113+
if len(aggregates) > limit {
114+
aggregates = aggregates[:limit]
115+
}
116+
117+
return aggregates
118+
}
119+
reporter = func(aggregates aggregates, translate cases.Translate, writer io.Writer) error {
120+
// Collect words to translate.
121+
words := make([]string, 0, len(aggregates))
122+
for _, aggregate := range aggregates {
123+
words = append(words, aggregate.Word[0])
124+
}
125+
w, err := translate(words)
126+
if err != nil {
127+
return err
128+
}
129+
// Add translated words back.
130+
for i := range aggregates {
131+
aggregates[i].Word[1] = w[i]
132+
}
133+
134+
return json.NewEncoder(writer).Encode(&aggregates)
135+
}
136+
)
137+
138+
func Conduct(ctx context.Context) error {
139+
select {
140+
case <-ctx.Done():
141+
return ctx.Err()
142+
default:
143+
}
144+
145+
slog.Debug("conducting national sentiment of political speeches")
146+
return conduct(ctx)
147+
}
148+
149+
func conduct(ctx context.Context) error {
150+
study := cases.NewStudy(corpus, collector, aggregator, reporter)
151+
152+
feats := tokenize.FeatureSyntax
153+
154+
// Deutschland
155+
{
156+
lang := language.German
157+
entity := []string{"Deutschland", "Deutschlands", "Deutschlande"}
158+
159+
// GPSC
160+
{
161+
slog.Debug("adding german political speeches corpus")
162+
var (
163+
filenames = []string{
164+
path.Join("German-Political-Speeches-Corpus", "Bundesregierung.xml"),
165+
}
166+
parser = parser.GPSC
167+
)
168+
study.Subjects["Deutschland"] = cases.Analyses{
169+
Entity: entity,
170+
Feats: feats,
171+
Filenames: filenames,
172+
Language: lang,
173+
Parser: parser,
174+
Ext: "json",
175+
}
176+
}
177+
}
178+
179+
if err := study.Conduct(ctx); err != nil {
180+
return err
181+
}
182+
183+
return nil
184+
}

0 commit comments

Comments
 (0)