Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
e405394
chore: wip
ndabAP Jul 12, 2025
918b6f6
chore: wip
ndabAP Jul 13, 2025
839679d
chore: wip
ndabAP Jul 13, 2025
96fe608
chore: wip
ndabAP Jul 17, 2025
dcaeb3a
chore: wip
ndabAP Jul 17, 2025
0a524ba
chore: wip
ndabAP Jul 18, 2025
9ce9b3b
chore: wip
ndabAP Jul 20, 2025
4844300
chore: wip
ndabAP Jul 26, 2025
a742862
chore: wip
ndabAP Jul 27, 2025
bc7a9e4
chore: wip
ndabAP Jul 27, 2025
fc4cd2f
chore: wip
ndabAP Jul 27, 2025
3db3f99
chore: wip
ndabAP Jul 31, 2025
940c505
chore: wip
ndabAP Jul 31, 2025
efe4885
chore: wip
ndabAP Jul 31, 2025
0177f35
chore: wip
ndabAP Aug 3, 2025
8f6d79b
chore: wip
ndabAP Aug 3, 2025
6b1d999
chore: wip
ndabAP Aug 3, 2025
8cc0fa5
chore: wip
ndabAP Aug 7, 2025
27b8598
chore: wip
ndabAP Aug 7, 2025
41efead
chore: wip
ndabAP Aug 9, 2025
87883ec
chore: wip
ndabAP Aug 23, 2025
02e259f
chore: wip
ndabAP Aug 30, 2025
128671c
chore: wip
ndabAP Aug 30, 2025
772323b
chore: wip
ndabAP Aug 31, 2025
39210a0
chore: wip
ndabAP Sep 1, 2025
21a62ad
chore: wip
ndabAP Sep 1, 2025
6030dbd
chore: wip
ndabAP Sep 3, 2025
5a57e16
chore: wip
ndabAP Sep 4, 2025
6172991
chore: wip
ndabAP Sep 4, 2025
35fe16f
chore: wip
ndabAP Sep 6, 2025
559d11b
chore: wip
ndabAP Sep 6, 2025
3011dd8
chore: wip
ndabAP Sep 10, 2025
c8bea58
chore: wip
ndabAP Sep 10, 2025
adacca1
chore: wip
ndabAP Sep 11, 2025
5d8bfd7
chore: wip
ndabAP Sep 11, 2025
3be88e0
chore: wip
ndabAP Sep 12, 2025
66f4d15
chore: wip
ndabAP Sep 12, 2025
7917f9c
chore: wip
ndabAP Sep 13, 2025
322e025
chore: wip
ndabAP Sep 13, 2025
2f5b5e7
chore: wip
ndabAP Sep 14, 2025
a1bc344
chore: wip
ndabAP Sep 19, 2025
3df6f8b
chore: wip
ndabAP Sep 24, 2025
4ea4c6b
chore: wip
ndabAP Sep 24, 2025
5f57574
chore: wip
ndabAP Sep 27, 2025
6b7a294
chore: wip
ndabAP Sep 27, 2025
9adc9e7
chore: wip
ndabAP Sep 29, 2025
567838f
chore: wip
ndabAP Oct 6, 2025
2083625
chore: wip
ndabAP Oct 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
.env
node_modules
source/*.zip
.vscode
__debug_bin*
20 changes: 0 additions & 20 deletions .vscode/launch.json

This file was deleted.

2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2023 Julian Claus
Copyright (c) 2025 Julian Claus

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
21 changes: 21 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Pinned golangci-lint release and the location it is installed to.
GOLANGCILINT_VERSION := v2.5.0
GO_PATH := $(shell go env GOPATH)/bin
GOLANGCILINT_BIN := $(GO_PATH)/golangci-lint

# None of these targets produce a file of the same name, so declare all of
# them phony (including the aggregate "all" target).
.PHONY: all lint fmt test

all: lint fmt test

# Installs golangci-lint into $(GO_PATH) unless an executable reporting
# $(GOLANGCILINT_VERSION) is already present.
# NOTE(review): this is a file target without prerequisites, so make only runs
# the recipe when the binary does not exist yet; the version check below does
# not trigger an upgrade of an already installed binary.
$(GOLANGCILINT_BIN):
	@if ! test -x $(GOLANGCILINT_BIN) || ! $(GOLANGCILINT_BIN) --version | grep -q $(GOLANGCILINT_VERSION); then \
		curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(GO_PATH) $(GOLANGCILINT_VERSION); \
	fi

# Formats all packages.
fmt: $(GOLANGCILINT_BIN)
	$(GOLANGCILINT_BIN) fmt ./... -v

# Lints all packages and applies automatic fixes; extra flags can be passed
# through LINT_FLAGS.
lint: $(GOLANGCILINT_BIN)
	@$(GOLANGCILINT_BIN) run $(LINT_FLAGS) ./... --fix -v

# Runs the tests in short mode.
test:
	go test -v ./... -short
11 changes: 0 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,2 @@
# entityscrape

This is a social experiment which shows the mean distance between parts of speech
(e.g. adjectives or nouns) in news articles (like from NBC or CNN) and randomly
selected entities (like Xi Jinping or ISIS). Go straight to the
[website](https://ndabap.github.io/entityscrape/index.html)!

The Go package [assocentity](https://github.com/ndabAP/assocentity) was
used to create this experiment. You can create new ones by updating the
`source/entities.txt` file and running the CLI with the provided Visual Studio Code
debug configuration.

**Data source**: [dai, tianru, 2017, "News Articles", Harvard Dataverse, V1](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/GMFCTR).
192 changes: 192 additions & 0 deletions cases/conduct.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
package cases

import (
	"bufio"
	"context"
	"errors"
	"log/slog"
	"math/rand/v2"
	"os"
	"strings"
	"sync"
	"unicode"

	"github.com/ndabAP/assocentity"
	"github.com/ndabAP/assocentity/tokenize"
	"github.com/ndabAP/assocentity/tokenize/nlp"
	"github.com/ndabAP/entityscrape/translator"
	"golang.org/x/text/language"
)

// Conduct runs the study for every subject in study.Subjects. Per subject it
// analyses the configured files, collects and aggregates the samples, and
// reports the aggregation to a writer obtained from the study's store.
//
// Conduct returns ctx.Err() when ctx is done before a subject is started and
// otherwise stops at, and returns, the first error of any step.
func (study study[samples, aggregated]) Conduct(ctx context.Context) error {
	slog.Debug("processing subjects", "n", len(study.Subjects))

	google := translator.NewGoogle(ctx, GoogleCloudSvcAccountKey)

	// slug turns a subject rune into its writer-prefix form: ASCII letters are
	// lower-cased, spaces and hyphens become underscores, and every other rune
	// is dropped.
	slug := func(r rune) rune {
		switch {
		case 'a' <= r && r <= 'z':
			return r
		case 'A' <= r && r <= 'Z':
			return unicode.ToLower(r)
		case r == ' ' || r == '-':
			return '_'
		}
		return -1
	}

	for subject, cfg := range study.Subjects {
		if err := ctx.Err(); err != nil {
			return err
		}

		slog.Debug("processing analyses", "subject", subject)
		lang := cfg.Language
		tokenizer := nlp.New(GoogleCloudSvcAccountKey, lang.String())
		analyses, err := study.analysis(
			ctx,
			cfg.Entity,
			cfg.Filenames,
			cfg.Parser,
			cfg.Reduct,
			tokenizer,
			cfg.Feats,
		)
		if err != nil {
			return err
		}
		slog.Debug("analysis done")

		slog.Debug("collecting samples")
		collected := study.collect(analyses)
		slog.Debug("sample collection done")

		slog.Debug("aggregating samples")
		agg := study.aggregate(collected)
		slog.Debug("aggregation done")

		slog.Debug("reporting aggregation")
		// translate returns w in English; English input is passed through
		// untouched.
		translate := func(w []string) ([]string, error) {
			if lang == language.English {
				slog.Debug("skipping translation for English")
				return w, nil
			}

			return google.Translate(w, lang, language.English)
		}
		// write is a function of its own so the writer is closed at the end of
		// each iteration instead of when Conduct returns.
		write := func() error {
			writer, err := study.store.NewWriter(strings.Map(slug, subject), cfg.Ext)
			if err != nil {
				return err
			}
			//nolint:errcheck
			defer writer.Close()

			if err := study.report(agg, translate, writer); err != nil {
				return err
			}

			return nil
		}
		if err := write(); err != nil {
			return err
		}
		slog.Debug("reporting done", "subject", subject)
	}

	return nil
}

// analysis parses filenames with parser, samples the resulting texts, and
// runs the assocentity analyses for entity over the sampled texts.
//
// A producer goroutine opens the files one after another and lets parser feed
// the texts into a channel. A consumer goroutine keeps a text only if a random
// draw in [0, 100) is below SampleRate and, when reduct is set, replaces it
// with the result of study.reduct; texts for which study.reduct reports
// errEntityNotFound are skipped. bufio.ErrTooLong errors from parser are
// ignored.
//
// analysis returns ctx.Err() when ctx is done before parsing has finished, or
// the first error reported by either goroutine. Neither goroutine can block
// on a channel once analysis has returned: the producer stops before opening
// the next file and the consumer keeps draining what is still being parsed.
func (study study[samples, aggregated]) analysis(
	ctx context.Context,
	entity,
	filenames []string,
	parser Parser,
	reduct bool,
	tokenizer tokenize.Tokenizer,
	feats tokenize.Features,
) (
	assocentity.Analyses,
	error,
) {
	slog.Debug("parsing files", "n", len(filenames))
	if reduct {
		slog.Debug("entity reduct enabled")
	}

	// parseCtx is canceled on the first failure and when analysis returns, so
	// both goroutines learn that their work is no longer needed.
	parseCtx, stop := context.WithCancel(ctx)
	defer stop()

	var (
		texts = make([]string, 0, len(filenames))

		textChan = make(chan []byte, 50)
		// One slot per goroutine: each of them sends at most one error, so a
		// send never blocks, even if nobody receives anymore.
		errChan = make(chan error, 2)

		wg sync.WaitGroup
	)
	wg.Add(2)

	// Consumer
	go func() {
		defer wg.Done()

		for text := range textChan {
			if parseCtx.Err() != nil {
				// Failed or canceled: keep draining so that the parser is
				// never stuck sending into a full channel.
				continue
			}

			n := rand.Uint64N(100)
			if n >= SampleRate {
				continue
			}

			var err error
			if reduct {
				text, err = study.reduct(text, entity)
				if errors.Is(err, errEntityNotFound) {
					continue
				}
			}
			if err != nil {
				errChan <- err
				stop()
				continue
			}
			texts = append(texts, string(text))
		}
	}()
	// Producer
	go func() {
		defer wg.Done()
		defer close(textChan)

		for _, filename := range filenames {
			if parseCtx.Err() != nil {
				return
			}

			file, err := os.Open(filename)
			if err != nil {
				errChan <- err
				stop()
				return
			}
			// The parser is always ranged to completion, as before; only the
			// first relevant error of the file is kept.
			var parseErr error
			for err := range parser(file, textChan) {
				if errors.Is(err, bufio.ErrTooLong) {
					continue
				}
				if parseErr == nil {
					parseErr = err
				}
			}
			// Read-only file; a close error carries no information we act on.
			_ = file.Close()
			if parseErr != nil {
				errChan <- parseErr
				stop()
				return
			}
		}
	}()
	// Closing errChan signals successful completion. It happens only after
	// both goroutines are done, so no send can hit a closed channel and texts
	// is no longer written to when it is read below.
	go func() {
		wg.Wait()
		close(errChan)
	}()

	select {
	case <-ctx.Done():
		return assocentity.Analyses{}, ctx.Err()

	case err := <-errChan:
		if err != nil {
			return assocentity.Analyses{}, err
		}
	}
	slog.Debug("texts sampled and parsed", "n", len(texts))

	slog.Debug("creating analyses")
	src := assocentity.NewSource(entity, texts)
	return src.Analyses(ctx, tokenizer, feats, assocentity.NFKC)
}
35 changes: 35 additions & 0 deletions cases/corpus.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package cases

import (
"io/fs"
"path/filepath"
)

// corpusDir is the root directory of the corpus. It is empty until
// SetCorpusRootDir has been called.
var corpusDir string

// SetCorpusRootDir sets the corpus root to the "corpus" directory below base.
func SetCorpusRootDir(base string) {
	const dirname = "corpus"
	corpusDir = filepath.Join(base, dirname)
}

// GetCorpusRootDir returns the corpus root set by SetCorpusRootDir.
func GetCorpusRootDir() string {
	root := corpusDir
	return root
}

// WalkCorpus walks the directory corpus below the corpus root and calls fn
// with the path of every file found. Directories themselves and paths whose
// extension is ".gitignore" or ".gitkeep" are not passed to fn. The walk
// stops at, and returns, the first error reported by the walk or by fn.
func WalkCorpus(corpus string, fn func(filename string) error) error {
	visit := func(path string, d fs.DirEntry, err error) error {
		switch {
		case err != nil:
			return err
		case d.IsDir():
			return nil
		}

		if ext := filepath.Ext(path); ext == ".gitignore" || ext == ".gitkeep" {
			return nil
		}
		return fn(path)
	}

	return filepath.WalkDir(filepath.Join(corpusDir, corpus), visit)
}
Loading