-
Notifications
You must be signed in to change notification settings - Fork 18
Expand file tree
/
Copy path missing_nouns.py
More file actions
executable file
·56 lines (47 loc) · 1.66 KB
/
missing_nouns.py
File metadata and controls
executable file
·56 lines (47 loc) · 1.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python
"""
Read tokens of all articles in the database, look for nouns
and check if they can be found in vocabulary. If not,
add them to a dictionary and spit it out.
"""
import json, sys, os
from datetime import datetime, timezone
from collections import defaultdict
from pprint import pprint
from typing import Dict
# Look for modules in parent directory
sys.path.insert(1, os.path.join(sys.path[0], ".."))
from db import SessionContext
from db.models import Article
with SessionContext(read_only=True) as session:
    # Select all processed articles that have a parse tree, a timestamp
    # in the past, a nonempty heading and at least one parsed sentence
    q = (
        session.query(Article.id, Article.timestamp, Article.tokens)  # type: ignore
        # Use the SQLAlchemy .isnot() idiom for IS NOT NULL instead of
        # the linter-unfriendly '!= None' comparison
        .filter(Article.tree.isnot(None))
        .filter(Article.timestamp.isnot(None))
        .filter(Article.timestamp <= datetime.now(timezone.utc))  # type: ignore
        .filter(Article.heading > "")  # type: ignore
        .filter(Article.num_sentences > 0)  # type: ignore
    )
    # Occurrence count of each missing noun, keyed by its word form
    nouns: Dict[str, int] = defaultdict(int)
    # yield_per(100) streams the result set in batches instead of
    # materializing all articles in memory at once
    for i, a in enumerate(q.yield_per(100)):
        # Progress indicator: flush explicitly, since a bare "\r" with no
        # newline otherwise sits in the stdout buffer and never shows
        print(f"{i}\r", end="", flush=True)
        if not a.tokens:
            continue
        tokens = json.loads(a.tokens)
        # The token JSON is a list of paragraphs, each a list of
        # sentences, each a list of token dicts
        for p in tokens:
            for s in p:
                for t in s:
                    # A candidate is a token whose terminal ("t") starts
                    # with "no_" (a noun terminal) but which carries no
                    # "m" or "v" entry — presumably meaning it was not
                    # found in the vocabulary (TODO: confirm against the
                    # tokenizer's output schema)
                    if (
                        "t" in t
                        and t["t"].startswith("no_")
                        and not t.get("m")
                        and not t.get("v")
                    ):
                        nouns[t["x"]] += 1
    # Terminate the progress line so the output below starts cleanly
    # instead of overwriting the counter
    print()
    # Emit the missing nouns in ascending order of frequency
    ordered = sorted(nouns.items(), key=lambda kv: kv[1])
    pprint(ordered)