-
Notifications
You must be signed in to change notification settings - Fork 18
Expand file tree
/
Copy pathget_articles_by_url.py
More file actions
executable file
·78 lines (63 loc) · 1.9 KB
/
get_articles_by_url.py
File metadata and controls
executable file
·78 lines (63 loc) · 1.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python3
#
# Given a list of URLs, fetch the corresponding articles
# from the database and write them to a JSON file.
#
import sys, os, json
# Look for modules in parent directory
sys.path.insert(1, os.path.join(sys.path[0], ".."))
from db import SessionContext
from db.models import Article
from tokenizer import correct_spaces
def tokens2text(tokens):
    """Reassemble plain text from a JSON-encoded token structure.

    *tokens* is a JSON string encoding a list of paragraphs, each a list
    of sentences, each a list of token dicts carrying the token text in
    the "x" field. Returns the reconstructed text with paragraphs
    separated by blank lines, or "" for empty/absent input.
    """
    if not tokens:
        return ""
    paragraphs = json.loads(tokens)
    if not paragraphs:
        return ""
    chunks = []
    for paragraph in paragraphs:
        # Flatten the paragraph's sentences into one token-text sequence
        words = [tok["x"] for sentence in paragraph for tok in sentence]
        # Keep the trailing space so the string handed to correct_spaces()
        # is identical to the original accumulation
        fixed = correct_spaces(" ".join(words) + " ")
        chunks.append(fixed + "\n\n")
    return "".join(chunks)
def main():
    """Read URLs from urls.txt, fetch matching articles from the
    database, and dump them to articles.json."""
    # Read list of URLs from file.
    # BUG FIX: readlines() keeps the trailing "\n" on every line, so the
    # exact-match filter below could never find a stored URL — strip each
    # line before querying.
    with open("urls.txt", "r") as f:
        urls = [line.strip() for line in f]

    articles = []
    with SessionContext(read_only=True) as session:
        for url in urls:
            # Skip blank lines in the input file
            if not url:
                continue
            try:
                # Fetch article from database and add to list
                q = (
                    session.query(
                        Article.url, Article.timestamp, Article.tokens, Article.heading
                    )
                    .filter(Article.url == url)
                    .all()
                )
                # Require exactly one unambiguous match; otherwise skip
                if len(q) != 1:
                    continue
                r = q[0]
                item = {
                    "url": r.url,
                    "timestamp": r.timestamp.isoformat(),
                    "title": r.heading,
                    "text": tokens2text(r.tokens),
                }
                articles.append(item)
            except Exception as e:
                # Best-effort: report and continue with remaining URLs
                print(f"Error processing {url}: {e}")

    with open("articles.json", "w") as f:
        json.dump(articles, f, indent=4, ensure_ascii=False)
# Script entry point: run the export only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()