-
Notifications
You must be signed in to change notification settings - Fork 18
Expand file tree
/
Copy pathget_articles_by_url.py
More file actions
executable file
·78 lines (63 loc) · 1.9 KB
/
get_articles_by_url.py
File metadata and controls
executable file
·78 lines (63 loc) · 1.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python3
#
# Given a list of URLs, fetch the corresponding articles
# from the database and write them to a JSON file.
#
import sys, os, json
# Look for modules in parent directory
sys.path.insert(1, os.path.join(sys.path[0], ".."))
from db import SessionContext
from db.models import Article
from tokenizer import correct_spaces
def tokens2text(tokens):
    """Reassemble plain text from a JSON-encoded token structure.

    *tokens* is a JSON string encoding a list of paragraphs, each a list
    of sentences, each a list of token dicts carrying the token text in
    the "x" field. Returns the reconstructed text with paragraphs
    separated by blank lines, or "" for empty/absent input.
    """
    if not tokens:
        return ""
    paragraphs = json.loads(tokens)
    if not paragraphs:
        return ""
    chunks = []
    for paragraph in paragraphs:
        # Flatten the paragraph's sentences into one token-text sequence
        words = [tok["x"] for sentence in paragraph for tok in sentence]
        # Keep the trailing space so the string handed to correct_spaces()
        # is identical to the original accumulation
        fixed = correct_spaces(" ".join(words) + " ")
        chunks.append(fixed + "\n\n")
    return "".join(chunks)
def main():
    """Read URLs from urls.txt, fetch matching articles from the
    database, and dump them to articles.json."""
    # Read list of URLs from file.
    # BUG FIX: readlines() keeps the trailing "\n" on every line, so the
    # exact-match filter below could never find a stored URL — strip each
    # line before querying.
    with open("urls.txt", "r") as f:
        urls = [line.strip() for line in f]

    articles = []
    with SessionContext(read_only=True) as session:
        for url in urls:
            # Skip blank lines in the input file
            if not url:
                continue
            try:
                # Fetch article from database and add to list
                q = (
                    session.query(
                        Article.url, Article.timestamp, Article.tokens, Article.heading
                    )
                    .filter(Article.url == url)
                    .all()
                )
                # Require exactly one unambiguous match; otherwise skip
                if len(q) != 1:
                    continue
                r = q[0]
                item = {
                    "url": r.url,
                    "timestamp": r.timestamp.isoformat(),
                    "title": r.heading,
                    "text": tokens2text(r.tokens),
                }
                articles.append(item)
            except Exception as e:
                # Best-effort: report and continue with remaining URLs
                print(f"Error processing {url}: {e}")

    with open("articles.json", "w") as f:
        json.dump(articles, f, indent=4, ensure_ascii=False)
# Script entry point: run the export only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()