-
Notifications
You must be signed in to change notification settings - Fork 18
Expand file tree
/
Copy pathdbclean.py
More file actions
executable file
·94 lines (71 loc) · 2.9 KB
/
dbclean.py
File metadata and controls
executable file
·94 lines (71 loc) · 2.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python
# type: ignore
"""
Greynir: Natural language processing for Icelandic
Copyright (C) 2023 Miðeind ehf.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/.
Utility script that inspects articles in Greynir's database
and removes those that:
* Don't contain any sentences
* Are duplicates (e.g. https vs http URLs)
* Are non-Icelandic (not yet implemented)
* Contain lots of "chaff", i.e. many very short sentences, probably caused by scraper issues (not yet implemented)
"""
import os
import sys
import re
from random import shuffle
# Allow this script to be run from within the tools subdirectory:
# take the directory containing this file, drop a trailing "tools"
# path component if there is one, and add the result to sys.path
basepath = os.path.dirname(os.path.realpath(__file__))
_TOOLS = os.sep + "tools"
if basepath.endswith(_TOOLS):
    # Keep everything up to, but not including, the "<sep>tools" suffix
    basepath = basepath[: len(basepath) - len(_TOOLS)]
sys.path.append(basepath)
from settings import Settings, ConfigError
# from article import Article
from db import SessionContext
from db.models import Article as ArticleModel
from reynir.bintokenizer import tokens_are_foreign
def main():
    """Delete unwanted articles from the article table.

    Reads the configuration file config/GreynirSimple.conf (relative to
    ``basepath``) and then, within one ``SessionContext(commit=True)``:

    * deletes every article whose ``num_sentences`` is 0, and
    * for every article whose URL starts with ``https://``, deletes any
      article stored under the same URL with the ``http://`` scheme.

    The number of rows deleted by each step is printed to stdout.
    Removal of non-Icelandic articles and of "chaff" articles is not
    implemented yet.

    Raises:
        SystemExit: with exit status 1 if reading the configuration
            raises ConfigError.
    """
    try:
        # Read configuration file
        Settings.read(os.path.join(basepath, "config", "GreynirSimple.conf"))
    except ConfigError as e:
        print("Configuration error: {0}".format(e))
        # sys.exit() instead of quit(): quit() is a helper installed by the
        # site module for interactive use, and without an argument it ends
        # the process with status 0, which would hide the failure from
        # calling shells and schedulers
        sys.exit(1)

    with SessionContext(commit=True) as session:
        # Zero sentences
        print("Deleting all articles with zero sentences")
        res = session.execute(
            ArticleModel.table().delete().where(ArticleModel.num_sentences == 0)
        )
        print(str(res.rowcount) + " articles deleted")

        # Non-Icelandic
        # TODO: Implement me!

        # Duplicates
        # For each https article, check whether there is a corresponding
        # article URL with http URI scheme
        dupl = 0
        q = session.query(ArticleModel.url).filter(ArticleModel.url.like("https://%"))
        # The result list is fetched in full (.all()) before any DELETE
        # statements are issued on the same session
        for r in q.all():
            # Same URL, but with the http scheme
            url = re.sub(r"^https://", r"http://", r.url)
            res = session.execute(
                ArticleModel.table().delete().where(ArticleModel.url == url)
            )
            dupl += res.rowcount
        print("{0} duplicate URLs w. HTTP scheme removed".format(dupl))

        # Chaff
        # ???
        # TODO: Implement me!
# Run the cleanup only when this file is executed as a script,
# not when it is imported as a module
if __name__ == "__main__":
    main()