
Commit 57ec054

Silvan-WMDE authored and rti committed
auto fetch gs-wiki articles
1 parent 4100064 commit 57ec054

7 files changed: 250 additions, 42 deletions
.gitignore

Lines changed: 3 additions & 0 deletions
@@ -27,3 +27,6 @@ __pycache__/
 
 # macOS
 .DS_Store
+
+# logs
+*.log

README.md

Lines changed: 3 additions & 0 deletions
@@ -10,6 +10,9 @@ To build and run the container locally with hot reload on python files do:
 ```
 DOCKER_BUILDKIT=1 docker build . -t gbnc
 docker run \
+    --env DOCUMENTS_TOC=json_input/gs-wiki.json \
+    --env GSWIKI_USER=<bot_username> \
+    --env GSWIKI_PW=<bot_password> \
     --env HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \
     --volume "$(pwd)/gswikichat":/workspace/gswikichat \
     --volume gbnc_cache:/root/.cache \

gswikichat/fetch_articles.py

Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+import os
+import re
+import json
+import requests
+import configparser
+
+from bs4 import BeautifulSoup
+
+GSWIKI_USER = os.environ.get('GSWIKI_USER')
+GSWIKI_PW = os.environ.get('GSWIKI_PW')
+
+HTML_FILTERS = {
+    'div': ['navbox', 'navbox-styles', 'spoken-wikipedia', 'noprint', 'hatnote', 'rt-tooltip', 'reflist'],
+    'span': ['mw-ext-cite-error'],
+    'table': ['noprint', 'ombox'],
+    'ol': ['breadcrumb-nav-container', 'references'],
+    'sup': ['reference']
+}
+SECTION_FILTERS = [ 'Siehe auch', 'See also', 'Weblinks', 'Anmerkungen', 'Notes' ]
+REGEX_FILTERS = {
+    'p': '→.*ersion'
+}
+
+def filterHtml(soup):
+    for figure in soup.find_all('figure'):
+        figure.decompose()
+
+    for tag, classes in HTML_FILTERS.items():
+        for className in classes:
+            for div in soup.find_all(tag, {'class': className}):
+                div.decompose()
+
+    for tag, regex in REGEX_FILTERS.items():
+        for element in soup.find_all(tag):
+            if(re.search(regex, str(element)) != None):
+                element.decompose()
+
+    return soup
+
+def fetchFromWiki(url, titles, loginRequired):
+    if(loginRequired == True):
+        session = loginToWiki(url)
+    else:
+        session = requests.Session()
+
+    articles = {}
+    for title in titles:
+        sections = fetchSections(url, title, session.cookies)
+        print("fetching {} sections for article {}".format(len(sections), title))
+        for section in [ { 'index': 0, 'line': 'Intro', 'linkAnchor': '', 'anchor': '' } ] + sections:
+            if section['index'] == '' or section['line'] in SECTION_FILTERS:
+                continue
+
+            query = {
+                'action': 'parse',
+                'page': title,
+                'format': 'json',
+                'prop': 'text',
+                'disabletoc': True,
+                'disablelimitreport': True,
+                'disableeditsection': True,
+                'section': section['index']
+            }
+            section_html = requests.get(url, params=query, cookies=session.cookies).json()['parse']['text']['*']
+            section_soup = BeautifulSoup(section_html, 'lxml')
+            articles[title + '#' + section['anchor']] = filterHtml(section_soup).get_text()
+
+    return articles
+
+
+def fetchSections(url, title, cookies=None):
+    query = {
+        'action': 'parse',
+        'page': title,
+        'format': 'json',
+        'prop': 'sections'
+    }
+    sectionsResponse = requests.get(url, params=query, cookies=cookies)
+    toplevelSections = [ section for section in sectionsResponse.json()['parse']['sections'] if section['toclevel'] == 1 ]
+    return toplevelSections
+
+def loginToWiki(url):
+    session = requests.Session()
+
+    tokenQuery = { 'action': 'query', 'meta': 'tokens', 'type': 'login', 'format': 'json' }
+    token = session.get(url, params=tokenQuery).json()['query']['tokens']['logintoken']
+    loginData = {
+        'lgname': GSWIKI_USER,
+        'lgpassword': GSWIKI_PW,
+        'lgtoken': token,
+        'action': 'login',
+        'format': 'json'
+    }
+    response = session.post(url, data=loginData, headers={ 'Content-Type': 'application/x-www-form-urlencoded' })
+    # TODO: error handling in case of login failure
+    return session
+
+def fetch_articles(toc):
+    articles = []
+    for wiki in toc:
+        url = wiki['host'] + wiki['api_path']
+        wikiArticles = fetchFromWiki(url, wiki['titles'], wiki['login'])
+
+        articles.append( {
+            'wiki': wiki['name'],
+            'url': wiki['host'],
+            'lang': wiki['lang'],
+            'articles': wikiArticles
+        } )
+    return articles
+
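
The module's single entry point is `fetch_articles(toc)`, which takes the parsed TOC list and returns one result dict per wiki. A minimal usage sketch, not part of the commit, assuming the `gswikichat` package is importable and that `GSWIKI_USER`/`GSWIKI_PW` are exported for any TOC entry with `"login": true`:

```python
import json

from gswikichat.fetch_articles import fetch_articles

# Load a table of contents such as json_input/wp-policies.json
with open('json_input/wp-policies.json', 'r') as toc_file:
    toc = json.load(toc_file)

# Each result carries the wiki name, host URL, language, and a dict mapping
# "Title#SectionAnchor" keys to the filtered plain text of that section.
for wiki in fetch_articles(toc):
    print(wiki['wiki'], wiki['lang'], len(wiki['articles']))
```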

gswikichat/vector_store_interface.py

Lines changed: 28 additions & 42 deletions
@@ -14,13 +14,17 @@
 import torch
 
 from .logger import get_logger
+from .fetch_articles import fetch_articles
+
 
 # Create logger instance from base logger config in `logger.py`
 logger = get_logger(__name__)
 
 HUGGING_FACE_HUB_TOKEN = os.environ.get('HUGGING_FACE_HUB_TOKEN')
+DOCUMENTS_TOC = os.environ.get('DOCUMENTS_TOC')
 
-# disable this line to disable the embedding cache
+# disable these lines to disable the cache
+DOCUMENTS_CACHE_FILE = '/root/.cache/gbnc_documents.json'
 EMBEDDING_CACHE_FILE = '/root/.cache/gbnc_embeddings.json'
 
 top_k = 5
@@ -32,49 +36,31 @@
     logger.info('GPU is available.')
     device = "cuda"
 
+if DOCUMENTS_CACHE_FILE and os.path.isfile(DOCUMENTS_CACHE_FILE):
+    logger.info('Loading documents from cache')
+
+    with open(DOCUMENTS_CACHE_FILE, 'r') as f_in:
+        documents = json.load(f_in)
 
-# TODO: Add the json strings as env variables
-json_dir = 'json_input'
-json_fname = 'excellent-articles_10.json'
-
-json_fpath = os.path.join(json_dir, json_fname)
-
-if os.path.isfile(json_fpath):
-    logger.info(f'Loading data from {json_fpath}')
-    with open(json_fpath, 'r') as finn:
-        json_obj = json.load(finn)
-
-    if isinstance(json_obj, dict):
-        input_documents = [
-            Document(
-                content=content_,
-                meta={"src": url_}
-            )
-            for url_, content_ in tqdm(json_obj.items())
-        ]
-    elif isinstance(json_obj, list):
-        input_documents = [
-            Document(
-                content=obj_['content'],
-                meta={'src': obj_['meta']}
-            )
-            for obj_ in tqdm(json_obj)
-        ]
 else:
-    input_documents = [
-        Document(
-            content="My name is Asra, I live in Paris.",
-            meta={"src": "doc_1"}
-        ),
-        Document(
-            content="My name is Lee, I live in Berlin.",
-            meta={"src": "doc2"}
-        ),
-        Document(
-            content="My name is Giorgio, I live in Rome.",
-            meta={"src": "doc_3"}
-        ),
-    ]
+    logger.debug("fetch documents from wiki")
+    with open(DOCUMENTS_TOC, 'r') as tocFile:
+        toc = json.load(tocFile)
+    articles = fetch_articles(toc)
+    documents = {}
+    for wiki in articles:
+        documents.update(wiki['articles'])
+    if DOCUMENTS_CACHE_FILE:
+        with open(DOCUMENTS_CACHE_FILE, 'w') as f_out:
+            json.dump(documents, f_out)
+
+input_documents = [
+    Document(
+        content=content_,
+        meta={"src": url_}
+    )
+    for url_, content_ in tqdm(documents.items())
+]
 
 splitter = DocumentSplitter(
     split_by="sentence",
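
Worth noting: fetched articles are now persisted to `/root/.cache/gbnc_documents.json`, which sits inside the `gbnc_cache` volume mounted by the README's `docker run` command, so the wikis are only re-crawled when that file is absent. A small sketch, an assumption rather than part of the commit, of clearing the cache inside the container to force a re-fetch on the next startup:

```python
import os

# Path used by vector_store_interface.py for the documents cache.
DOCUMENTS_CACHE_FILE = '/root/.cache/gbnc_documents.json'

# Removing the cached JSON makes the next startup re-fetch articles
# from the wikis listed in DOCUMENTS_TOC and rewrite the cache.
if os.path.isfile(DOCUMENTS_CACHE_FILE):
    os.remove(DOCUMENTS_CACHE_FILE)
```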

json_input/gs-wiki.json

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+[
+    {
+        "name": "GS-Wiki de",
+        "host": "https://wiki.wikimedia.de/",
+        "api_path": "/api.php",
+        "lang": "de",
+        "login": true,
+        "titles" : [
+            "Offboarding",
+            "Arbeitszeit",
+            "Beschwerdestelle_AGG",
+            "Betriebliche_Altersvorsorge",
+            "Betriebliches_Eingliederungsmanagement_(BEM)",
+            "Betriebsvereinbarung",
+            "Bildungszeit",
+            "COVID-19",
+            "Culture_Shock",
+            "Digitale_Gehaltsunterlagen_(Datev_Arbeitnehmer_Online)",
+            "Neue_Mitarbeiter",
+            "Elternzeit",
+            "Weiterbildung",
+            "Jubiläum",
+            "Krankmeldung",
+            "Mitarbeitendenjahresgespräch",
+            "Nebentätigkeit",
+            "Onboarding",
+            "Vorstandsbeschlüsse",
+            "Personio",
+            "Pme_Familienservice",
+            "Probezeit",
+            "Stellenausschreibungen",
+            "Überstunden",
+            "WMDE:Urlaub",
+            "Weiterbildung",
+            "Werkstudierende"
+        ]
+    },
+    {
+        "name": "GS-Wiki en",
+        "host": "https://wiki.wikimedia.de/",
+        "api_path": "/api.php",
+        "lang": "en",
+        "login": true,
+        "titles" : [
+            "Jubiläum/en",
+            "Betriebsvereinbarung",
+            "Company_pension_plan",
+            "COVID-19EN",
+            "Culture_Shock",
+            "Digital_Payslip",
+            "Beschwerdestelle_AGG/en",
+            "Betriebliches_Eingliederungsmanagement_(BEM)/en",
+            "Offboarding/en",
+            "Onboarding/en",
+            "Decisions_of_the_ED",
+            "Overtime",
+            "Bildungszeit/en",
+            "Parental_leave",
+            "Personio/en",
+            "Pme_Counselling_Service",
+            "Probationary_Period",
+            "Quartalsgespräche/en",
+            "Decisions_of_the_ED",
+            "Secondary_employment",
+            "Sick_leave",
+            "Fortbildung/en",
+            "Stellenausschreibungen/en",
+            "WMDE:Urlaub/en",
+            "Arbeitszeit/en",
+            "Werkstudierende/en"
+        ]
+    }
+]

json_input/wp-policies.json

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+[
+    {
+        "name": "German Wikipedia Policies",
+        "host": "https://de.wikipedia.org",
+        "api_path": "/w/api.php",
+        "lang": "de",
+        "login": false,
+        "titles" : [
+            "Wikipedia:Grundprinzipien",
+            "Wikipedia:Was_Wikipedia_nicht_ist",
+            "Wikipedia:Neutraler_Standpunkt",
+            "Wikipedia:Urheberrechte_beachten",
+            "Wikipedia:Wikiquette"
+        ]
+    },
+    {
+        "name": "English Wikipedia Policies",
+        "host": "https://en.wikipedia.org",
+        "api_path": "/w/api.php",
+        "lang": "en",
+        "login": false,
+        "titles" : [
+            "Wikipedia:Five_pillars",
+            "Wikipedia:What_Wikipedia_is_not",
+            "Wikipedia:Neutral_point_of_view",
+            "Wikipedia:Copyrights",
+            "Wikipedia:Civility"
+        ]
+    }
+]

requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -45,3 +45,5 @@ uvicorn==0.27.0
 uvloop==0.19.0
 watchfiles==0.21.0
 websockets==12.0
+beautifulsoup4==4.12.3
+lxml==5.1.0
