3 changes: 3 additions & 0 deletions .gitignore
@@ -27,3 +27,6 @@ __pycache__/

# macOS
.DS_Store

# logs
*.log
3 changes: 3 additions & 0 deletions README.md
@@ -10,6 +10,9 @@ To build and run the container locally with hot reload on python files do:
```
DOCKER_BUILDKIT=1 docker build . -t gbnc
docker run \
--env DOCUMENTS_TOC=json_input/gs-wiki.json \
--env GSWIKI_USER=<bot_username> \
--env GSWIKI_PW=<bot_password> \
--env HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \
--volume "$(pwd)/gswikichat":/workspace/gswikichat \
--volume gbnc_cache:/root/.cache \
111 changes: 111 additions & 0 deletions gswikichat/fetch_articles.py
@@ -0,0 +1,111 @@
import os
import re
import json
import requests

from bs4 import BeautifulSoup

GSWIKI_USER = os.environ.get('GSWIKI_USER')
GSWIKI_PW = os.environ.get('GSWIKI_PW')

HTML_FILTERS = {
    'div': ['navbox', 'navbox-styles', 'spoken-wikipedia', 'noprint', 'hatnote', 'rt-tooltip', 'reflist'],
    'span': ['mw-ext-cite-error'],
    'table': ['noprint', 'ombox'],
    'ol': ['breadcrumb-nav-container', 'references'],
    'sup': ['reference']
}
SECTION_FILTERS = ['Siehe auch', 'See also', 'Weblinks', 'Anmerkungen', 'Notes']
REGEX_FILTERS = {
    'p': '→.*ersion'
}

def filterHtml(soup):
    # Drop figures and any tag/class combinations that carry no article text.
    for figure in soup.find_all('figure'):
        figure.decompose()

    for tag, classes in HTML_FILTERS.items():
        for className in classes:
            for div in soup.find_all(tag, {'class': className}):
                div.decompose()

    for tag, regex in REGEX_FILTERS.items():
        for element in soup.find_all(tag):
            if re.search(regex, str(element)) is not None:
                element.decompose()

    return soup

def fetchFromWiki(url, titles, loginRequired):
    if loginRequired:
        session = loginToWiki(url)
    else:
        session = requests.Session()

    articles = {}
    for title in titles:
        sections = fetchSections(url, title, session.cookies)
        print("fetching {} sections for article {}".format(len(sections), title))
        # Prepend a pseudo-section for the article intro (section index 0).
        for section in [{'index': 0, 'line': 'Intro', 'linkAnchor': '', 'anchor': ''}] + sections:
            if section['index'] == '' or section['line'] in SECTION_FILTERS:
                continue

            query = {
                'action': 'parse',
                'page': title,
                'format': 'json',
                'prop': 'text',
                'disabletoc': True,
                'disablelimitreport': True,
                'disableeditsection': True,
                'section': section['index']
            }
            section_html = requests.get(url, params=query, cookies=session.cookies).json()['parse']['text']['*']
            section_soup = BeautifulSoup(section_html, 'lxml')
            articles[title + '#' + section['anchor']] = filterHtml(section_soup).get_text()

    return articles


def fetchSections(url, title, cookies=None):
    query = {
        'action': 'parse',
        'page': title,
        'format': 'json',
        'prop': 'sections'
    }
    sectionsResponse = requests.get(url, params=query, cookies=cookies)
    toplevelSections = [section for section in sectionsResponse.json()['parse']['sections'] if section['toclevel'] == 1]
    return toplevelSections

def loginToWiki(url):
    session = requests.Session()

    tokenQuery = {'action': 'query', 'meta': 'tokens', 'type': 'login', 'format': 'json'}
    token = session.get(url, params=tokenQuery).json()['query']['tokens']['logintoken']
    loginData = {
        'lgname': GSWIKI_USER,
        'lgpassword': GSWIKI_PW,
        'lgtoken': token,
        'action': 'login',
        'format': 'json'
    }
    response = session.post(url, data=loginData, headers={'Content-Type': 'application/x-www-form-urlencoded'})
    # TODO: error handling in case of login failure
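    # One possible way to address the TODO above (a sketch, not part of this change):
    # the MediaWiki Action API reports the outcome under response.json()['login']['result'],
    # so a minimal check could look like:
    #
    #     result = response.json().get('login', {}).get('result')
    #     if result != 'Success':
    #         raise RuntimeError(f"Wiki login failed for {GSWIKI_USER}: {result}")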
    return session

def fetch_articles(toc):
    articles = []
    for wiki in toc:
        url = wiki['host'] + wiki['api_path']
        wikiArticles = fetchFromWiki(url, wiki['titles'], wiki['login'])

        articles.append({
            'wiki': wiki['name'],
            'url': wiki['host'],
            'lang': wiki['lang'],
            'articles': wikiArticles
        })
    return articles
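
For reference, a minimal sketch of how the new module could be exercised on its own (not part of this diff). It assumes the repository root as the working directory and uses the `json_input/wp-policies.json` TOC, which needs no login; the `__main__` wrapper and the hard-coded path are illustrative only:

```
import json

from gswikichat.fetch_articles import fetch_articles

if __name__ == '__main__':
    # Load a table-of-contents file in the same shape as json_input/wp-policies.json.
    with open('json_input/wp-policies.json', 'r') as toc_file:
        toc = json.load(toc_file)

    # Each result entry carries the wiki metadata plus a
    # {"Title#Anchor": plain_text} mapping of the fetched sections.
    for wiki in fetch_articles(toc):
        print(wiki['wiki'], len(wiki['articles']), 'sections')
```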

70 changes: 28 additions & 42 deletions gswikichat/vector_store_interface.py
@@ -14,13 +14,17 @@
import torch

from .logger import get_logger
from .fetch_articles import fetch_articles


# Create logger instance from base logger config in `logger.py`
logger = get_logger(__name__)

HUGGING_FACE_HUB_TOKEN = os.environ.get('HUGGING_FACE_HUB_TOKEN')
DOCUMENTS_TOC = os.environ.get('DOCUMENTS_TOC')

# disable this line to disable the embedding cache
# disable these lines to disable the cache
DOCUMENTS_CACHE_FILE = '/root/.cache/gbnc_documents.json'
EMBEDDING_CACHE_FILE = '/root/.cache/gbnc_embeddings.json'

top_k = 5
@@ -32,49 +36,31 @@
logger.info('GPU is available.')
device = "cuda"

if DOCUMENTS_CACHE_FILE and os.path.isfile(DOCUMENTS_CACHE_FILE):
logger.info('Loading documents from cache')

with open(DOCUMENTS_CACHE_FILE, 'r') as f_in:
documents = json.load(f_in)

# TODO: Add the json strings as env variables
json_dir = 'json_input'
json_fname = 'excellent-articles_10.json'

json_fpath = os.path.join(json_dir, json_fname)

if os.path.isfile(json_fpath):
logger.info(f'Loading data from {json_fpath}')
with open(json_fpath, 'r') as finn:
json_obj = json.load(finn)

if isinstance(json_obj, dict):
input_documents = [
Document(
content=content_,
meta={"src": url_}
)
for url_, content_ in tqdm(json_obj.items())
]
elif isinstance(json_obj, list):
input_documents = [
Document(
content=obj_['content'],
meta={'src': obj_['meta']}
)
for obj_ in tqdm(json_obj)
]
else:
input_documents = [
Document(
content="My name is Asra, I live in Paris.",
meta={"src": "doc_1"}
),
Document(
content="My name is Lee, I live in Berlin.",
meta={"src": "doc2"}
),
Document(
content="My name is Giorgio, I live in Rome.",
meta={"src": "doc_3"}
),
]
logger.debug("fetch documents from wiki")
with open(DOCUMENTS_TOC, 'r') as tocFile:
toc = json.load(tocFile)
articles = fetch_articles(toc)
documents = {}
for wiki in articles:
documents.update(wiki['articles'])
if DOCUMENTS_CACHE_FILE:
with open(DOCUMENTS_CACHE_FILE, 'w') as f_out:
json.dump(documents, f_out)

input_documents = [
Document(
content=content_,
meta={"src": url_}
)
for url_, content_ in tqdm(documents.items())
]

splitter = DocumentSplitter(
split_by="sentence",
73 changes: 73 additions & 0 deletions json_input/gs-wiki.json
@@ -0,0 +1,73 @@
[
{
"name": "GS-Wiki de",
"host": "https://wiki.wikimedia.de/",
"api_path": "/api.php",
"lang": "de",
"login": true,
"titles" : [
"Offboarding",
"Arbeitszeit",
"Beschwerdestelle_AGG",
"Betriebliche_Altersvorsorge",
"Betriebliches_Eingliederungsmanagement_(BEM)",
"Betriebsvereinbarung",
"Bildungszeit",
"COVID-19",
"Culture_Shock",
"Digitale_Gehaltsunterlagen_(Datev_Arbeitnehmer_Online)",
"Neue_Mitarbeiter",
"Elternzeit",
"Weiterbildung",
"Jubiläum",
"Krankmeldung",
"Mitarbeitendenjahresgespräch",
"Nebentätigkeit",
"Onboarding",
"Vorstandsbeschlüsse",
"Personio",
"Pme_Familienservice",
"Probezeit",
"Stellenausschreibungen",
"Überstunden",
"WMDE:Urlaub",
"Weiterbildung",
"Werkstudierende"
]
},
{
"name": "GS-Wiki en",
"host": "https://wiki.wikimedia.de/",
"api_path": "/api.php",
"lang": "en",
"login": true,
"titles" : [
"Jubiläum/en",
"Betriebsvereinbarung",
"Company_pension_plan",
"COVID-19EN",
"Culture_Shock",
"Digital_Payslip",
"Beschwerdestelle_AGG/en",
"Betriebliches_Eingliederungsmanagement_(BEM)/en",
"Offboarding/en",
"Onboarding/en",
"Decisions_of_the_ED",
"Overtime",
"Bildungszeit/en",
"Parental_leave",
"Personio/en",
"Pme_Counselling_Service",
"Probationary_Period",
"Quartalsgespräche/en",
"Decisions_of_the_ED",
"Secondary_employment",
"Sick_leave",
"Fortbildung/en",
"Stellenausschreibungen/en",
"WMDE:Urlaub/en",
"Arbeitszeit/en",
"Werkstudierende/en"
]
}
]
30 changes: 30 additions & 0 deletions json_input/wp-policies.json
@@ -0,0 +1,30 @@
[
{
"name": "German Wikipedia Policies",
"host": "https://de.wikipedia.org",
"api_path": "/w/api.php",
"lang": "de",
"login": false,
"titles" : [
"Wikipedia:Grundprinzipien",
"Wikipedia:Was_Wikipedia_nicht_ist",
"Wikipedia:Neutraler_Standpunkt",
"Wikipedia:Urheberrechte_beachten",
"Wikipedia:Wikiquette"
]
},
{
"name": "English Wikipedia Policies",
"host": "https://en.wikipedia.org",
"api_path": "/w/api.php",
"lang": "en",
"login": false,
"titles" : [
"Wikipedia:Five_pillars",
"Wikipedia:What_Wikipedia_is_not",
"Wikipedia:Neutral_point_of_view",
"Wikipedia:Copyrights",
"Wikipedia:Civility"
]
}
]
2 changes: 2 additions & 0 deletions requirements.txt
@@ -45,3 +45,5 @@ uvicorn==0.27.0
uvloop==0.19.0
watchfiles==0.21.0
websockets==12.0
beautifulsoup4==4.12.3
lxml==5.1.0