diff --git a/.gitignore b/.gitignore
index bc7d212..c41a78a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,6 @@ __pycache__/
 
 # macOS
 .DS_Store
+
+# logs
+*.log
diff --git a/README.md b/README.md
index 6f04ed3..ce15ae2 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,9 @@ To build and run the container locally with hot reload on python files do:
 ```
 DOCKER_BUILDKIT=1 docker build . -t gbnc
 docker run \
+    --env DOCUMENTS_TOC=json_input/gs-wiki.json \
+    --env GSWIKI_USER= \
+    --env GSWIKI_PW= \
     --env HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \
     --volume "$(pwd)/gswikichat":/workspace/gswikichat \
     --volume gbnc_cache:/root/.cache \
diff --git a/gswikichat/fetch_articles.py b/gswikichat/fetch_articles.py
new file mode 100644
index 0000000..8a6c3df
--- /dev/null
+++ b/gswikichat/fetch_articles.py
@@ -0,0 +1,111 @@
+import os
+import re
+import json
+import requests
+import configparser
+
+from bs4 import BeautifulSoup
+
+GSWIKI_USER = os.environ.get('GSWIKI_USER')
+GSWIKI_PW = os.environ.get('GSWIKI_PW')
+
+HTML_FILTERS = {
+    'div': ['navbox','navbox-styles','spoken-wikipedia', 'noprint', 'hatnote', 'rt-tooltip', 'reflist'],
+    'span': ['mw-ext-cite-error'],
+    'table': ['noprint','ombox'],
+    'ol': ['breadcrumb-nav-container', 'references'],
+    'sup': ['reference']
+}
+SECTION_FILTERS = [ 'Siehe auch', 'See also', 'Weblinks', 'Anmerkungen', 'Notes' ]
+REGEX_FILTERS = {
+    'p': '→.*ersion'
+}
+
+def filterHtml(soup):
+    for figure in soup.find_all('figure'):
+        figure.decompose()
+
+    for tag, classes in HTML_FILTERS.items():
+        for className in classes:
+            for div in soup.find_all(tag, {'class': className}):
+                div.decompose()
+
+    for tag, regex in REGEX_FILTERS.items():
+        for element in soup.find_all(tag):
+            if(re.search(regex, str(element)) != None):
+                element.decompose()
+
+    return soup
+
+def fetchFromWiki(url, titles, loginRequired):
+    if(loginRequired == True):
+        session = loginToWiki(url)
+    else:
+        session = requests.Session()
+
+    articles = {}
+    for title in titles:
+        sections = fetchSections(url, title, session.cookies)
+        print("fetching {} sections for article {}".format(len(sections), title))
+        for section in [ { 'index' : 0, 'line': 'Intro', 'linkAnchor' : '', 'anchor' : '' } ] + sections :
+            if section['index'] == '' or section['line'] in SECTION_FILTERS:
+                continue
+
+            query = {
+                'action': 'parse',
+                'page': title,
+                'format': 'json',
+                'prop':'text',
+                'disabletoc': True,
+                'disablelimitreport': True,
+                'disableeditsection': True,
+                'section': section['index']
+            }
+            section_html = requests.get(url,params=query,cookies=session.cookies).json()['parse']['text']['*']
+            section_soup = BeautifulSoup(section_html, 'lxml')
+            articles[title + '#' + section['anchor']] = filterHtml(section_soup).get_text()
+
+    return articles
+
+
+def fetchSections(url, title, cookies=None):
+    query = {
+        'action':'parse',
+        'page':title,
+        'format':'json',
+        'prop':'sections'
+    }
+    sectionsResponse = requests.get(url,params=query, cookies=cookies)
+    toplevelSections = [ section for section in sectionsResponse.json()['parse']['sections'] if section['toclevel'] == 1 ]
+    return toplevelSections
+
+def loginToWiki(url):
+    session = requests.Session()
+
+    tokenQuery = { 'action': 'query', 'meta': 'tokens', 'type': 'login', 'format': 'json' }
+    token = session.get(url, params=tokenQuery).json()['query']['tokens']['logintoken']
+    loginData = {
+        'lgname': GSWIKI_USER,
+        'lgpassword': GSWIKI_PW,
+        'lgtoken': token,
+        'action': 'login',
+        'format': 'json'
+    }
+    response = session.post(url, data=loginData, headers={ 'Content-Type' : 'application/x-www-form-urlencoded' })
+    #TODO: error handling in case of login failure
+    return session
+
+def fetch_articles(toc):
+    articles = []
+    for wiki in toc:
+        url = wiki['host'] + wiki['api_path']
+        wikiArticles = fetchFromWiki(url, wiki['titles'], wiki['login'])
+
+        articles.append( {
+            'wiki': wiki['name'],
+            'url': wiki['host'],
+            'lang': wiki['lang'],
+            'articles': wikiArticles
+        } )
+    return articles
+
diff --git a/gswikichat/vector_store_interface.py b/gswikichat/vector_store_interface.py
index 95d52db..7febf39 100644
--- a/gswikichat/vector_store_interface.py
+++ b/gswikichat/vector_store_interface.py
@@ -14,13 +14,17 @@ import torch
 
 from .logger import get_logger
 
+from .fetch_articles import fetch_articles
+
 # Create logger instance from base logger config in `logger.py`
 logger = get_logger(__name__)
 
 HUGGING_FACE_HUB_TOKEN = os.environ.get('HUGGING_FACE_HUB_TOKEN')
+DOCUMENTS_TOC = os.environ.get('DOCUMENTS_TOC')
 
-# disable this line to disable the embedding cache
+# disable these lines to disable the cache
+DOCUMENTS_CACHE_FILE = '/root/.cache/gbnc_documents.json'
 EMBEDDING_CACHE_FILE = '/root/.cache/gbnc_embeddings.json'
 
 top_k = 5
 
@@ -32,49 +36,31 @@
     logger.info('GPU is available.')
     device = "cuda"
 
+if DOCUMENTS_CACHE_FILE and os.path.isfile(DOCUMENTS_CACHE_FILE):
+    logger.info('Loading documents from cache')
+
+    with open(DOCUMENTS_CACHE_FILE, 'r') as f_in:
+        documents = json.load(f_in)
 
-# TODO: Add the json strings as env variables
-json_dir = 'json_input'
-json_fname = 'excellent-articles_10.json'
-
-json_fpath = os.path.join(json_dir, json_fname)
-
-if os.path.isfile(json_fpath):
-    logger.info(f'Loading data from {json_fpath}')
-    with open(json_fpath, 'r') as finn:
-        json_obj = json.load(finn)
-
-    if isinstance(json_obj, dict):
-        input_documents = [
-            Document(
-                content=content_,
-                meta={"src": url_}
-            )
-            for url_, content_ in tqdm(json_obj.items())
-        ]
-    elif isinstance(json_obj, list):
-        input_documents = [
-            Document(
-                content=obj_['content'],
-                meta={'src': obj_['meta']}
-            )
-            for obj_ in tqdm(json_obj)
-        ]
 else:
-    input_documents = [
-        Document(
-            content="My name is Asra, I live in Paris.",
-            meta={"src": "doc_1"}
-        ),
-        Document(
-            content="My name is Lee, I live in Berlin.",
-            meta={"src": "doc2"}
-        ),
-        Document(
-            content="My name is Giorgio, I live in Rome.",
-            meta={"src": "doc_3"}
-        ),
-    ]
+    logger.debug("fetch documents from wiki")
+    with open(DOCUMENTS_TOC, 'r') as tocFile:
+        toc = json.load(tocFile)
+    articles = fetch_articles(toc)
+    documents = {}
+    for wiki in articles:
+        documents.update(wiki['articles'])
+    if DOCUMENTS_CACHE_FILE:
+        with open(DOCUMENTS_CACHE_FILE, 'w') as f_out:
+            json.dump(documents, f_out)
+
+input_documents = [
+    Document(
+        content=content_,
+        meta={"src": url_}
+    )
+    for url_, content_ in tqdm(documents.items())
+]
 
 splitter = DocumentSplitter(
     split_by="sentence",
diff --git a/json_input/gs-wiki.json b/json_input/gs-wiki.json
new file mode 100644
index 0000000..25630df
--- /dev/null
+++ b/json_input/gs-wiki.json
@@ -0,0 +1,73 @@
+[
+    {
+        "name": "GS-Wiki de",
+        "host": "https://wiki.wikimedia.de/",
+        "api_path": "/api.php",
+        "lang": "de",
+        "login": true,
+        "titles" : [
+            "Offboarding",
+            "Arbeitszeit",
+            "Beschwerdestelle_AGG",
+            "Betriebliche_Altersvorsorge",
+            "Betriebliches_Eingliederungsmanagement_(BEM)",
+            "Betriebsvereinbarung",
+            "Bildungszeit",
+            "COVID-19",
+            "Culture_Shock",
+            "Digitale_Gehaltsunterlagen_(Datev_Arbeitnehmer_Online)",
+            "Neue_Mitarbeiter",
+            "Elternzeit",
"Weiterbildung", + "Jubiläum", + "Krankmeldung", + "Mitarbeitendenjahresgespräch", + "Nebentätigkeit", + "Onboarding", + "Vorstandsbeschlüsse", + "Personio", + "Pme_Familienservice", + "Probezeit", + "Stellenausschreibungen", + "Überstunden", + "WMDE:Urlaub", + "Weiterbildung", + "Werkstudierende" + ] + }, + { + "name": "GS-Wiki en", + "host": "https://wiki.wikimedia.de/", + "api_path": "/api.php", + "lang": "en", + "login": true, + "titles" : [ + "Jubiläum/en", + "Betriebsvereinbarung", + "Company_pension_plan", + "COVID-19EN", + "Culture_Shock", + "Digital_Payslip", + "Beschwerdestelle_AGG/en", + "Betriebliches_Eingliederungsmanagement_(BEM)/en", + "Offboarding/en", + "Onboarding/en", + "Decisions_of_the_ED", + "Overtime", + "Bildungszeit/en", + "Parental_leave", + "Personio/en", + "Pme_Counselling_Service", + "Probationary_Period", + "Quartalsgespräche/en", + "Decisions_of_the_ED", + "Secondary_employment", + "Sick_leave", + "Fortbildung/en", + "Stellenausschreibungen/en", + "WMDE:Urlaub/en", + "Arbeitszeit/en", + "Werkstudierende/en" + ] + } +] \ No newline at end of file diff --git a/json_input/wp-policies.json b/json_input/wp-policies.json new file mode 100644 index 0000000..a0cfcd0 --- /dev/null +++ b/json_input/wp-policies.json @@ -0,0 +1,30 @@ +[ + { + "name": "German Wikipedia Policies", + "host": "https://de.wikipedia.org", + "api_path": "/w/api.php", + "lang": "de", + "login": false, + "titles" : [ + "Wikipedia:Grundprinzipien", + "Wikipedia:Was_Wikipedia_nicht_ist", + "Wikipedia:Neutraler_Standpunkt", + "Wikipedia:Urheberrechte_beachten", + "Wikipedia:Wikiquette" + ] + }, + { + "name": "English Wikipedia Policies", + "host": "https://en.wikipedia.org", + "api_path": "/w/api.php", + "lang": "en", + "login": false, + "titles" : [ + "Wikipedia:Five_pillars", + "Wikipedia:What_Wikipedia_is_not", + "Wikipedia:Neutral_point_of_view", + "Wikipedia:Copyrights", + "Wikipedia:Civility" + ] + } +] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 723011a..1cef685 100644 --- a/requirements.txt +++ b/requirements.txt @@ -45,3 +45,5 @@ uvicorn==0.27.0 uvloop==0.19.0 watchfiles==0.21.0 websockets==12.0 +beautifulsoup4==4.12.3 +lxml==5.1.0 \ No newline at end of file