
Commit 57ec054

Silvan-WMDE authored and rti committed
auto fetch gs-wiki articles
1 parent 4100064 commit 57ec054

7 files changed: 250 additions, 42 deletions
.gitignore

Lines changed: 3 additions & 0 deletions
@@ -27,3 +27,6 @@ __pycache__/
 
 # macOS
 .DS_Store
+
+# logs
+*.log

README.md

Lines changed: 3 additions & 0 deletions
@@ -10,6 +10,9 @@ To build and run the container locally with hot reload on python files do:
 ```
 DOCKER_BUILDKIT=1 docker build . -t gbnc
 docker run \
+    --env DOCUMENTS_TOC=json_input/gs-wiki.json \
+    --env GSWIKI_USER=<bot_username> \
+    --env GSWIKI_PW=<bot_password> \
     --env HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \
     --volume "$(pwd)/gswikichat":/workspace/gswikichat \
     --volume gbnc_cache:/root/.cache \

gswikichat/fetch_articles.py

Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+import os
+import re
+import json
+import requests
+import configparser
+
+from bs4 import BeautifulSoup
+
+GSWIKI_USER = os.environ.get('GSWIKI_USER')
+GSWIKI_PW = os.environ.get('GSWIKI_PW')
+
+HTML_FILTERS = {
+    'div': ['navbox', 'navbox-styles', 'spoken-wikipedia', 'noprint', 'hatnote', 'rt-tooltip', 'reflist'],
+    'span': ['mw-ext-cite-error'],
+    'table': ['noprint', 'ombox'],
+    'ol': ['breadcrumb-nav-container', 'references'],
+    'sup': ['reference']
+}
+SECTION_FILTERS = [ 'Siehe auch', 'See also', 'Weblinks', 'Anmerkungen', 'Notes' ]
+REGEX_FILTERS = {
+    'p': '→.*ersion'
+}
+
+def filterHtml(soup):
+    for figure in soup.find_all('figure'):
+        figure.decompose()
+
+    for tag, classes in HTML_FILTERS.items():
+        for className in classes:
+            for div in soup.find_all(tag, {'class': className}):
+                div.decompose()
+
+    for tag, regex in REGEX_FILTERS.items():
+        for element in soup.find_all(tag):
+            if(re.search(regex, str(element)) != None):
+                element.decompose()
+
+    return soup
+
+def fetchFromWiki(url, titles, loginRequired):
+    if(loginRequired == True):
+        session = loginToWiki(url)
+    else:
+        session = requests.Session()
+
+    articles = {}
+    for title in titles:
+        sections = fetchSections(url, title, session.cookies)
+        print("fetching {} sections for article {}".format(len(sections), title))
+        for section in [ { 'index': 0, 'line': 'Intro', 'linkAnchor': '', 'anchor': '' } ] + sections:
+            if section['index'] == '' or section['line'] in SECTION_FILTERS:
+                continue
+
+            query = {
+                'action': 'parse',
+                'page': title,
+                'format': 'json',
+                'prop': 'text',
+                'disabletoc': True,
+                'disablelimitreport': True,
+                'disableeditsection': True,
+                'section': section['index']
+            }
+            section_html = requests.get(url, params=query, cookies=session.cookies).json()['parse']['text']['*']
+            section_soup = BeautifulSoup(section_html, 'lxml')
+            articles[title + '#' + section['anchor']] = filterHtml(section_soup).get_text()
+
+    return articles
+
+
+def fetchSections(url, title, cookies=None):
+    query = {
+        'action': 'parse',
+        'page': title,
+        'format': 'json',
+        'prop': 'sections'
+    }
+    sectionsResponse = requests.get(url, params=query, cookies=cookies)
+    toplevelSections = [ section for section in sectionsResponse.json()['parse']['sections'] if section['toclevel'] == 1 ]
+    return toplevelSections
+
+def loginToWiki(url):
+    session = requests.Session()
+
+    tokenQuery = { 'action': 'query', 'meta': 'tokens', 'type': 'login', 'format': 'json' }
+    token = session.get(url, params=tokenQuery).json()['query']['tokens']['logintoken']
+    loginData = {
+        'lgname': GSWIKI_USER,
+        'lgpassword': GSWIKI_PW,
+        'lgtoken': token,
+        'action': 'login',
+        'format': 'json'
+    }
+    response = session.post(url, data=loginData, headers={ 'Content-Type': 'application/x-www-form-urlencoded' })
+    # TODO: error handling in case of login failure
+    return session
+
+def fetch_articles(toc):
+    articles = []
+    for wiki in toc:
+        url = wiki['host'] + wiki['api_path']
+        wikiArticles = fetchFromWiki(url, wiki['titles'], wiki['login'])
+
+        articles.append( {
+            'wiki': wiki['name'],
+            'url': wiki['host'],
+            'lang': wiki['lang'],
+            'articles': wikiArticles
+        } )
+    return articles
+
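
The module's single entry point is `fetch_articles(toc)`, which takes the parsed TOC list and returns one result dict per wiki. A minimal usage sketch, not part of the commit, assuming the `gswikichat` package is importable and that `GSWIKI_USER`/`GSWIKI_PW` are exported for any TOC entry with `"login": true`:

```python
import json

from gswikichat.fetch_articles import fetch_articles

# Load a table of contents such as json_input/wp-policies.json
with open('json_input/wp-policies.json', 'r') as toc_file:
    toc = json.load(toc_file)

# Each result carries the wiki name, host URL, language, and a dict mapping
# "Title#SectionAnchor" keys to the filtered plain text of that section.
for wiki in fetch_articles(toc):
    print(wiki['wiki'], wiki['lang'], len(wiki['articles']))
```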

gswikichat/vector_store_interface.py

Lines changed: 28 additions & 42 deletions
@@ -14,13 +14,17 @@
 import torch
 
 from .logger import get_logger
+from .fetch_articles import fetch_articles
+
 
 # Create logger instance from base logger config in `logger.py`
 logger = get_logger(__name__)
 
 HUGGING_FACE_HUB_TOKEN = os.environ.get('HUGGING_FACE_HUB_TOKEN')
+DOCUMENTS_TOC = os.environ.get('DOCUMENTS_TOC')
 
-# disable this line to disable the embedding cache
+# disable these lines to disable the cache
+DOCUMENTS_CACHE_FILE = '/root/.cache/gbnc_documents.json'
 EMBEDDING_CACHE_FILE = '/root/.cache/gbnc_embeddings.json'
 
 top_k = 5
@@ -32,49 +36,31 @@
     logger.info('GPU is available.')
     device = "cuda"
 
+if DOCUMENTS_CACHE_FILE and os.path.isfile(DOCUMENTS_CACHE_FILE):
+    logger.info('Loading documents from cache')
+
+    with open(DOCUMENTS_CACHE_FILE, 'r') as f_in:
+        documents = json.load(f_in)
 
-# TODO: Add the json strings as env variables
-json_dir = 'json_input'
-json_fname = 'excellent-articles_10.json'
-
-json_fpath = os.path.join(json_dir, json_fname)
-
-if os.path.isfile(json_fpath):
-    logger.info(f'Loading data from {json_fpath}')
-    with open(json_fpath, 'r') as finn:
-        json_obj = json.load(finn)
-
-    if isinstance(json_obj, dict):
-        input_documents = [
-            Document(
-                content=content_,
-                meta={"src": url_}
-            )
-            for url_, content_ in tqdm(json_obj.items())
-        ]
-    elif isinstance(json_obj, list):
-        input_documents = [
-            Document(
-                content=obj_['content'],
-                meta={'src': obj_['meta']}
-            )
-            for obj_ in tqdm(json_obj)
-        ]
 else:
-    input_documents = [
-        Document(
-            content="My name is Asra, I live in Paris.",
-            meta={"src": "doc_1"}
-        ),
-        Document(
-            content="My name is Lee, I live in Berlin.",
-            meta={"src": "doc2"}
-        ),
-        Document(
-            content="My name is Giorgio, I live in Rome.",
-            meta={"src": "doc_3"}
-        ),
-    ]
+    logger.debug("fetch documents from wiki")
+    with open(DOCUMENTS_TOC, 'r') as tocFile:
+        toc = json.load(tocFile)
+    articles = fetch_articles(toc)
+    documents = {}
+    for wiki in articles:
+        documents.update(wiki['articles'])
+    if DOCUMENTS_CACHE_FILE:
+        with open(DOCUMENTS_CACHE_FILE, 'w') as f_out:
+            json.dump(documents, f_out)
+
+input_documents = [
+    Document(
+        content=content_,
+        meta={"src": url_}
+    )
+    for url_, content_ in tqdm(documents.items())
+]
 
 splitter = DocumentSplitter(
     split_by="sentence",
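
Worth noting: fetched articles are now persisted to `/root/.cache/gbnc_documents.json`, which sits inside the `gbnc_cache` volume mounted by the README's `docker run` command, so the wikis are only re-crawled when that file is absent. A small sketch, an assumption rather than part of the commit, of clearing the cache inside the container to force a re-fetch on the next startup:

```python
import os

# Path used by vector_store_interface.py for the documents cache.
DOCUMENTS_CACHE_FILE = '/root/.cache/gbnc_documents.json'

# Removing the cached JSON makes the next startup re-fetch articles
# from the wikis listed in DOCUMENTS_TOC and rewrite the cache.
if os.path.isfile(DOCUMENTS_CACHE_FILE):
    os.remove(DOCUMENTS_CACHE_FILE)
```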

json_input/gs-wiki.json

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+[
+    {
+        "name": "GS-Wiki de",
+        "host": "https://wiki.wikimedia.de/",
+        "api_path": "/api.php",
+        "lang": "de",
+        "login": true,
+        "titles" : [
+            "Offboarding",
+            "Arbeitszeit",
+            "Beschwerdestelle_AGG",
+            "Betriebliche_Altersvorsorge",
+            "Betriebliches_Eingliederungsmanagement_(BEM)",
+            "Betriebsvereinbarung",
+            "Bildungszeit",
+            "COVID-19",
+            "Culture_Shock",
+            "Digitale_Gehaltsunterlagen_(Datev_Arbeitnehmer_Online)",
+            "Neue_Mitarbeiter",
+            "Elternzeit",
+            "Weiterbildung",
+            "Jubiläum",
+            "Krankmeldung",
+            "Mitarbeitendenjahresgespräch",
+            "Nebentätigkeit",
+            "Onboarding",
+            "Vorstandsbeschlüsse",
+            "Personio",
+            "Pme_Familienservice",
+            "Probezeit",
+            "Stellenausschreibungen",
+            "Überstunden",
+            "WMDE:Urlaub",
+            "Weiterbildung",
+            "Werkstudierende"
+        ]
+    },
+    {
+        "name": "GS-Wiki en",
+        "host": "https://wiki.wikimedia.de/",
+        "api_path": "/api.php",
+        "lang": "en",
+        "login": true,
+        "titles" : [
+            "Jubiläum/en",
+            "Betriebsvereinbarung",
+            "Company_pension_plan",
+            "COVID-19EN",
+            "Culture_Shock",
+            "Digital_Payslip",
+            "Beschwerdestelle_AGG/en",
+            "Betriebliches_Eingliederungsmanagement_(BEM)/en",
+            "Offboarding/en",
+            "Onboarding/en",
+            "Decisions_of_the_ED",
+            "Overtime",
+            "Bildungszeit/en",
+            "Parental_leave",
+            "Personio/en",
+            "Pme_Counselling_Service",
+            "Probationary_Period",
+            "Quartalsgespräche/en",
+            "Decisions_of_the_ED",
+            "Secondary_employment",
+            "Sick_leave",
+            "Fortbildung/en",
+            "Stellenausschreibungen/en",
+            "WMDE:Urlaub/en",
+            "Arbeitszeit/en",
+            "Werkstudierende/en"
+        ]
+    }
+]

json_input/wp-policies.json

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+[
+    {
+        "name": "German Wikipedia Policies",
+        "host": "https://de.wikipedia.org",
+        "api_path": "/w/api.php",
+        "lang": "de",
+        "login": false,
+        "titles" : [
+            "Wikipedia:Grundprinzipien",
+            "Wikipedia:Was_Wikipedia_nicht_ist",
+            "Wikipedia:Neutraler_Standpunkt",
+            "Wikipedia:Urheberrechte_beachten",
+            "Wikipedia:Wikiquette"
+        ]
+    },
+    {
+        "name": "English Wikipedia Policies",
+        "host": "https://en.wikipedia.org",
+        "api_path": "/w/api.php",
+        "lang": "en",
+        "login": false,
+        "titles" : [
+            "Wikipedia:Five_pillars",
+            "Wikipedia:What_Wikipedia_is_not",
+            "Wikipedia:Neutral_point_of_view",
+            "Wikipedia:Copyrights",
+            "Wikipedia:Civility"
+        ]
+    }
+]

requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -45,3 +45,5 @@ uvicorn==0.27.0
 uvloop==0.19.0
 watchfiles==0.21.0
 websockets==12.0
+beautifulsoup4==4.12.3
+lxml==5.1.0
