diff --git a/.gitignore b/.gitignore
index bc7d212..c41a78a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,6 @@ __pycache__/
 
 # macOS
 .DS_Store
+
+# logs
+*.log
diff --git a/Dockerfile b/Dockerfile
index dc8d862..d9eeac1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,18 +12,31 @@ FROM $CUDA_FROM
 
 ENV PATH="/usr/local/cuda/bin:${PATH}"
 
+# Install packages non-interactively
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Force a config for the tzdata package, otherwise it prompts interactively during install
+RUN ln -fs /usr/share/zoneinfo/UTC /etc/localtime
+
 # Install essential packages from ubuntu repository
 RUN apt-get update -y && \
     apt-get install -y --no-install-recommends openssh-server openssh-client git git-lfs && \
     apt-get install -y curl && \
     apt-get install -y python3 python3-pip python3-venv && \
+    apt-get install -y postgresql-14 && \
+    apt-get install -y jq && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
+# Install the pgvecto.rs extension for Postgres
+RUN curl -L -O https://github.com/tensorchord/pgvecto.rs/releases/download/v0.2.0/vectors-pg14_0.2.0_amd64.deb
+RUN dpkg -i vectors-pg14_0.2.0_amd64.deb
+
+
 # Install node from upstream, ubuntu packages are too old
-RUN curl -sL https://deb.nodesource.com/setup_18.x | bash
-RUN apt-get install -y nodejs && \
+RUN curl -sL https://deb.nodesource.com/setup_18.x | bash && \
+    apt-get install -y nodejs && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
@@ -53,11 +66,13 @@ ARG OLLAMA_URL=http://localhost:11434
 
 ENV OLLAMA_MODEL_NAME=${OLLAMA_MODEL_NAME}
 ENV OLLAMA_URL=${OLLAMA_URL}
 
+# TODO: cache path
 RUN ollama serve & while ! curl ${OLLAMA_URL}; do sleep 1; done; ollama pull $OLLAMA_MODEL_NAME
 
 # Load sentence-transformers model once in order to cache it in the image
 # TODO: ARG / ENV for embedder model
+# TODO: SENTENCE_TRANSFORMERS_HOME for cache path
 RUN echo "from haystack.components.embedders import SentenceTransformersDocumentEmbedder\nSentenceTransformersDocumentEmbedder(model='svalabs/german-gpl-adapted-covid').warm_up()" | python3
diff --git a/README.md b/README.md
index 6f04ed3..d3a64a6 100644
--- a/README.md
+++ b/README.md
@@ -10,9 +10,7 @@ To build and run the container locally with hot reload on python files do:
 ```
 DOCKER_BUILDKIT=1 docker build . -t gbnc
 docker run \
-  --env HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \
   --volume "$(pwd)/gswikichat":/workspace/gswikichat \
-  --volume gbnc_cache:/root/.cache \
   --publish 8000:8000 \
   --rm \
   --interactive \
@@ -22,6 +20,21 @@ docker run \
 ```
 Point your browser to http://localhost:8000/ and use the frontend.
 
+To fetch articles from the wikis defined in a `toc.json` file, run:
+```
+$ docker exec -it gbnc bash
+# export WIKI_USER=
+# export WIKI_PW=
+# python3 -m gswikichat.fetch_articles toc.json > articles.json
+```
+
+To import the fetched data into the database, run:
+```
+$ docker exec -it gbnc bash
+# cat json_input/excellent-articles_10.json | jq 'to_entries | map({content: .value, meta: {source: .key}})' > import.json
+# python3 -m gswikichat.db import.json
+```
+
 ### Runpod.io
 The container works on [runpod.io](https://www.runpod.io/) GPU instances. A [template is available here](https://runpod.io/gsc?template=0w8z55rf19&ref=yfvyfa0s).
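For reference, a minimal `toc.json` for the fetch step in the README might look like this (a sketch; the field names follow the structure documented in `gswikichat/fetch_articles.py`, and the wiki values are placeholders):

```json
[
  {
    "name": "Name of the wiki",
    "host": "https://somewiki.org",
    "api_path": "/w/api.php",
    "lang": "en",
    "login": false,
    "titles": ["Namespace:Page1", "Namespace:Page2"]
  }
]
```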
diff --git a/frontend/src/components/field/FieldAnswer.vue b/frontend/src/components/field/FieldAnswer.vue
index 99afac7..04f1f95 100644
--- a/frontend/src/components/field/FieldAnswer.vue
+++ b/frontend/src/components/field/FieldAnswer.vue
@@ -17,8 +17,8 @@
             class="text-sm cursor-pointer text-light-distinct-text dark:text-dark-distinct-text"
           >
-            {{ $t('source') }} ({{ s.score.toFixed(1) }}/5):
-            {{ s.src }}
+            {{ $t('source') }} ({{ s.score.toFixed(1) }}):
+            {{ s.source }}
           </a>
           <p>
             {{ s.content }}
           </p>
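For context on the rename: the frontend `Source` type below mirrors the `sources` entries that `/api` now returns. An illustrative payload (values invented):

```json
{
  "answer": "Berlin ist die Hauptstadt Deutschlands.",
  "sources": [
    {
      "id": "3a0f2c",
      "source": "https://de.wikipedia.org/wiki/Berlin#Intro",
      "content": "Berlin ist die Hauptstadt ...",
      "score": 0.83
    }
  ]
}
```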
diff --git a/frontend/src/types/source.d.ts b/frontend/src/types/source.d.ts
index 4e3cd7e..8d0de16 100644
--- a/frontend/src/types/source.d.ts
+++ b/frontend/src/types/source.d.ts
@@ -1,6 +1,6 @@
 export type Source = {
   id: number
-  src: string
+  source: string
   content: string
   score: number
 }
diff --git a/gswikichat/api.py b/gswikichat/api.py
index 03a3673..5b48ac3 100644
--- a/gswikichat/api.py
+++ b/gswikichat/api.py
@@ -6,14 +6,13 @@ from fastapi.staticfiles import StaticFiles
 
 from fastapi import FastAPI, Header
 
-from .rag import rag_pipeline
-
 from .logger import get_logger
+from .rag import rag_pipeline
 
 # Create logger instance from base logger config in `logger.py`
 logger = get_logger(__name__)
 
-FRONTEND_STATIC_DIR = './frontend/dist'
+FRONTEND_STATIC_DIR = "./frontend/dist"
 
 API_SECRET = os.environ.get("API_SECRET")
 
 app = FastAPI()
@@ -21,46 +20,43 @@ app.mount(
     "/assets",
     StaticFiles(directory=f"{FRONTEND_STATIC_DIR}/assets"),
-    name="frontend-assets"
+    name="frontend-assets",
 )
 
+
 @app.get("/")
 async def root():
     return FileResponse(f"{FRONTEND_STATIC_DIR}/index.html")
 
+
 @app.get("/favicon.ico")
 async def favicon():
     return FileResponse(f"{FRONTEND_STATIC_DIR}/favicon.ico")
 
+
 @app.get("/api")
 async def api(x_api_secret: Annotated[str, Header()], query, top_k=3, lang='en'):
     if not API_SECRET == x_api_secret:
         raise Exception("API key is missing or incorrect")
 
-    if not lang in ['en', 'de']:
+    if lang not in ["en", "de"]:
         raise Exception("language must be 'en' or 'de'")
 
-    logger.debug(f'{query=}')  # Assuming we change the input name
-    logger.debug(f'{top_k=}')
-    logger.debug(f'{lang=}')
+    logger.debug(f"{query=}")
+    logger.debug(f"{top_k=}")
+    logger.debug(f"{lang=}")
+
+    answer = rag_pipeline(query=query, top_k=top_k, lang=lang)
 
-    answer = rag_pipeline(
-        query=query,
-        top_k=top_k,
-        lang=lang
-    )
+    if not answer:
+        return {}
 
     sources = [
-        {
-            "src": d_.meta['src'],
-            "content": d_.content,
-            "score": d_.score
-        } for d_ in answer.documents
+        {"id": d_.id, "source": d_.meta["source"], "content": d_.content, "score": d_.score}
+        for d_ in answer.documents
     ]
 
-    logger.debug(f'{answer=}')
+    logger.debug(f"{answer.data=}")
+    logger.debug(f"{answer.documents=}")
 
-    return {
-        "answer": answer.data.content,
-        "sources": sources
-    }
+    return {"answer": answer.data.content, "sources": sources}
diff --git a/gswikichat/db.py b/gswikichat/db.py
new file mode 100644
index 0000000..cc0b3f2
--- /dev/null
+++ b/gswikichat/db.py
@@ -0,0 +1,107 @@
+import os
+
+import torch
+
+from langchain.text_splitter import CharacterTextSplitter
+from langchain_community.document_loaders import JSONLoader
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores.pgvecto_rs import PGVecto_rs
+
+from .logger import get_logger
+
+
+SENTENCE_TRANSFORMER_MODEL = "svalabs/german-gpl-adapted-covid"
+
+logger = get_logger(__name__)
+
+
+def get_device():
+    device = "cpu"
+    if torch.cuda.is_available():
+        logger.info("GPU is available.")
+        device = "cuda"
+    return device
+
+
+def get_embedding_model():
+    # https://huggingface.co/svalabs/german-gpl-adapted-covid
+    logger.info(f"Embedding model: {SENTENCE_TRANSFORMER_MODEL}")
+
+    return HuggingFaceEmbeddings(
+        model_name=SENTENCE_TRANSFORMER_MODEL,
+        model_kwargs={"device": get_device()},
+        show_progress=True,
+    )
+
+
+def get_db():
+    PORT = os.getenv("DB_PORT", 5432)
+    HOST = os.getenv("DB_HOST", "127.0.0.1")
+    USER = os.getenv("DB_USER", "gbnc")
+    PASS = os.getenv("DB_PASS", "")
+    DB_NAME = os.getenv("DB_NAME", "gbnc")
+
+    URL = "postgresql+psycopg://{username}:{password}@{host}:{port}/{db_name}".format(
+        port=PORT,
+        host=HOST,
+        username=USER,
+        password=PASS,
+        db_name=DB_NAME,
+    )
+
+    return PGVecto_rs.from_collection_name(
+        embedding=get_embedding_model(),
+        db_url=URL,
+        collection_name="gbnc",
+    )
+
+
+def import_data(file):
+    def metadata_func(record: dict, metadata: dict) -> dict:
+        metadata["source"] = record.get("meta", {}).get("source")
+        return metadata
+
+    loader = JSONLoader(
+        file_path=file,
+        jq_schema=".[]",
+        content_key="content",
+        metadata_func=metadata_func,
+    )
+
+    documents = loader.load()
+
+    logger.debug(f"Loaded {len(documents)} documents.")
+
+    text_splitter = CharacterTextSplitter(chunk_size=250, chunk_overlap=0)
+    chunks = text_splitter.split_documents(documents)
+    logger.debug(f"Split documents into {len(chunks)} chunks.")
+
+    logger.debug("Importing into database.")
+    get_db().add_documents(chunks)
+
+
+if __name__ == "__main__":
+    import sys
+
+    if len(sys.argv) > 1:
+        file = sys.argv[1]
+        import_data(file)
+
+    else:
+        logger.error(
+            """Provide a JSON file with the following structure as the first parameter:
+            [
+              {
+                "content": "document content one",
+                "meta": { "source": "https://source.url/one" }
+              },
+              {
+                "content": "document content two",
+                "meta": { "source": "https://source.url/two" }
+              }
+            ]
+            """
+        )
+        sys.exit(1)
diff --git a/gswikichat/fetch_articles.py b/gswikichat/fetch_articles.py
new file mode 100644
index 0000000..457ee12
--- /dev/null
+++ b/gswikichat/fetch_articles.py
@@ -0,0 +1,166 @@
+import os
+import re
+import sys
+import json
+import requests
+
+from bs4 import BeautifulSoup
+
+from .logger import get_logger
+
+logger = get_logger(__name__)
+
+WIKI_USER = os.environ.get('WIKI_USER')
+WIKI_PW = os.environ.get('WIKI_PW')
+
+HTML_FILTERS = {
+    'div': ['navbox', 'navbox-styles', 'spoken-wikipedia', 'noprint', 'hatnote', 'rt-tooltip', 'reflist'],
+    'span': ['mw-ext-cite-error'],
+    'table': ['noprint', 'ombox'],
+    'ol': ['breadcrumb-nav-container', 'references'],
+    'sup': ['reference']
+}
+SECTION_FILTERS = ['Siehe auch', 'See also', 'Weblinks', 'Anmerkungen', 'Notes']
+REGEX_FILTERS = {
+    'p': '→.*ersion'
+}
+
+
+def filterHtml(soup):
+    for figure in soup.find_all('figure'):
+        figure.decompose()
+
+    for tag, classes in HTML_FILTERS.items():
+        for className in classes:
+            for div in soup.find_all(tag, {'class': className}):
+                div.decompose()
+
+    for tag, regex in REGEX_FILTERS.items():
+        for element in soup.find_all(tag):
+            if re.search(regex, str(element)) is not None:
+                element.decompose()
+
+    return soup
+
+
+def fetchFromWiki(url, titles, loginRequired):
+    if loginRequired:
+        session = loginToWiki(url)
+    else:
+        session = requests.Session()
+
+    articles = {}
+    for title in titles:
+        sections = fetchSections(url, title, session.cookies)
+        print(f"fetching {len(sections)} sections for article {title}", file=sys.stderr)
+        for section in [{'index': 0, 'line': 'Intro', 'linkAnchor': '', 'anchor': ''}] + sections:
+            if section['index'] == '' or section['line'] in SECTION_FILTERS:
+                continue
+
+            query = {
+                'action': 'parse',
+                'page': title,
+                'format': 'json',
+                'prop': 'text',
+                'disabletoc': True,
+                'disablelimitreport': True,
+                'disableeditsection': True,
+                'section': section['index']
+            }
+            section_html = requests.get(url, params=query, cookies=session.cookies).json()['parse']['text']['*']
+            section_soup = BeautifulSoup(section_html, 'lxml')
+            articles[title + '#' + section['anchor']] = filterHtml(section_soup).get_text()
+
+    return articles
+
+
+def fetchSections(url, title, cookies=None):
+    query = {
+        'action': 'parse',
+        'page': title,
+        'format': 'json',
+        'prop': 'sections'
+    }
+    sectionsResponse = requests.get(url, params=query, cookies=cookies)
+    toplevelSections = [section for section in sectionsResponse.json()['parse']['sections'] if section['toclevel'] == 1]
+    return toplevelSections
+
+
+def loginToWiki(url):
+    session = requests.Session()
+
+    tokenQuery = {'action': 'query', 'meta': 'tokens', 'type': 'login', 'format': 'json'}
+    token = session.get(url, params=tokenQuery).json()['query']['tokens']['logintoken']
+    loginData = {
+        'lgname': WIKI_USER,
+        'lgpassword': WIKI_PW,
+        'lgtoken': token,
+        'action': 'login',
+        'format': 'json'
+    }
+    response = session.post(url, data=loginData, headers={'Content-Type': 'application/x-www-form-urlencoded'})
+    # TODO: error handling in case of login failure
+    return session
+
+
+def fetch_articles(toc):
+    articles = []
+    for wiki in toc:
+        url = wiki['host'] + wiki['api_path']
+        wikiArticles = fetchFromWiki(url, wiki['titles'], wiki['login'])
+
+        articles.append({
+            'wiki': wiki['name'],
+            'url': wiki['host'],
+            'lang': wiki['lang'],
+            'articles': wikiArticles
+        })
+
+    return articles
+
+
+def transform_articles(articles):
+    output = {}
+    for wiki in articles:
+        url = wiki.get("url") + "/wiki/"
+        wiki_articles = wiki.get("articles")
+        for name, content in wiki_articles.items():
+            output[url + name] = content
+    return output
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        file = sys.argv[1]
+        with open(file) as f:
+            data = json.load(f)
+
+        articles = fetch_articles(data)
+        print(json.dumps(transform_articles(articles), indent=4))
+
+    else:
+        logger.error(
+            """Provide a JSON file with the following structure as the first parameter:
+            [
+              {
+                "name": "Name of the wiki",
+                "host": "https://somewiki.org",
+                "api_path": "/w/api.php",
+                "lang": "en",
+                "login": false,
+                "titles": [
+                  "Namespace:Page1",
+                  "Namespace:Page2"
+                ]
+              },
+              {
+                "name": "Name of another wiki",
+                "host": "https://someotherwiki.org",
+                "api_path": "/w/api.php",
+                "lang": "de",
+                "login": false,
+                "titles": [
+                  "Namespace:SeiteEins",
+                  "Namespace:SeiteZwei"
+                ]
+              }
+            ]
+            """
+        )
+        sys.exit(1)
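Run as a module, the script writes a flat JSON object to stdout, mapping page URL plus section anchor to the extracted text (progress messages go to stderr so the output stays clean), e.g. (contents invented):

```json
{
  "https://somewiki.org/wiki/Namespace:Page1#Intro": "Extracted intro text ...",
  "https://somewiki.org/wiki/Namespace:Page1#History": "Extracted section text ..."
}
```

The README's `jq 'to_entries | map({content: .value, meta: {source: .key}})'` filter turns exactly this shape into the list of `{content, meta: {source}}` objects that `gswikichat.db` imports. To confirm an import landed, the store can be queried directly; a minimal sketch, assuming the database env vars from `start.sh` are set and using an illustrative query string:

```python
from gswikichat.db import get_db

# Returns up to 3 (document, score) pairs, embedded with the same
# svalabs model that was used at import time.
for doc, score in get_db().similarity_search_with_score("example query", 3):
    print(f"{score:.3f}  {doc.metadata['source']}")
```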
diff --git a/gswikichat/llm_config.py b/gswikichat/llm_config.py
index 5af6c1c..67459c2 100644
--- a/gswikichat/llm_config.py
+++ b/gswikichat/llm_config.py
@@ -10,14 +10,14 @@
 OLLAMA_URL = os.environ.get("OLLAMA_URL")
 OLLAMA_CHAT_URL = f"{OLLAMA_URL}/api/chat"
 
-logger.info(f'Using {OLLAMA_MODEL_NAME=}')
-logger.info(f'Endpoint: {OLLAMA_URL=}')
-logger.info(f'Generate: {OLLAMA_CHAT_URL=}')
+def get_llm():
+    logger.info(f'Using {OLLAMA_MODEL_NAME=}')
+    logger.info(f'Endpoint: {OLLAMA_URL=}')
+    logger.info(f'Generate: {OLLAMA_CHAT_URL=}')
+    logger.info(f"Setting up ollama with {OLLAMA_MODEL_NAME}")
 
-logger.info(f"Setting up ollama with {OLLAMA_MODEL_NAME}")
-
-llm = OllamaChatGenerator(
-    model=OLLAMA_MODEL_NAME,
-    url=OLLAMA_CHAT_URL,
-    timeout=120
-)
+    return OllamaChatGenerator(
+        model=OLLAMA_MODEL_NAME,
+        url=OLLAMA_CHAT_URL,
+        timeout=120
+    )
diff --git a/gswikichat/rag.py b/gswikichat/rag.py
index b916686..d9e33f1 100644
--- a/gswikichat/rag.py
+++ b/gswikichat/rag.py
@@ -3,52 +3,54 @@
 from haystack.components.builders.answer_builder import AnswerBuilder
 from haystack.dataclasses import ChatMessage
 
-from .llm_config import llm
+from .db import get_db
+from .llm_config import get_llm
 from .logger import get_logger
 from .prompt import user_prompt_builders, system_prompts
-from .vector_store_interface import embedder, retriever, input_documents
 
 # Create logger instance from base logger config in `logger.py`
 logger = get_logger(__name__)
 
 
-def rag_pipeline(query: str, top_k: int = 3, lang: str = 'de'):
+def langchain_to_haystack_doc(langchain_document, score):
+    return Document.from_dict(
+        {
+            "content": langchain_document.page_content,
+            "meta": langchain_document.metadata,
+            "score": score,
+        }
+    )
 
-    query_document = Document(content=query)
-    query_embedded = embedder.run([query_document])
-    query_embedding = query_embedded['documents'][0].embedding
 
-    retriever_results = retriever.run(
-        query_embedding=list(query_embedding),
-        filters=None,
-        top_k=top_k,
-        scale_score=None,
-        return_embedding=None
-    )
+def rag_pipeline(query: str, top_k: int, lang: str):
+    docs_with_score = get_db().similarity_search_with_score(query, top_k)
+    docs_with_score.reverse()  # best first
 
-    logger.debug('retriever results:')
-    for retriever_result_ in retriever_results:
-        logger.debug(retriever_result_)
+    if len(docs_with_score) == 0:
+        return None
 
-    system_prompt = system_prompts[lang]
-    user_prompt_builder = user_prompt_builders[lang]
+    logger.debug("Matching documents:")
+    for doc, score in docs_with_score:
+        logger.debug("-" * 80)
+        logger.debug(f"Score: {score}")
+        logger.debug(doc.page_content)
 
-    user_prompt_build = user_prompt_builder.run(
-        question=query_document.content,
-        documents=retriever_results['documents']
-    )
+    docs = [langchain_to_haystack_doc(d, s) for d, s in docs_with_score]
 
-    prompt = user_prompt_build['prompt']
+    system_prompt = system_prompts[lang]
+    user_prompt_builder = user_prompt_builders[lang]
+    user_prompt_build = user_prompt_builder.run(question=query, documents=docs)
+    prompt = user_prompt_build["prompt"]
 
-    logger.debug(f'{prompt=}')
+    logger.debug(f"{prompt=}")
 
     messages = [
         ChatMessage.from_system(system_prompt),
         ChatMessage.from_user(prompt),
     ]
 
-    response = llm.run(
-        messages,
+    response = get_llm().run(
+        messages,
         # generation_kwargs={"temperature": 0.2}
     )
 
@@ -56,14 +58,14 @@
     answer_builder = AnswerBuilder()
 
     answer_build = answer_builder.run(
-        query=query_document.content,
-        replies=response['replies'],
-        meta=[r.meta for r in response['replies']],
-        documents=retriever_results['documents'],
+        query=query,
+        replies=response["replies"],
+        meta=[r.meta for r in response["replies"]],
+        documents=docs,
         pattern=None,
-        reference_pattern=None
+        reference_pattern=None,
    )
 
-    logger.debug(f'{answer_build=}')
+    logger.debug(f"{answer_build=}")
 
-    return answer_build['answers'][0]
+    return answer_build["answers"][0]
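Because the store and the LLM are now created lazily via `get_db()` and `get_llm()`, the pipeline can also be exercised without starting the API server; a minimal sketch (the attribute accesses mirror those in `api.py`, and the query is illustrative):

```python
from gswikichat.rag import rag_pipeline

answer = rag_pipeline(query="Was ist die Hauptstadt von Deutschland?", top_k=3, lang="de")
if answer:  # rag_pipeline returns None when nothing matched
    print(answer.data.content)        # the generated reply
    for doc in answer.documents:      # the retrieved context
        print(doc.score, doc.meta["source"])
```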
diff --git a/gswikichat/vector_store_interface.py b/gswikichat/vector_store_interface.py
deleted file mode 100644
index 95d52db..0000000
--- a/gswikichat/vector_store_interface.py
+++ /dev/null
@@ -1,138 +0,0 @@
-import os
-import json
-
-from tqdm import tqdm
-
-from haystack import Document  # , Pipeline
-from haystack.components.embedders import SentenceTransformersDocumentEmbedder
-from haystack.document_stores.in_memory import InMemoryDocumentStore
-from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
-from haystack.document_stores.types.policy import DuplicatePolicy
-from haystack.components.preprocessors import DocumentSplitter
-from haystack.components.preprocessors import DocumentCleaner
-
-import torch
-
-from .logger import get_logger
-
-# Create logger instance from base logger config in `logger.py`
-logger = get_logger(__name__)
-
-HUGGING_FACE_HUB_TOKEN = os.environ.get('HUGGING_FACE_HUB_TOKEN')
-
-# disable this line to disable the embedding cache
-EMBEDDING_CACHE_FILE = '/root/.cache/gbnc_embeddings.json'
-
-top_k = 5
-input_documents = []
-
-device = "cpu"
-
-if torch.cuda.is_available():
-    logger.info('GPU is available.')
-    device = "cuda"
-
-
-# TODO: Add the json strings as env variables
-json_dir = 'json_input'
-json_fname = 'excellent-articles_10.json'
-
-json_fpath = os.path.join(json_dir, json_fname)
-
-if os.path.isfile(json_fpath):
-    logger.info(f'Loading data from {json_fpath}')
-    with open(json_fpath, 'r') as finn:
-        json_obj = json.load(finn)
-
-    if isinstance(json_obj, dict):
-        input_documents = [
-            Document(
-                content=content_,
-                meta={"src": url_}
-            )
-            for url_, content_ in tqdm(json_obj.items())
-        ]
-    elif isinstance(json_obj, list):
-        input_documents = [
-            Document(
-                content=obj_['content'],
-                meta={'src': obj_['meta']}
-            )
-            for obj_ in tqdm(json_obj)
-        ]
-else:
-    input_documents = [
-        Document(
-            content="My name is Asra, I live in Paris.",
-            meta={"src": "doc_1"}
-        ),
-        Document(
-            content="My name is Lee, I live in Berlin.",
-            meta={"src": "doc2"}
-        ),
-        Document(
-            content="My name is Giorgio, I live in Rome.",
-            meta={"src": "doc_3"}
-        ),
-    ]
-
-splitter = DocumentSplitter(
-    split_by="sentence",
-    split_length=5,
-    split_overlap=0
-)
-input_documents = splitter.run(input_documents)['documents']
-
-cleaner = DocumentCleaner(
-    remove_empty_lines=True,
-    remove_extra_whitespaces=True,
-    remove_repeated_substrings=False
-)
-input_documents = cleaner.run(input_documents)['documents']
-
-
-document_store = InMemoryDocumentStore(
-    embedding_similarity_function="cosine",
-    # embedding_dim=768,
-    # duplicate_documents="overwrite"
-)
-
-# https://huggingface.co/svalabs/german-gpl-adapted-covid
-sentence_transformer_model = 'svalabs/german-gpl-adapted-covid'
-logger.info(f'Sentence Transformer Name: {sentence_transformer_model}')
-
-embedder = SentenceTransformersDocumentEmbedder(
-    model=sentence_transformer_model,
-    device=device
-)
-embedder.warm_up()
-
-
-if EMBEDDING_CACHE_FILE and os.path.isfile(EMBEDDING_CACHE_FILE):
-    logger.info('Loading embeddings from cache')
-
-    with open(EMBEDDING_CACHE_FILE, 'r') as f_in:
-        documents_dict = json.load(f_in)
-        document_store.write_documents(
-            documents=[Document.from_dict(d_) for d_ in documents_dict],
-            policy=DuplicatePolicy.OVERWRITE
-        )
-
-else:
-    logger.debug("Generating embeddings")
-
-    embedded = embedder.run(input_documents)
-    document_store.write_documents(
-        documents=embedded['documents'],
-        policy=DuplicatePolicy.OVERWRITE
-    )
-
-    if EMBEDDING_CACHE_FILE:
-        with open(EMBEDDING_CACHE_FILE, 'w') as f_out:
-            documents_dict = [
-                Document.to_dict(d_)
-                for d_ in embedded['documents']
-            ]
-            json.dump(documents_dict, f_out)
-
-retriever = InMemoryEmbeddingRetriever(document_store=document_store)
diff --git a/requirements.txt b/requirements.txt
index 723011a..e304443 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -45,3 +45,11 @@ uvicorn==0.27.0
 uvloop==0.19.0
 watchfiles==0.21.0
 websockets==12.0
+pgvecto-rs==0.1.4
+langchain==0.1.9
+langchain-community==0.0.24
+langchain-core==0.1.26
+jq==1.6.0
+psycopg==3.1.18
+beautifulsoup4==4.12.3
+lxml==5.1.0
diff --git a/start.sh b/start.sh
index 685bb91..0e9935a 100644
--- a/start.sh
+++ b/start.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 
 set -e
+set -x
 
 function generate_random_string() {
     length=32
@@ -27,6 +28,38 @@ else
     echo "No public key provided, skipping ssh setup"
 fi
 
+
+echo "Setting up postgres database server with pgvecto.rs extension"
+
+service postgresql start
+
+su postgres <<'EOF'
+psql -c 'ALTER SYSTEM SET shared_preload_libraries = "vectors.so"'
+psql -c 'ALTER SYSTEM SET search_path TO "$user", public, vectors'
+EOF
+
+service postgresql restart
+
+cat > ~/.env <<EOF
+export DB_USER=gbnc
+export DB_PASS=$(generate_random_string)
+export DB_NAME=gbnc
+EOF
+
+source ~/.env
+echo "source ~/.env" >> ~/.bashrc
+
+su --preserve-environment postgres <<'EOF'
+psql -c "CREATE EXTENSION vectors;"
+psql -c "ALTER USER $DB_USER WITH PASSWORD '$DB_PASS';"
+psql -c "CREATE DATABASE $DB_NAME OWNER $DB_USER;"
+EOF
+
+
 echo "Starting ollama"
 ollama serve &
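Once `start.sh` has completed, the database setup can be sanity-checked from inside the container, following the README's `docker exec` convention (a sketch; output will vary):

```
$ docker exec -it gbnc bash
# su postgres -c "psql -c 'SELECT extname, extversion FROM pg_extension;'"
# su postgres -c "psql -c '\l gbnc'"
```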