-
Notifications
You must be signed in to change notification settings - Fork 1
wip: Integration #23
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
wip: Integration #23
Changes from 12 commits
ff3fe68
38a3bf9
3fb6fd0
a4c7294
d38c5f0
dc4501a
42cdcc5
13bc12e
4933a9a
b23833b
da1017b
41ff046
4e69697
10103c6
0ee6ed5
7a2c955
0a5e2be
332e3dc
6225fcc
4877807
da9859d
291aaaf
936d83e
1b88437
3e0b8f4
edf5eb2
63baf2b
56a7b8c
8e05473
8276e35
22b04d0
10f6b21
bfbd245
7b6ba0a
0428f87
8104dde
fbc4591
caecfd1
5c0b4d0
8833af7
9ee8a32
b2357e3
b518abf
69800b0
7803649
ff1fcab
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,2 +1 @@ | ||
| from .api import * | ||
| # from .haystack2beta_tutorial_InMemoryEmbeddingRetriever import * |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,7 +2,6 @@ | |
| from fastapi.staticfiles import StaticFiles | ||
| from fastapi import FastAPI | ||
|
|
||
| # from .rag import rag_pipeline | ||
| from .rag import embedder, retriever, prompt_builder, llm, answer_builder | ||
| from haystack import Document | ||
|
|
||
|
|
@@ -22,50 +21,49 @@ async def root(): | |
|
|
||
| @app.get("/api") | ||
| async def api(q): | ||
| print("query: ", q) | ||
|
|
||
| embedder, retriever, prompt_builder, llm, answer_builder | ||
|
|
||
| # query = "How many languages are there?" | ||
| query = Document(content=q) | ||
rti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| result = embedder.run([query]) | ||
| queryEmbedded = embedder.run([query]) | ||
rti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| queryEmbedding = queryEmbedded['documents'][0].embedding | ||
rti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| results = retriever.run( | ||
| query_embedding=list(result['documents'][0].embedding), | ||
| retrieverResults = retriever.run( | ||
rti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| query_embedding=list(queryEmbedding), | ||
| filters=None, | ||
| top_k=None, | ||
| top_k=3, | ||
rti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| scale_score=None, | ||
| return_embedding=None | ||
| ) | ||
| # .run( | ||
| # result['documents'][0].embedding | ||
| # ) | ||
|
|
||
| prompt = prompt_builder.run(documents=results['documents'])['prompt'] | ||
| print("retriever results:") | ||
|
||
| for retrieverResult in retrieverResults: | ||
rti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| print(retrieverResult) | ||
|
||
|
|
||
| response = llm.run(prompt=prompt, generation_kwargs=None) | ||
| # reply = response['replies'][0] | ||
| promptBuild = prompt_builder.run(question=q, documents=retrieverResults['documents']) | ||
| prompt = promptBuild['prompt'] | ||
|
||
|
|
||
| print("prompt: ", prompt) | ||
|
|
||
| # rag_pipeline.connect("llm.replies", "answer_builder.replies") | ||
| # rag_pipeline.connect("llm.metadata", "answer_builder.meta") | ||
| # rag_pipeline.connect("retriever", "answer_builder.documents") | ||
| response = llm.run(prompt=prompt, generation_kwargs=None) | ||
|
|
||
| results = answer_builder.run( | ||
| answerBuild = answer_builder.run( | ||
rti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| query=q, | ||
| replies=response['replies'], | ||
| meta=response['meta'], | ||
| documents=results['documents'], | ||
| documents=retrieverResults['documents'], | ||
| pattern=None, | ||
| reference_pattern=None | ||
| ) | ||
| print("answerBuild", answerBuild) | ||
|
||
|
|
||
| answer = answerBuild['answers'][0] | ||
rti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| sources= [{ "src": d.meta['src'], "content": d.content, "score": d.score } for d in answer.documents] | ||
rti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| answer = results['answers'][0] | ||
| print("answer", answer) | ||
|
||
|
|
||
| return { | ||
| "answer": answer.data, | ||
| "sources": [{ | ||
| "src": d.meta['src'], | ||
| "content": d.content, | ||
| "score": d.score | ||
| } for d in answer.documents] | ||
| "sources": sources | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,24 +1,25 @@ | ||
| import os | ||
| import json | ||
|
|
||
| # from sentence_transformers import SentenceTransformer | ||
| from tqdm import tqdm | ||
|
|
||
| from haystack import Document # , Pipeline | ||
| from haystack.components.embedders import SentenceTransformersDocumentEmbedder | ||
| # from haystack.components.embedders import SentenceTransformersTextEmbedder | ||
| from haystack.document_stores.in_memory import InMemoryDocumentStore | ||
| # from haystack.components.retrievers.in_memory import InMemoryBM25Retriever | ||
| from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever | ||
| # from haystack.components.writers import DocumentWriter | ||
| from haystack.document_stores.types.policy import DuplicatePolicy | ||
| from haystack.components.preprocessors import DocumentSplitter | ||
rti marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| from haystack.components.preprocessors import DocumentCleaner | ||
|
|
||
| HUGGING_FACE_HUB_TOKEN = os.environ.get('HUGGING_FACE_HUB_TOKEN') | ||
| EMBEDDING_CACHE_FILE = '/tmp/gbnc_embeddings.json' | ||
exowanderer marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| top_k = 5 | ||
| input_documents = [] | ||
|
|
||
| json_dir = 'json_input' | ||
| json_fname = 'excellent-articles_10_paragraphs.json' | ||
| json_fname = 'excellent-articles_10.json' | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Have we tried our existing package with the full 2000 articles? |
||
|
|
||
| json_fpath = os.path.join(json_dir, json_fname) | ||
|
|
||
| if os.path.isfile(json_fpath): | ||
|
|
@@ -30,11 +31,11 @@ | |
| for k, v in tqdm(json_obj.items()): | ||
|
||
| print(f"Loading {k}") | ||
|
||
| input_documents.append(Document(content=v, meta={"src": k})) | ||
|
||
|
|
||
| elif isinstance(json_obj, list): | ||
| for obj_ in tqdm(json_obj): | ||
|
||
| url = obj_['meta'] | ||
| content = obj_['content'] | ||
|
|
||
| input_documents.append( | ||
| Document( | ||
| content=content, | ||
|
|
@@ -57,112 +58,55 @@ | |
| ), | ||
| ] | ||
|
|
||
| # Write documents to InMemoryDocumentStore | ||
| # cleaner = DocumentCleaner( | ||
| # remove_empty_lines=True, | ||
| # remove_extra_whitespaces=True, | ||
| # remove_repeated_substrings=False) | ||
| # input_documents = cleaner.run(input_documents)['documents'] | ||
|
|
||
| splitter = DocumentSplitter(split_by="sentence", split_length=20, split_overlap=0) | ||
| input_documents = splitter.run(input_documents)['documents'] | ||
|
|
||
| document_store = InMemoryDocumentStore( | ||
| embedding_similarity_function="cosine", | ||
| # embedding_dim=768, | ||
| # duplicate_documents="overwrite" | ||
| ) | ||
| # document_store.write_documents(input_documents) | ||
|
|
||
| # TODO Introduce Jina.AI from HuggingFace. Establish env-variable for trust_... | ||
|
|
||
| # basic_transformer_models = [ | ||
| # "all-MiniLM-L6-v2", | ||
| # "xlm-clm-ende-1024", | ||
| # "xlm-mlm-ende-1024", | ||
| # "bert-base-german-cased", | ||
| # "bert-base-german-dbmdz-cased", | ||
| # "bert-base-german-dbmdz-uncased", | ||
| # "distilbert-base-german-cased", | ||
| # "xlm-roberta-large-finetuned-conll03-german", | ||
| # "deutsche-telekom/gbert-large-paraphrase-cosine" | ||
| # ] | ||
|
|
||
| # https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 | ||
| # sentence_transformer_model = "all-MiniLM-L6-v2" | ||
| # 3 minutes to batch 82 | ||
|
|
||
| # https://huggingface.co/deutsche-telekom/gbert-large-paraphrase-cosine | ||
| # sentence_transformer_model = 'deutsche-telekom/gbert-large-paraphrase-cosine' | ||
| # 76 minutes to batch 82 | ||
|
|
||
| # https://huggingface.co/jinaai/jina-embeddings-v2-base-de | ||
| # sentence_transformer_model = 'jinaai/jina-embeddings-v2-base-de' | ||
| # Cannot find or load the embedding model | ||
| # Unknown minutes to batch 82 | ||
|
|
||
| # https://huggingface.co/aari1995/German_Semantic_STS_V2 | ||
| # sentence_transformer_model = 'aari1995/German_Semantic_STS_V2' | ||
| # 75 minutes to batch 82 | ||
|
|
||
| # https://huggingface.co/Sahajtomar/German-semantic | ||
| # sentence_transformer_model = 'Sahajtomar/German-semantic' | ||
| # 72 minutes to batch 82 | ||
|
|
||
| # https://huggingface.co/svalabs/german-gpl-adapted-covid | ||
| sentence_transformer_model = 'svalabs/german-gpl-adapted-covid' | ||
| # 2 minutes to batch 82 | ||
|
|
||
| # https://huggingface.co/PM-AI/bi-encoder_msmarco_bert-base_german | ||
| # sentence_transformer_model = 'PM-AI/bi-encoder_msmarco_bert-base_german' | ||
| # 14 minutes to batch 82 | ||
|
|
||
| # https://huggingface.co/JoBeer/german-semantic-base | ||
| # sentence_transformer_model = 'JoBeer/german-semantic-base' | ||
| # 22 minutes to batch 82 | ||
|
|
||
| print(f'Sentence Transformer Name:{sentence_transformer_model}') | ||
| print(f'Sentence Transformer Name: {sentence_transformer_model}') | ||
rti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| embedder = SentenceTransformersDocumentEmbedder( | ||
| model=sentence_transformer_model, | ||
| # model="T-Systems-onsite/german-roberta-sentence-transformer-v2", | ||
| # model="jinaai/jina-embeddings-v2-base-de", | ||
| # token=HUGGING_FACE_HUB_TOKEN | ||
| ) | ||
|
|
||
| # hg_embedder = SentenceTransformer( | ||
| # "jinaai/jina-embeddings-v2-base-de", | ||
| # token=HUGGING_FACE_HUB_TOKEN | ||
| # ) | ||
|
|
||
| embedder.warm_up() | ||
|
|
||
| documents_with_embeddings = embedder.run(input_documents) | ||
| # documents_with_embeddings = embedder.encode(input_documents) | ||
|
|
||
|
|
||
| # print('\n\n') | ||
| # # print(documents_with_embeddings['documents']) | ||
| # print(type(documents_with_embeddings['documents'])) | ||
| # print(len(documents_with_embeddings['documents'])) | ||
| # print(dir(documents_with_embeddings['documents'][0])) | ||
| # print('\n\n') | ||
| # print(type(embedder.model)) | ||
| # print('\n\n') | ||
| # # print(dir(hg_embedder)) | ||
|
|
||
|
|
||
| document_store.write_documents( | ||
| documents=documents_with_embeddings['documents'], | ||
| policy=DuplicatePolicy.OVERWRITE | ||
| ) | ||
| # if os.path.isfile(EMBEDDING_CACHE_FILE): | ||
| # print("[INFO] Loading embeddings from cache") | ||
| # | ||
| # with open(EMBEDDING_CACHE_FILE, 'r') as f: | ||
| # documentsDict = json.load(f) | ||
| # document_store.write_documents( | ||
| # documents=[Document.from_dict(d) for d in documentsDict], | ||
| # policy=DuplicatePolicy.OVERWRITE | ||
| # ) | ||
| # | ||
| # else: | ||
| if True: | ||
rti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| embedded = embedder.run(input_documents) | ||
| document_store.write_documents( | ||
| documents=embedded['documents'], | ||
| policy=DuplicatePolicy.OVERWRITE | ||
| ) | ||
|
|
||
| with open(EMBEDDING_CACHE_FILE, 'w') as f: | ||
| documentsDict = [Document.to_dict(d) for d in embedded['documents']] | ||
| json.dump(documentsDict, f) | ||
|
|
||
| retriever = InMemoryEmbeddingRetriever( | ||
| # embedding_model="sentence-transformers/all-MiniLM-L6-v2", | ||
| document_store=document_store, | ||
| top_k=top_k | ||
| ) | ||
|
|
||
| # writer = DocumentWriter(document_store=document_store) | ||
|
|
||
| # indexing_pipeline = Pipeline() | ||
| # indexing_pipeline.add_component("embedder", embedder) | ||
| # indexing_pipeline.add_component("writer", writer) | ||
| # indexing_pipeline.connect("embedder", "writer") | ||
| # indexing_pipeline.run( | ||
| # { | ||
| # "embedder": {"documents": input_documents} | ||
| # } | ||
| # ) | ||
Uh oh!
There was an error while loading. Please reload this page.