Skip to content

Commit 38a558d

Browse files
exowanderer authored and rti committed
Updated with PEP8 formatting in vector_store_interface.py
1 parent 9ab649f commit 38a558d

File tree

3 files changed

+46
-49
lines changed

3 files changed

+46
-49
lines changed

gswikichat/llm_config.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
# Create logger instance from base logger config in `logger.py`
77
logger = get_logger(__name__)
88

9-
109
OLLAMA_MODEL_NAME = os.environ.get("OLLAMA_MODEL_NAME")
1110
OLLAMA_URL = os.environ.get("OLLAMA_URL")
1211
OLLAMA_GENERATE_URL = f"{OLLAMA_URL}/api/generate"

gswikichat/prompt.py

Lines changed: 4 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,5 @@
11
from haystack.components.builders.prompt_builder import PromptBuilder
22

3-
# prompt_template = """
4-
# Given these documents, answer the question. Answer in a full sentence. Give the response only, no explanation. Don't mention the documents.
5-
# Documents:
6-
# {% for doc in documents %}
7-
# If {{ doc.content }} answers the Question: {{question}}
8-
# Then return {{ doc.meta["src"] }}
9-
# {% endfor %}
10-
# """
11-
123
prompt_template_en = """
134
<|system|>
145
You are a helpful assistant. You answer questions based on the given documents.
@@ -17,8 +8,8 @@
178
<|endoftext|>
189
<|user|>
1910
Documents:
20-
{% for doc in documents %}
21-
{{ doc.content }}
11+
{% for doc_ in documents %}
12+
{{ doc_.content }}
2213
{% endfor %}
2314
With this documents, answer the following question: {{question}}
2415
<|endoftext|>
@@ -33,22 +24,14 @@
3324
<|endoftext|>
3425
<|user|>
3526
Dokumente:
36-
{% for doc in documents %}
37-
{{ doc.content }}
27+
{% for doc_ in documents %}
28+
{{ doc_.content }}
3829
{% endfor %}
3930
Mit diesen Dokumenten, beantworte die folgende Frage: {{question}}
4031
<|endoftext|>
4132
<|assistant|>
4233
"""
4334

44-
# prompt_template = """
45-
# Given these documents, answer the question. Answer in a full sentence. Give the response only, no explanation. Don't mention the documents.
46-
# Documents:
47-
# If {{ doc.content }} answers the Question: {{question}}
48-
# Then only return {{ doc.meta["src"] }} and nothing at all.
49-
# {% endfor %}
50-
# """
51-
5235
prompt_builders = {
5336
'en': PromptBuilder(template=prompt_template_en),
5437
'de': PromptBuilder(template=prompt_template_de),

gswikichat/vector_store_interface.py

Lines changed: 42 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@
1111
from haystack.components.preprocessors import DocumentSplitter
1212
from haystack.components.preprocessors import DocumentCleaner
1313

14+
15+
from .logger import get_logger
16+
17+
# Create logger instance from base logger config in `logger.py`
18+
logger = get_logger(__name__)
19+
1420
HUGGING_FACE_HUB_TOKEN = os.environ.get('HUGGING_FACE_HUB_TOKEN')
1521

1622
# disable this line to disable the embedding cache
@@ -19,31 +25,33 @@
1925
top_k = 5
2026
input_documents = []
2127

28+
# TODO: Add the json strings as env variables
2229
json_dir = 'json_input'
2330
json_fname = 'excellent-articles_10.json'
2431

2532
json_fpath = os.path.join(json_dir, json_fname)
2633

2734
if os.path.isfile(json_fpath):
28-
print(f'[INFO] Loading data from {json_fpath}')
35+
logger.info(f'Loading data from {json_fpath}')
2936
with open(json_fpath, 'r') as finn:
3037
json_obj = json.load(finn)
3138

3239
if isinstance(json_obj, dict):
33-
for k, v in tqdm(json_obj.items()):
34-
print(f"Loading {k}")
35-
input_documents.append(Document(content=v, meta={"src": k}))
36-
40+
input_documents = [
41+
Document(
42+
content=content_,
43+
meta={"src": url_}
44+
)
45+
for url_, content_ in tqdm(json_obj.items())
46+
]
3747
elif isinstance(json_obj, list):
38-
for obj_ in tqdm(json_obj):
39-
url = obj_['meta']
40-
content = obj_['content']
41-
input_documents.append(
42-
Document(
43-
content=content,
44-
meta={'src': url}
45-
)
48+
input_documents = [
49+
Document(
50+
content=obj_['content'],
51+
meta={'src': obj_['meta']}
4652
)
53+
for obj_ in tqdm(json_obj)
54+
]
4755
else:
4856
input_documents = [
4957
Document(
@@ -60,13 +68,18 @@
6068
),
6169
]
6270

63-
splitter = DocumentSplitter(split_by="sentence", split_length=5, split_overlap=0)
71+
splitter = DocumentSplitter(
72+
split_by="sentence",
73+
split_length=5,
74+
split_overlap=0
75+
)
6476
input_documents = splitter.run(input_documents)['documents']
6577

6678
cleaner = DocumentCleaner(
67-
remove_empty_lines=True,
68-
remove_extra_whitespaces=True,
69-
remove_repeated_substrings=False)
79+
remove_empty_lines=True,
80+
remove_extra_whitespaces=True,
81+
remove_repeated_substrings=False
82+
)
7083
input_documents = cleaner.run(input_documents)['documents']
7184

7285

@@ -78,7 +91,7 @@
7891

7992
# https://huggingface.co/svalabs/german-gpl-adapted-covid
8093
sentence_transformer_model = 'svalabs/german-gpl-adapted-covid'
81-
print(f'Sentence Transformer Name: {sentence_transformer_model}')
94+
logger.info(f'Sentence Transformer Name: {sentence_transformer_model}')
8295

8396
embedder = SentenceTransformersDocumentEmbedder(
8497
model=sentence_transformer_model,
@@ -87,17 +100,17 @@
87100

88101

89102
if EMBEDDING_CACHE_FILE and os.path.isfile(EMBEDDING_CACHE_FILE):
90-
print("[INFO] Loading embeddings from cache")
103+
logger.info('Loading embeddings from cache')
91104

92-
with open(EMBEDDING_CACHE_FILE, 'r') as f:
93-
documentsDict = json.load(f)
105+
with open(EMBEDDING_CACHE_FILE, 'r') as f_in:
106+
documents_dict = json.load(f_in)
94107
document_store.write_documents(
95-
documents=[Document.from_dict(d) for d in documentsDict],
108+
documents=[Document.from_dict(d_) for d_ in documents_dict],
96109
policy=DuplicatePolicy.OVERWRITE
97110
)
98111

99112
else:
100-
print("[INFO] Generating embeddings")
113+
logger.debug("Generating embeddings")
101114

102115
embedded = embedder.run(input_documents)
103116
document_store.write_documents(
@@ -106,9 +119,11 @@
106119
)
107120

108121
if EMBEDDING_CACHE_FILE:
109-
with open(EMBEDDING_CACHE_FILE, 'w') as f:
110-
documentsDict = [Document.to_dict(d) for d in embedded['documents']]
111-
json.dump(documentsDict, f)
122+
with open(EMBEDDING_CACHE_FILE, 'w') as f_out:
123+
documents_dict = [
124+
Document.to_dict(d_)
125+
for d_ in embedded['documents']
126+
]
127+
json.dump(documents_dict, f_out)
112128

113129
retriever = InMemoryEmbeddingRetriever(document_store=document_store)
114-

0 commit comments

Comments
 (0)