
Commit cbe263d

Fix / update LLM Complete Guide (side quest) (#134)
* remove extra step invocation
* update requirements
* fixes and updates
* update chunking pipeline logic
* handle JSON changes
* formatting
* add a typos.toml
* update typos.toml
* move typos.toml to repo root
1 parent 42f952a commit cbe263d

12 files changed: +171 / -89 lines


.typos.toml

Lines changed: 26 additions & 27 deletions
@@ -1,37 +1,36 @@
 [files]
 extend-exclude = [
-    "*.csv",
-    "sign-language-detection-yolov5/*",
-    "orbit-user-analysis/steps/report.py",
-    "customer-satisfaction/pipelines/deployment_pipeline.py",
-    "customer-satisfaction/streamlit_app.py",
-    "nba-pipeline/Building and Using An MLOPs Stack With ZenML.ipynb",
-    "customer-satisfaction/tests/data_test.py",
-    "end-to-end-computer-vision/**/*.ipynb",
-    "classifier-e2e/run_skip_basics.ipynb",
-    "classifier-e2e/run_full.ipynb",
-    "classifier-e2e/run_skip_basics.ipynb",
-    "classifier-e2e/run_full.ipynb",
-    "classifier-e2e/run_skip_basics.ipynb"
+    "*.json",
+    "*.js",
+    "*.ipynb",
 ]

 [default.extend-identifiers]
-# HashiCorp = "HashiCorp"
-connexion = "connexion"
-preprocesser = "preprocesser"
-Preprocesser = "Preprocesser"
+HashiCorp = "HashiCorp"
+NDArray = "NDArray"
+K_Scatch = "K_Scatch"
+MCAGA1UECgwZQW1hem9uIFdlYiBTZXJ2aWNlcywgSW5jLjETMBEGA1UECwwKQW1h = "MCAGA1UECgwZQW1hem9uIFdlYiBTZXJ2aWNlcywgSW5jLjETMBEGA1UECwwKQW1h"
+VQQGEwJVUzEQMA4GA1UEBwwHU2VhdHRsZTETMBEGA1UECAwKV2FzaGluZ3RvbjEi = "VQQGEwJVUzEQMA4GA1UEBwwHU2VhdHRsZTETMBEGA1UECAwKV2FzaGluZ3RvbjEi"
+MDEyOk9yZ2FuaXphdGlvbjg4Njc2OTU1 = "MDEyOk9yZ2FuaXphdGlvbjg4Njc2OTU1"

 [default.extend-words]
-# aks = "aks"
-GOES = "GOES"
-lenght = "lenght"
-preprocesser = "preprocesser"
-Preprocesser = "Preprocesser"
-Implicitly = "Implicitly"
-fo = "fo"
-mapp = "mapp"
-polution = "polution"
-magent = "magent"
+# Don't correct the surname "Teh"
+aks = "aks"
+hashi = "hashi"
+womens = "womens"
+prepend = "prepend"
+prepended = "prepended"
+goes = "goes"
+bare = "bare"
+prepending = "prepending"
+prev = "prev"
+creat = "creat"
+ret = "ret"
+daa = "daa"
+arange = "arange"
+cachable = "cachable"
+OT = "OT"
+cll = "cll"

 [default]
 locale = "en-us"
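Two things worth noting about this configuration: in the typos checker, mapping a word or identifier to itself (for example aks = "aks") is the documented way to whitelist that spelling rather than correct it, and the extend-exclude patterns now skip all JSON, JavaScript, and notebook files wholesale instead of enumerating individual project paths.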

llm-complete-guide/pipelines/generate_chunk_questions.py

Lines changed: 2 additions & 2 deletions
@@ -19,10 +19,10 @@
 from zenml.client import Client


-@pipeline
+@pipeline(enable_cache=False)
 def generate_chunk_questions():
     """Pipeline to generate questions from chunks."""
-    local_setting = ExternalArtifact(value=True)
+    local_setting = ExternalArtifact(value=False)
     client = Client()
     docs_with_embeddings = client.get_artifact_version(
         name_id_or_prefix="documents_with_embeddings"
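For context, here is a minimal, self-contained sketch of the two ZenML features this hunk touches: pipeline-level cache control and ExternalArtifact. The step and pipeline names below are hypothetical; only the decorator argument and the wrapped value mirror the change above, and it assumes a recent ZenML release where both names are importable from zenml.

from zenml import ExternalArtifact, pipeline, step


@step
def consume_setting(local: bool) -> None:
    # The step receives the wrapped value as a regular artifact input.
    print(f"local setting: {local}")


# enable_cache=False forces every step in this pipeline to re-run
# instead of reusing previously cached outputs.
@pipeline(enable_cache=False)
def demo_pipeline() -> None:
    # ExternalArtifact wraps a literal value so it is passed to the step
    # as an artifact rather than a plain Python parameter.
    consume_setting(ExternalArtifact(value=False))


if __name__ == "__main__":
    demo_pipeline()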

llm-complete-guide/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ tiktoken
 umap-learn
 matplotlib
 pyarrow
-rerankers[all]
+rerankers[flashrank]
 datasets

 # optional requirements for S3 artifact store
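Pinning the extra to rerankers[flashrank] pulls in only the FlashRank backend instead of every optional reranking dependency. As a rough sketch of how that backend is typically selected (based on the rerankers project's documented usage, not on code in this repository; the query and documents are made up):

from rerankers import Reranker

# "flashrank" asks the library for its default FlashRank cross-encoder,
# the only backend installed by the rerankers[flashrank] extra.
ranker = Reranker("flashrank")
results = ranker.rank(
    query="How do I populate the vector index?",
    docs=[
        "populate_index.py chunks documents and stores embeddings in pgvector.",
        "finetune_embeddings.py trains a sentence-transformers model.",
    ],
)
print(results)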

llm-complete-guide/run.py

Lines changed: 12 additions & 0 deletions
@@ -42,6 +42,7 @@
 from materializers.document_materializer import DocumentMaterializer
 from pipelines import (
     finetune_embeddings,
+    generate_chunk_questions,
     generate_synthetic_data,
     llm_basic_rag,
     llm_eval,
@@ -145,6 +146,13 @@
     default=False,
     help="Whether to use the reranker.",
 )
+@click.option(
+    "--chunks",
+    "chunks",
+    is_flag=True,
+    default=False,
+    help="Generate chunks for Hugging Face dataset",
+)
 def main(
     rag: bool = False,
     evaluation: bool = False,
@@ -157,6 +165,7 @@ def main(
     dummyembeddings: bool = False,
     argilla: bool = False,
     reranked: bool = False,
+    chunks: bool = False,
 ):
     """Main entry point for the pipeline execution.

@@ -170,6 +179,7 @@ def main(
         local (bool): If `True`, the local LLM via Ollama will be used.
         embeddings (bool): If `True`, the embeddings will be fine-tuned.
         argilla (bool): If `True`, the Argilla annotations will be used.
+        chunks (bool): If `True`, the chunks pipeline will be run.
     """
     pipeline_args = {"enable_cache": not no_cache}
     embeddings_finetune_args = {
@@ -201,6 +211,8 @@ def main(
         finetune_embeddings.with_options(**embeddings_finetune_args)()
     if dummyembeddings:
         chunking_experiment.with_options(**pipeline_args)()
+    if chunks:
+        generate_chunk_questions.with_options(**pipeline_args)()


 if __name__ == "__main__":
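With the new flag wired up, the question-generation pipeline can be launched directly from the CLI, e.g. python run.py --chunks, and it picks up the same pipeline_args (caching behaviour included) as the other pipelines dispatched from main.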

llm-complete-guide/steps/eval_retrieval.py

Lines changed: 1 addition & 1 deletion
@@ -198,7 +198,7 @@ def perform_retrieval_evaluation(

         if all(url_ending not in url for url in urls):
             logging.error(
-                f"Failed for question: {question}. Expected URL ending: {url_ending}. Got: {urls}"
+                f"Failed for question: {question}. Expected URL containing: {url_ending}. Got: {urls}"
             )
             failures += 1


llm-complete-guide/steps/finetune_embeddings.py

Lines changed: 16 additions & 2 deletions
@@ -373,7 +373,14 @@ def visualize_results(
         color="red",
     )
     for i, v in enumerate(finetuned_values):
-        ax.text(v - 1.5, i - height / 2, f"{v:.1f}", va="center", ha="right", color="white")
+        ax.text(
+            v - 1.5,
+            i - height / 2,
+            f"{v:.1f}",
+            va="center",
+            ha="right",
+            color="white",
+        )
     ax.barh(
         [i + height / 2 for i in y],
         base_values,
@@ -382,7 +389,14 @@
         color="blue",
     )
     for i, v in enumerate(base_values):
-        ax.text(v - 1.5, i + height / 2, f"{v:.1f}", va="center", ha="right", color="white")
+        ax.text(
+            v - 1.5,
+            i + height / 2,
+            f"{v:.1f}",
+            va="center",
+            ha="right",
+            color="white",
+        )

     ax.set_xlabel("Scores (%)")
     ax.set_title("Evaluation Results")

llm-complete-guide/steps/hf_dataset_loader.py

Lines changed: 0 additions & 3 deletions
@@ -29,6 +29,3 @@ def load_hf_dataset() -> (
     train_dataset = load_dataset(DATASET_NAME_DEFAULT, split="train")
     test_dataset = load_dataset(DATASET_NAME_DEFAULT, split="test")
     return train_dataset, test_dataset
-
-
-load_hf_dataset()
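This stray module-level call (the "extra step invocation" from the commit message) meant the Hugging Face datasets were fetched as a side effect of merely importing the steps module; with it removed, load_hf_dataset only runs when a pipeline actually invokes the step.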

llm-complete-guide/steps/populate_index.py

Lines changed: 39 additions & 22 deletions
@@ -19,9 +19,10 @@
 # https://www.timescale.com/blog/postgresql-as-a-vector-database-create-store-and-query-openai-embeddings-with-pgvector/
 # for providing the base implementation for this indexing functionality

+import json
 import logging
 import math
-from typing import Annotated, List
+from typing import Annotated

 from constants import (
     CHUNK_OVERLAP,
@@ -41,16 +42,16 @@

 @step
 def preprocess_documents(
-    documents: List[Document],
-) -> Annotated[List[Document], ArtifactConfig(name="split_chunks")]:
+    documents: str,
+) -> Annotated[str, ArtifactConfig(name="split_chunks")]:
     """
-    Preprocesses a list of documents by splitting them into chunks.
+    Preprocesses a JSON string of documents by splitting them into chunks.

     Args:
-        documents (List[Document]): A list of documents to be preprocessed.
+        documents (str): A JSON string containing a list of documents to be preprocessed.

     Returns:
-        Annotated[List[Document], ArtifactConfig(name="split_chunks")]: A list of preprocessed documents annotated with an ArtifactConfig.
+        Annotated[str, ArtifactConfig(name="split_chunks")]: A JSON string containing a list of preprocessed documents annotated with an ArtifactConfig.

     Raises:
         Exception: If an error occurs during preprocessing.
@@ -64,29 +65,34 @@ def preprocess_documents(
             },
         )

+        # Parse the JSON string into a list of Document objects
+        document_list = [Document(**doc) for doc in json.loads(documents)]
+
         split_docs = split_documents(
-            documents, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
+            document_list, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
         )
-        return split_docs
+
+        # Convert the list of Document objects back to a JSON string
+        split_docs_json = json.dumps([doc.__dict__ for doc in split_docs])
+
+        return split_docs_json
     except Exception as e:
         logger.error(f"Error in preprocess_documents: {e}")
         raise


 @step
 def generate_embeddings(
-    split_documents: List[Document],
-) -> Annotated[
-    List[Document], ArtifactConfig(name="documents_with_embeddings")
-]:
+    split_documents: str,
+) -> Annotated[str, ArtifactConfig(name="documents_with_embeddings")]:
     """
     Generates embeddings for a list of split documents using a SentenceTransformer model.

     Args:
         split_documents (List[Document]): A list of Document objects that have been split into chunks.

     Returns:
-        Annotated[List[Document], ArtifactConfig(name="embeddings")]: The list of Document objects with generated embeddings, annotated with an ArtifactConfig.
+        Annotated[str, ArtifactConfig(name="documents_with_embeddings")]: A JSON string containing the Document objects with generated embeddings, annotated with an ArtifactConfig.

     Raises:
         Exception: If an error occurs during the generation of embeddings.
@@ -95,28 +101,36 @@ def generate_embeddings(
         model = SentenceTransformer(EMBEDDINGS_MODEL)

         log_artifact_metadata(
-            artifact_name="embeddings",
+            artifact_name="documents_with_embeddings",
             metadata={
                 "embedding_type": EMBEDDINGS_MODEL,
                 "embedding_dimensionality": EMBEDDING_DIMENSIONALITY,
             },
         )

-        document_texts = [doc.page_content for doc in split_documents]
+        # Parse the JSON string into a list of Document objects
+        document_list = [
+            Document(**doc) for doc in json.loads(split_documents)
+        ]
+
+        document_texts = [doc.page_content for doc in document_list]
         embeddings = model.encode(document_texts)

-        for doc, embedding in zip(split_documents, embeddings):
-            doc.embedding = embedding
+        for doc, embedding in zip(document_list, embeddings):
+            doc.embedding = embedding.tolist()
+
+        # Convert the list of Document objects to a JSON string
+        documents_json = json.dumps([doc.__dict__ for doc in document_list])

-        return split_documents
+        return documents_json
     except Exception as e:
         logger.error(f"Error in generate_embeddings: {e}")
         raise


 @step
 def index_generator(
-    documents: List[Document],
+    documents: str,
 ) -> None:
     """Generates an index for the given documents.

@@ -126,7 +140,7 @@ def index_generator(
     using the cosine distance measure.

     Args:
-        documents (List[Document]): The list of Document objects with generated embeddings.
+        documents (str): A JSON string containing the Document objects with generated embeddings.

     Raises:
         Exception: If an error occurs during the index generation.
@@ -155,11 +169,14 @@ def index_generator(

         register_vector(conn)

+        # Parse the JSON string into a list of Document objects
+        document_list = [Document(**doc) for doc in json.loads(documents)]
+
         # Insert data only if it doesn't already exist
-        for doc in documents:
+        for doc in document_list:
             content = doc.page_content
             token_count = doc.token_count
-            embedding = doc.embedding.tolist()
+            embedding = doc.embedding
             filename = doc.filename
             parent_section = doc.parent_section
             url = doc.url
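The thread running through all three steps is that Document objects now cross step boundaries as JSON strings rather than Python lists, with embeddings kept as plain float lists (.tolist()) so they stay JSON-serializable. A minimal sketch of that round trip, using a hypothetical stand-in dataclass whose fields mirror the ones the diff touches:

import json
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Document:
    # Hypothetical stand-in for the project's Document class.
    page_content: str
    filename: str = ""
    parent_section: str = ""
    url: str = ""
    token_count: int = 0
    embedding: Optional[List[float]] = None


docs = [Document(page_content="ZenML steps exchange these as JSON.")]

# Producer side: __dict__ yields a JSON-safe dict as long as embeddings
# are plain lists of floats (hence embedding.tolist() in the step above).
payload = json.dumps([doc.__dict__ for doc in docs])

# Consumer side: rebuild the objects from the parsed JSON.
restored = [Document(**item) for item in json.loads(payload)]
assert restored[0].page_content == docs[0].page_content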
