RAG working again

AlexejPenner · AlexejPenner · commit ebe36210c250 · 2024-10-22T08:19:01.000+02:00
diff --git a/llm-complete-guide/requirements-argilla.txt b/llm-complete-guide/requirements-argilla.txt
@@ -5,5 +5,5 @@ litellm
 ollama
 polars
 datasets
-git+https://github.com/argilla-io/argilla.git@releases/2.0.1#subdirectory=argilla # replace once released
+argilla
 distilabel
diff --git a/llm-complete-guide/requirements.txt b/llm-complete-guide/requirements.txt
@@ -1,4 +1,4 @@
-zenml[server]>=0.63.0
+zenml[server]>=0.67.0
 langchain-community
 ratelimit
 langchain>=0.0.325
@@ -19,6 +19,7 @@ matplotlib
 pyarrow
 rerankers[all]
 datasets
+torch
 
 # optional requirements for S3 artifact store
 # s3fs>2022.3.0
diff --git a/llm-complete-guide/run.py b/llm-complete-guide/run.py
@@ -42,7 +42,6 @@
 from materializers.document_materializer import DocumentMaterializer
 from pipelines import (
     finetune_embeddings,
-    generate_synthetic_data,
     llm_basic_rag,
     llm_eval,
 )
@@ -103,13 +102,6 @@
     default=False,
     help="Disable cache.",
 )
-@click.option(
-    "--synthetic",
-    "synthetic",
-    is_flag=True,
-    default=False,
-    help="Run the synthetic data pipeline.",
-)
 @click.option(
     "--local",
     "local",
@@ -151,7 +143,6 @@ def main(
     query: Optional[str] = None,
     model: str = OPENAI_MODEL,
     no_cache: bool = False,
-    synthetic: bool = False,
     local: bool = False,
     embeddings: bool = False,
     dummyembeddings: bool = False,
@@ -166,10 +157,11 @@ def main(
         query (Optional[str]): If provided, the RAG model will be queried with this string.
         model (str): The model to use for the completion. Default is OPENAI_MODEL.
         no_cache (bool): If `True`, cache will be disabled.
-        synthetic (bool): If `True`, the synthetic data pipeline will be run.
         local (bool): If `True`, the local LLM via Ollama will be used.
+        dummyembeddings (bool): If `True`, dummyembeddings will be used.
         embeddings (bool): If `True`, the embeddings will be fine-tuned.
         argilla (bool): If `True`, the Argilla annotations will be used.
+        reranked (bool): If `True`, rerankers will be used.
     """
     pipeline_args = {"enable_cache": not no_cache}
     embeddings_finetune_args = {
@@ -191,12 +183,11 @@ def main(
         md = Markdown(response)
         console.print(md)
 
+    print(f"Running Pipeline with pipeline args: {pipeline_args}")
     if rag:
         llm_basic_rag.with_options(**pipeline_args)()
     if evaluation:
         llm_eval.with_options(**pipeline_args)()
-    if synthetic:
-        generate_synthetic_data.with_options(**pipeline_args)()
     if embeddings:
         finetune_embeddings.with_options(**embeddings_finetune_args)()
     if dummyembeddings:
diff --git a/llm-complete-guide/steps/hf_dataset_loader.py b/llm-complete-guide/steps/hf_dataset_loader.py
@@ -29,6 +29,3 @@ def load_hf_dataset() -> (
     train_dataset = load_dataset(DATASET_NAME_DEFAULT, split="train")
     test_dataset = load_dataset(DATASET_NAME_DEFAULT, split="test")
     return train_dataset, test_dataset
-
-
-load_hf_dataset()
diff --git a/llm-complete-guide/steps/populate_index.py b/llm-complete-guide/steps/populate_index.py
@@ -95,7 +95,7 @@ def generate_embeddings(
         model = SentenceTransformer(EMBEDDINGS_MODEL)
 
         log_artifact_metadata(
-            artifact_name="embeddings",
+            artifact_name="documents_with_embeddings",
             metadata={
                 "embedding_type": EMBEDDINGS_MODEL,
                 "embedding_dimensionality": EMBEDDING_DIMENSIONALITY,
diff --git a/llm-complete-guide/steps/url_scraper.py b/llm-complete-guide/steps/url_scraper.py
@@ -20,7 +20,7 @@
 from steps.url_scraping_utils import get_all_pages
 
 
-@step
+@step(enable_cache=True)
 def url_scraper(
     docs_url: str = "https://docs.zenml.io",
     repo_url: str = "https://github.com/zenml-io/zenml",
@@ -31,7 +31,6 @@ def url_scraper(
     Args:
         docs_url: URL to the documentation.
         repo_url: URL to the repository.
-        release_notes_url: URL to the release notes.
         website_url: URL to the website.
 
     Returns: