Skip to content

Commit e7d66ed

Browse files
committed
Further small changes
1 parent ebe3621 commit e7d66ed

File tree

6 files changed

+28
-8
lines changed

6 files changed

+28
-8
lines changed

llm-complete-guide/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ environment and install the dependencies using the following command:
4343
pip install -r requirements.txt
4444
```
4545

46+
If the install fails on flash-attn, skip its CUDA build: FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn --no-build-isolation
47+
4648
In order to use the default LLM for this query, you'll need an account and an
4749
API key from OpenAI specified as another environment variable:
4850

llm-complete-guide/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ tiktoken
1717
umap-learn
1818
matplotlib
1919
pyarrow
20-
rerankers[all]
20+
rerankers[flashrank]
2121
datasets
2222
torch
2323

llm-complete-guide/run.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
from materializers.document_materializer import DocumentMaterializer
4343
from pipelines import (
4444
finetune_embeddings,
45+
generate_synthetic_data,
4546
llm_basic_rag,
4647
llm_eval,
4748
)
@@ -102,6 +103,13 @@
102103
default=False,
103104
help="Disable cache.",
104105
)
106+
@click.option(
107+
"--synthetic",
108+
"synthetic",
109+
is_flag=True,
110+
default=False,
111+
help="Run the synthetic data pipeline.",
112+
)
105113
@click.option(
106114
"--local",
107115
"local",
@@ -143,6 +151,7 @@ def main(
143151
query: Optional[str] = None,
144152
model: str = OPENAI_MODEL,
145153
no_cache: bool = False,
154+
synthetic: bool = False,
146155
local: bool = False,
147156
embeddings: bool = False,
148157
dummyembeddings: bool = False,
@@ -157,6 +166,7 @@ def main(
157166
query (Optional[str]): If provided, the RAG model will be queried with this string.
158167
model (str): The model to use for the completion. Default is OPENAI_MODEL.
159168
no_cache (bool): If `True`, cache will be disabled.
169+
synthetic (bool): If `True`, the synthetic data pipeline will be run.
160170
local (bool): If `True`, the local LLM via Ollama will be used.
161171
dummyembeddings (bool): If `True`, dummyembeddings will be used
162172
embeddings (bool): If `True`, the embeddings will be fine-tuned.
@@ -188,6 +198,8 @@ def main(
188198
llm_basic_rag.with_options(**pipeline_args)()
189199
if evaluation:
190200
llm_eval.with_options(**pipeline_args)()
201+
if synthetic:
202+
generate_synthetic_data.with_options(**pipeline_args)()
191203
if embeddings:
192204
finetune_embeddings.with_options(**embeddings_finetune_args)()
193205
if dummyembeddings:

llm-complete-guide/steps/distilabel_generate_queries.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
import os
1616
from typing import Annotated, Tuple
1717

18-
import distilabel
1918
from constants import (
2019
DATASET_NAME_DEFAULT,
2120
OPENAI_MODEL_GEN,
@@ -25,6 +24,7 @@
2524
from distilabel.llms import OpenAILLM
2625
from distilabel.steps import LoadDataFromHub
2726
from distilabel.steps.tasks import GenerateSentencePair
27+
from distilabel.pipeline import Pipeline
2828
from zenml import step
2929

3030
synthetic_generation_context = """
@@ -45,7 +45,7 @@ def generate_synthetic_queries(
4545
model=OPENAI_MODEL_GEN, api_key=os.getenv("OPENAI_API_KEY")
4646
)
4747

48-
with distilabel.pipeline.Pipeline(
48+
with Pipeline(
4949
name="generate_embedding_queries"
5050
) as pipeline:
5151
load_dataset = LoadDataFromHub(
@@ -74,7 +74,7 @@ def generate_synthetic_queries(
7474
}
7575
},
7676
},
77-
# use_cache=False, # comment out for demo
77+
use_cache=False,  # caching disabled for the demo
7878
)
7979

8080
test_distiset = pipeline.run(
@@ -89,7 +89,7 @@ def generate_synthetic_queries(
8989
}
9090
},
9191
},
92-
# use_cache=False, # comment out for demo
92+
use_cache=False,  # caching disabled for the demo
9393
)
9494

9595
train_dataset = train_distiset["default"]["train"]

llm-complete-guide/steps/eval_retrieval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def perform_retrieval_evaluation(
198198

199199
if all(url_ending not in url for url in urls):
200200
logging.error(
201-
f"Failed for question: {question}. Expected URL ending: {url_ending}. Got: {urls}"
201+
f"Failed for question: {question}. Expected URL containing: {url_ending}. Got: {urls}"
202202
)
203203
failures += 1
204204

llm-complete-guide/steps/url_scraping_utils.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,10 @@ def is_valid_url(url: str, base: str) -> bool:
4848
return not re.search(version_pattern, url)
4949

5050

51+
def strip_query_params(url):
52+
return url.split('?')[0]
53+
54+
5155
def get_all_pages(url: str) -> List[str]:
5256
"""
5357
Retrieve all pages with the same base as the given URL.
@@ -61,9 +65,11 @@ def get_all_pages(url: str) -> List[str]:
6165
logger.info(f"Scraping all pages from {url}...")
6266
base_url = urlparse(url).netloc
6367
pages = crawl(url, base_url)
64-
logger.info(f"Found {len(pages)} pages.")
68+
stripped_pages = [strip_query_params(page) for page in pages]
69+
70+
logger.info(f"Found {len(stripped_pages)} pages.")
6571
logger.info("Done scraping pages.")
66-
return list(pages)
72+
return list(stripped_pages)
6773

6874

6975
def crawl(url: str, base: str, visited: Set[str] = None) -> Set[str]:

0 commit comments

Comments
 (0)