Commit abd4fe3

fixes and updates
1 parent e3dd869 commit abd4fe3

9 files changed: +115 -44 lines changed


llm-complete-guide/run.py

Lines changed: 12 additions & 0 deletions
@@ -42,6 +42,7 @@
 from materializers.document_materializer import DocumentMaterializer
 from pipelines import (
     finetune_embeddings,
+    generate_chunk_questions,
     generate_synthetic_data,
     llm_basic_rag,
     llm_eval,
@@ -145,6 +146,13 @@
     default=False,
     help="Whether to use the reranker.",
 )
+@click.option(
+    "--chunks",
+    "chunks",
+    is_flag=True,
+    default=False,
+    help="Generate chunks for Hugging Face dataset",
+)
 def main(
     rag: bool = False,
     evaluation: bool = False,
@@ -157,6 +165,7 @@ def main(
     dummyembeddings: bool = False,
     argilla: bool = False,
     reranked: bool = False,
+    chunks: bool = False,
 ):
     """Main entry point for the pipeline execution.

@@ -170,6 +179,7 @@ def main(
         local (bool): If `True`, the local LLM via Ollama will be used.
         embeddings (bool): If `True`, the embeddings will be fine-tuned.
         argilla (bool): If `True`, the Argilla annotations will be used.
+        chunks (bool): If `True`, the chunks pipeline will be run.
     """
     pipeline_args = {"enable_cache": not no_cache}
     embeddings_finetune_args = {
@@ -201,6 +211,8 @@
         finetune_embeddings.with_options(**embeddings_finetune_args)()
     if dummyembeddings:
         chunking_experiment.with_options(**pipeline_args)()
+    if chunks:
+        generate_chunk_questions.with_options(**pipeline_args)()


 if __name__ == "__main__":
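
The new flag plugs into the existing Click CLI, so the chunk-question pipeline is opt-in. A minimal sketch (not the project's run.py) of how the flag behaves, with the pipeline call stubbed out so it runs standalone; `python sketch.py --chunks` triggers the branch, omitting the flag leaves it False:

import click

@click.command()
@click.option(
    "--chunks",
    "chunks",
    is_flag=True,
    default=False,
    help="Generate chunks for Hugging Face dataset",
)
def main(chunks: bool = False) -> None:
    if chunks:
        # run.py calls generate_chunk_questions.with_options(**pipeline_args)() here
        print("running the chunk-question generation pipeline")

if __name__ == "__main__":
    main()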

llm-complete-guide/steps/eval_retrieval.py

Lines changed: 1 addition & 1 deletion
@@ -198,7 +198,7 @@ def perform_retrieval_evaluation(

         if all(url_ending not in url for url in urls):
             logging.error(
-                f"Failed for question: {question}. Expected URL ending: {url_ending}. Got: {urls}"
+                f"Failed for question: {question}. Expected URL containing: {url_ending}. Got: {urls}"
             )
             failures += 1
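
The reworded log message matches what the check actually does: `url_ending not in url` is a substring test, not a suffix match. A tiny illustration with made-up values:

urls = ["https://docs.zenml.io/how-to/setup?view=full"]  # made-up retrieval result
url_ending = "how-to/setup"

# True would mean the expected URL was not found anywhere in the results -> a failure.
print(all(url_ending not in url for url in urls))  # False: a result contains the fragment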

llm-complete-guide/steps/finetune_embeddings.py

Lines changed: 16 additions & 2 deletions
@@ -373,7 +373,14 @@ def visualize_results(
         color="red",
     )
     for i, v in enumerate(finetuned_values):
-        ax.text(v - 1.5, i - height / 2, f"{v:.1f}", va="center", ha="right", color="white")
+        ax.text(
+            v - 1.5,
+            i - height / 2,
+            f"{v:.1f}",
+            va="center",
+            ha="right",
+            color="white",
+        )
     ax.barh(
         [i + height / 2 for i in y],
         base_values,
@@ -382,7 +389,14 @@
         color="blue",
     )
     for i, v in enumerate(base_values):
-        ax.text(v - 1.5, i + height / 2, f"{v:.1f}", va="center", ha="right", color="white")
+        ax.text(
+            v - 1.5,
+            i + height / 2,
+            f"{v:.1f}",
+            va="center",
+            ha="right",
+            color="white",
+        )

     ax.set_xlabel("Scores (%)")
     ax.set_title("Evaluation Results")
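
The change here is purely formatting (the one-line ax.text calls are wrapped), so behaviour is unchanged. For context, a self-contained sketch of the same grouped-bar labelling pattern; the metric names and scores below are invented, not results from the project:

import matplotlib
matplotlib.use("Agg")  # render off-screen
import matplotlib.pyplot as plt

metrics = ["accuracy@1", "accuracy@5", "accuracy@10"]  # illustrative metric names
finetuned_values = [72.0, 88.0, 93.0]                  # invented scores
base_values = [55.0, 79.0, 86.0]                       # invented scores
y = range(len(metrics))
height = 0.4

fig, ax = plt.subplots(figsize=(8, 4))
ax.barh([i - height / 2 for i in y], finetuned_values, height, color="red", label="finetuned")
for i, v in enumerate(finetuned_values):
    ax.text(v - 1.5, i - height / 2, f"{v:.1f}", va="center", ha="right", color="white")
ax.barh([i + height / 2 for i in y], base_values, height, color="blue", label="base")
for i, v in enumerate(base_values):
    ax.text(v - 1.5, i + height / 2, f"{v:.1f}", va="center", ha="right", color="white")
ax.set_yticks(list(y))
ax.set_yticklabels(metrics)
ax.set_xlabel("Scores (%)")
ax.set_title("Evaluation Results")
ax.legend()
fig.savefig("evaluation_results.png")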

llm-complete-guide/steps/hf_dataset_loader.py

Lines changed: 0 additions & 1 deletion
@@ -29,4 +29,3 @@ def load_hf_dataset() -> (
     train_dataset = load_dataset(DATASET_NAME_DEFAULT, split="train")
     test_dataset = load_dataset(DATASET_NAME_DEFAULT, split="test")
     return train_dataset, test_dataset
-

llm-complete-guide/steps/populate_index.py

Lines changed: 32 additions & 18 deletions
@@ -19,9 +19,10 @@
 # https://www.timescale.com/blog/postgresql-as-a-vector-database-create-store-and-query-openai-embeddings-with-pgvector/
 # for providing the base implementation for this indexing functionality

+import json
 import logging
 import math
-from typing import Annotated, List
+from typing import Annotated

 from constants import (
     CHUNK_OVERLAP,
@@ -41,16 +42,16 @@

 @step
 def preprocess_documents(
-    documents: List[Document],
-) -> Annotated[List[Document], ArtifactConfig(name="split_chunks")]:
+    documents: str,
+) -> Annotated[str, ArtifactConfig(name="split_chunks")]:
     """
-    Preprocesses a list of documents by splitting them into chunks.
+    Preprocesses a JSON string of documents by splitting them into chunks.

     Args:
-        documents (List[Document]): A list of documents to be preprocessed.
+        documents (str): A JSON string containing a list of documents to be preprocessed.

     Returns:
-        Annotated[List[Document], ArtifactConfig(name="split_chunks")]: A list of preprocessed documents annotated with an ArtifactConfig.
+        Annotated[str, ArtifactConfig(name="split_chunks")]: A JSON string containing a list of preprocessed documents annotated with an ArtifactConfig.

     Raises:
         Exception: If an error occurs during preprocessing.
@@ -64,29 +65,34 @@ def preprocess_documents(
             },
         )

+        # Parse the JSON string into a list of Document objects
+        document_list = [Document(**doc) for doc in json.loads(documents)]
+
         split_docs = split_documents(
-            documents, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
+            document_list, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
         )
-        return split_docs
+
+        # Convert the list of Document objects back to a JSON string
+        split_docs_json = json.dumps([doc.__dict__ for doc in split_docs])
+
+        return split_docs_json
     except Exception as e:
         logger.error(f"Error in preprocess_documents: {e}")
         raise


 @step
 def generate_embeddings(
-    split_documents: List[Document],
-) -> Annotated[
-    List[Document], ArtifactConfig(name="documents_with_embeddings")
-]:
+    split_documents: str,
+) -> Annotated[str, ArtifactConfig(name="documents_with_embeddings")]:
     """
     Generates embeddings for a list of split documents using a SentenceTransformer model.

     Args:
         split_documents (List[Document]): A list of Document objects that have been split into chunks.

     Returns:
-        Annotated[List[Document], ArtifactConfig(name="embeddings")]: The list of Document objects with generated embeddings, annotated with an ArtifactConfig.
+        Annotated[str, ArtifactConfig(name="documents_with_embeddings")]: A JSON string containing the Document objects with generated embeddings, annotated with an ArtifactConfig.

     Raises:
         Exception: If an error occurs during the generation of embeddings.
@@ -95,7 +101,7 @@ def generate_embeddings(
         model = SentenceTransformer(EMBEDDINGS_MODEL)

         log_artifact_metadata(
-            artifact_name="embeddings",
+            artifact_name="documents_with_embeddings",
             metadata={
                 "embedding_type": EMBEDDINGS_MODEL,
                 "embedding_dimensionality": EMBEDDING_DIMENSIONALITY,
@@ -106,17 +112,22 @@
         embeddings = model.encode(document_texts)

         for doc, embedding in zip(split_documents, embeddings):
-            doc.embedding = embedding
+            doc.embedding = (
+                embedding.tolist()
+            )  # Convert numpy array to list for JSON serialization

-        return split_documents
+        # Convert the list of Document objects to a JSON string
+        documents_json = json.dumps([doc.__dict__ for doc in split_documents])
+
+        return documents_json
     except Exception as e:
         logger.error(f"Error in generate_embeddings: {e}")
         raise


 @step
 def index_generator(
-    documents: List[Document],
+    documents: str,
 ) -> None:
     """Generates an index for the given documents.

@@ -126,7 +137,7 @@ def index_generator(
     using the cosine distance measure.

     Args:
-        documents (List[Document]): The list of Document objects with generated embeddings.
+        documents (str): A JSON string containing the Document objects with generated embeddings.

     Raises:
         Exception: If an error occurs during the index generation.
@@ -155,6 +166,9 @@

         register_vector(conn)

+        # load the documents from the JSON string
+        documents = json.loads(documents)
+
         # Insert data only if it doesn't already exist
         for doc in documents:
             content = doc.page_content
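
The step signatures switch from List[Document] to plain JSON strings, presumably so the artifacts serialize without a custom materializer. A minimal sketch of the round-trip used above, assuming Document is roughly a simple container with page_content and embedding attributes as the diff implies (the stand-in dataclass below is not the project's structures.Document):

import json
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Document:  # stand-in for structures.Document
    page_content: str
    embedding: Optional[List[float]] = None

docs = [Document(page_content="ZenML steps are plain Python functions.")]
docs[0].embedding = [0.1, 0.2, 0.3]  # a real embedding comes from SentenceTransformer.encode

# Producer side: numpy arrays are converted with .tolist(), then the documents
# are dumped to a JSON string via their __dict__.
payload = json.dumps([doc.__dict__ for doc in docs])

# Consumer side: json.loads yields plain dicts, so the objects are rebuilt
# before attribute access, mirroring preprocess_documents above.
restored = [Document(**d) for d in json.loads(payload)]
print(restored[0].page_content, restored[0].embedding)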

llm-complete-guide/steps/synthetic_data.py

Lines changed: 9 additions & 6 deletions
@@ -14,14 +14,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List
+from typing import List, Annotated

 import pandas as pd
 from datasets import Dataset
 from huggingface_hub import create_repo
 from litellm import completion
 from structures import Document
-from zenml import step
+from zenml import step, ArtifactConfig
 from zenml.client import Client

 LOCAL_MODEL = "ollama/mixtral"
@@ -36,7 +36,7 @@ def generate_question(chunk: str, local: bool = False) -> str:
     Returns:
         Generated question.
     """
-    model = LOCAL_MODEL if local else "gpt-3.5-turbo"
+    model = LOCAL_MODEL if local else "gpt-4o"
     response = completion(
         model=model,
         messages=[
@@ -54,16 +54,19 @@ def generate_question(chunk: str, local: bool = False) -> str:
 def generate_questions_from_chunks(
     docs_with_embeddings: List[Document],
     local: bool = False,
-) -> List[Document]:
+) -> Annotated[str, ArtifactConfig(name="synthetic_questions")]:
     """Generate questions from chunks.

     Args:
-        docs_with_embeddings: List of documents with embeddings.
         local: Whether to run the pipeline with a local LLM.

     Returns:
-        List of documents with generated questions added.
+        JSON string containing a list of documents with generated questions added.
     """
+    client = Client()
+    docs_with_embeddings = client.get_artifact_version(
+        name_id_or_prefix="documents_with_embeddings"
+    ).load()
     for doc in docs_with_embeddings:
         doc.generated_questions = [generate_question(doc.page_content, local)]
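
Rather than receiving the documents as a step input, the step now pulls the upstream artifact by name through the ZenML client. A sketch of that lookup, assuming an initialized ZenML client, that the artifact exists, and that it holds the JSON string written by generate_embeddings; the question call is left as a comment since generate_question lives in this module:

import json
from zenml.client import Client

client = Client()
docs_json = client.get_artifact_version(
    name_id_or_prefix="documents_with_embeddings"
).load()

# If the artifact is the JSON string from generate_embeddings, json.loads
# returns plain dicts, so fields are read by key rather than attribute.
for doc in json.loads(docs_json):
    chunk = doc["page_content"]
    # generate_question(chunk, local) would be called here for each chunk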

llm-complete-guide/steps/url_scraper.py

Lines changed: 7 additions & 6 deletions
@@ -12,10 +12,11 @@
 # or implied. See the License for the specific language governing
 # permissions and limitations under the License.

-from typing import List
+
+import json

 from typing_extensions import Annotated
-from zenml import log_artifact_metadata, step
+from zenml import ArtifactConfig, log_artifact_metadata, step

 from steps.url_scraping_utils import get_all_pages

@@ -25,17 +26,16 @@ def url_scraper(
     docs_url: str = "https://docs.zenml.io",
     repo_url: str = "https://github.com/zenml-io/zenml",
     website_url: str = "https://zenml.io",
-) -> Annotated[List[str], "urls"]:
+) -> Annotated[str, ArtifactConfig(name="urls")]:
     """Generates a list of relevant URLs to scrape.

     Args:
         docs_url: URL to the documentation.
         repo_url: URL to the repository.
-        release_notes_url: URL to the release notes.
         website_url: URL to the website.

     Returns:
-        List of URLs to scrape.
+        JSON string containing a list of URLs to scrape.
     """
     # We comment this out to make this pipeline faster
     # examples_readme_urls = get_nested_readme_urls(repo_url)
@@ -44,8 +44,9 @@ def url_scraper(
     # all_urls = docs_urls + website_urls + examples_readme_urls
     all_urls = docs_urls
     log_artifact_metadata(
+        artifact_name="urls",
         metadata={
             "count": len(all_urls),
         },
     )
-    return all_urls
+    return json.dumps(all_urls)
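
Returning json.dumps(all_urls) makes the "urls" artifact a plain string, and the explicit artifact_name points the logged metadata at that renamed output. A small illustration of the round-trip with made-up URLs:

import json

all_urls = ["https://docs.zenml.io", "https://docs.zenml.io/getting-started"]  # made-up list
payload = json.dumps(all_urls)          # what the step now returns
assert json.loads(payload) == all_urls  # downstream steps recover the original list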

llm-complete-guide/steps/url_scraping_utils.py

Lines changed: 29 additions & 3 deletions
@@ -48,6 +48,18 @@ def is_valid_url(url: str, base: str) -> bool:
     return not re.search(version_pattern, url)


+def strip_query_params(url: str) -> str:
+    """Strip query parameters from a URL.
+
+    Args:
+        url (str): The URL to strip query parameters from.
+
+    Returns:
+        str: The URL without query parameters.
+    """
+    return url.split("?")[0]
+
+
 def get_all_pages(url: str) -> List[str]:
     """
     Retrieve all pages with the same base as the given URL.
@@ -60,10 +72,23 @@ def get_all_pages(url: str) -> List[str]:

     logger.info(f"Scraping all pages from {url}...")
     base_url = urlparse(url).netloc
-    pages = crawl(url, base_url)
-    logger.info(f"Found {len(pages)} pages.")
+
+    # Use a queue-based approach instead of recursion
+    pages = set()
+    queue = [url]
+    while queue:
+        current_url = queue.pop(0)
+        if current_url not in pages:
+            pages.add(current_url)
+            links = get_all_links(current_url, base_url)
+            queue.extend(links)
+            sleep(1 / RATE_LIMIT)  # Rate limit the requests
+
+    stripped_pages = [strip_query_params(page) for page in pages]
+
+    logger.info(f"Found {len(stripped_pages)} pages.")
     logger.info("Done scraping pages.")
-    return list(pages)
+    return list(stripped_pages)


 def crawl(url: str, base: str, visited: Set[str] = None) -> Set[str]:
@@ -118,6 +143,7 @@ def get_all_links(url: str, base: str) -> List[str]:
         parsed_url = urlparse(full_url)
         cleaned_url = parsed_url._replace(fragment="").geturl()
         if is_valid_url(cleaned_url, base):
+            print(cleaned_url)
             links.append(cleaned_url)

     logger.debug(f"Found {len(links)} valid links from {url}")
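
The recursive crawl is replaced by a breadth-first loop with a simple rate limit, and query parameters are stripped at the end. A self-contained sketch of the same approach, with the network call stubbed out so it runs offline; RATE_LIMIT, the fake site, and the simplified get_all_links are assumptions, not the module's real values:

from time import sleep

RATE_LIMIT = 5  # requests per second (illustrative)

FAKE_SITE = {  # page -> outgoing links, standing in for real HTTP fetches
    "https://docs.zenml.io": [
        "https://docs.zenml.io/a?ref=nav",
        "https://docs.zenml.io/b",
    ],
    "https://docs.zenml.io/a?ref=nav": ["https://docs.zenml.io"],
    "https://docs.zenml.io/b": [],
}

def get_all_links(url: str, base: str) -> list:
    return FAKE_SITE.get(url, [])

def strip_query_params(url: str) -> str:
    return url.split("?")[0]

def get_all_pages(url: str) -> list:
    base_url = "docs.zenml.io"
    pages = set()
    queue = [url]
    while queue:
        current_url = queue.pop(0)
        if current_url not in pages:
            pages.add(current_url)
            queue.extend(get_all_links(current_url, base_url))
            sleep(1 / RATE_LIMIT)  # rate limit between fetches
    return [strip_query_params(page) for page in pages]

print(sorted(get_all_pages("https://docs.zenml.io")))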
