 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import List, Annotated
+from typing import Annotated
+import logging
+import json
 
 import pandas as pd
 from datasets import Dataset
 from huggingface_hub import create_repo
 from litellm import completion
 from structures import Document
-from zenml import step, ArtifactConfig
+from zenml import ArtifactConfig, step
 from zenml.client import Client
 
+logger = logging.getLogger(__name__)
+
 LOCAL_MODEL = "ollama/mixtral"
 
 
@@ -52,31 +56,37 @@ def generate_question(chunk: str, local: bool = False) -> str:
 
 @step
 def generate_questions_from_chunks(
-    docs_with_embeddings: List[Document],
+    docs_with_embeddings: str,
     local: bool = False,
+    logging_interval: int = 10,
 ) -> Annotated[str, ArtifactConfig(name="synthetic_questions")]:
     """Generate questions from chunks.
 
     Args:
+        docs_with_embeddings: JSON string containing a list of Document objects with embeddings.
         local: Whether to run the pipeline with a local LLM.
 
     Returns:
         JSON string containing a list of documents with generated questions added.
     """
-    client = Client()
-    docs_with_embeddings = client.get_artifact_version(
-        name_id_or_prefix="documents_with_embeddings"
-    ).load()
-    for doc in docs_with_embeddings:
+    document_list = [
+        Document(**doc) for doc in json.loads(docs_with_embeddings)
+    ]
+
+    for i, doc in enumerate(document_list, 1):
         doc.generated_questions = [generate_question(doc.page_content, local)]
+        if i % logging_interval == 0:
+            logger.info(
+                f"Progress: {i}/{len(document_list)} documents processed"
+            )
+            logger.info(
+                f"Generated question for document {i}: {doc.generated_questions[0]}"
+            )
 
-    assert all(doc.generated_questions for doc in docs_with_embeddings)
+    assert all(doc.generated_questions for doc in document_list)
 
     # Convert List[Document] to DataFrame
-    df = pd.DataFrame([doc.__dict__ for doc in docs_with_embeddings])
-
-    # Convert numpy arrays to lists
-    df["embedding"] = df["embedding"].apply(lambda x: x.tolist())
+    df = pd.DataFrame([doc.__dict__ for doc in document_list])
 
     # upload the parquet file to a private dataset on the huggingface hub
     client = Client()
@@ -86,14 +96,15 @@ def generate_questions_from_chunks(
8696 "zenml/rag_qa_embedding_questions" ,
8797 token = hf_token ,
8898 exist_ok = True ,
89- private = True ,
9099 repo_type = "dataset" ,
91100 )
92101
102+ # add an extra `__pydantic_initialised__` column to the dataframe
103+ df ["__pydantic_initialised__" ] = True
104+
93105 dataset = Dataset .from_pandas (df )
94106 dataset .push_to_hub (
95107 repo_id = "zenml/rag_qa_embedding_questions" ,
96- private = True ,
97108 token = hf_token ,
98109 create_pr = True ,
99110 )
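
A note on the key change above: instead of loading a `List[Document]` artifact from the ZenML artifact store inside the step, the step now receives the documents as a JSON string and rebuilds `Document` objects with `Document(**doc)`. A minimal sketch of that round-trip, assuming a pydantic-style `Document` with roughly these fields (the real model lives in `structures.py` and is not shown in this diff):

```python
import json
from typing import List, Optional

from pydantic import BaseModel


class Document(BaseModel):
    """Minimal stand-in for structures.Document (assumed fields)."""

    page_content: str
    embedding: Optional[List[float]] = None
    generated_questions: List[str] = []


# Producer step: serialize documents into a JSON string artifact.
docs = [Document(page_content="ZenML is an MLOps framework.")]
payload = json.dumps([doc.dict() for doc in docs])

# Consumer step (as in the diff): parse the string back into Documents.
document_list = [Document(**d) for d in json.loads(payload)]
assert document_list[0].page_content == docs[0].page_content
```

This also presumably explains why the old numpy-to-list conversion could be dropped: documents parsed from JSON already hold embeddings as plain lists rather than numpy arrays.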
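Two smaller details: `create_pr=True` means the dataset upload lands as a pull request on the Hugging Face Hub rather than a direct push, and dropping `private=True` falls back to the default of a public repo. The extra `__pydantic_initialised__` column is, going by its name, a flag pydantic sets on initialised objects, written into the dataframe so that rows pulled back out of the dataset carry it too; that reading is an assumption. A sketch of the consumer side, reusing the stand-in `Document` above and assuming the created PR has been merged:

```python
from datasets import load_dataset

# Load the published split back from the Hub.
ds = load_dataset("zenml/rag_qa_embedding_questions", split="train")

# Rebuild Document objects from the rows; with its default config,
# pydantic ignores the extra `__pydantic_initialised__` key, so no
# column cleanup is needed before reconstruction.
document_list = [Document(**row) for row in ds]
```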