
Commit 4b09291

handle JSON changes
1 parent 8f95929 commit 4b09291
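
The commit switches the artifact passed between these pipeline steps from an in-memory List[Document] to a JSON string: the producing step dumps each document's __dict__ with json.dumps, and every consuming step re-parses that string into Document objects before use. A minimal sketch of the round-trip pattern, using a stand-in dataclass (the real Document class lives in structures.py and its full field list is not shown in this diff):

import json
from dataclasses import dataclass, field

@dataclass
class Document:
    # Stand-in with fields inferred from the diff, not the real structures.Document
    page_content: str
    token_count: int = 0
    filename: str = ""
    parent_section: str = ""
    url: str = ""
    embedding: list = field(default_factory=list)
    generated_questions: list = field(default_factory=list)

# Producing step: serialize the documents to a JSON string artifact
docs = [Document(page_content="ZenML is an MLOps framework.")]
documents_json = json.dumps([doc.__dict__ for doc in docs])

# Consuming step: parse the JSON string back into Document objects
document_list = [Document(**doc) for doc in json.loads(documents_json)]
assert document_list[0].page_content == docs[0].page_content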

2 files changed: +39 -25 lines changed


llm-complete-guide/steps/populate_index.py

Lines changed: 13 additions & 10 deletions
@@ -108,16 +108,19 @@ def generate_embeddings(
            },
        )

-        document_texts = [doc.page_content for doc in split_documents]
+        # Parse the JSON string into a list of Document objects
+        document_list = [
+            Document(**doc) for doc in json.loads(split_documents)
+        ]
+
+        document_texts = [doc.page_content for doc in document_list]
         embeddings = model.encode(document_texts)

-        for doc, embedding in zip(split_documents, embeddings):
-            doc.embedding = (
-                embedding.tolist()
-            )  # Convert numpy array to list for JSON serialization
+        for doc, embedding in zip(document_list, embeddings):
+            doc.embedding = embedding.tolist()

         # Convert the list of Document objects to a JSON string
-        documents_json = json.dumps([doc.__dict__ for doc in split_documents])
+        documents_json = json.dumps([doc.__dict__ for doc in document_list])

         return documents_json
     except Exception as e:
@@ -166,14 +169,14 @@ def index_generator(

         register_vector(conn)

-        # load the documents from the JSON string
-        documents = json.loads(documents)
+        # Parse the JSON string into a list of Document objects
+        document_list = [Document(**doc) for doc in json.loads(documents)]

         # Insert data only if it doesn't already exist
-        for doc in documents:
+        for doc in document_list:
             content = doc.page_content
             token_count = doc.token_count
-            embedding = doc.embedding.tolist()
+            embedding = doc.embedding
             filename = doc.filename
             parent_section = doc.parent_section
             url = doc.url
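
Note the dropped .tolist() in index_generator: generate_embeddings now converts each embedding to a plain list before json.dumps, so the parsed doc.embedding is already a list rather than a numpy array. A quick illustration of the constraint driving this (assumes numpy is installed):

import json
import numpy as np

embedding = np.array([0.1, 0.2, 0.3])

# A raw numpy array is not JSON-serializable
try:
    json.dumps({"embedding": embedding})
except TypeError as err:
    print(f"direct dump fails: {err}")

# Converting to a list first works, and the value round-trips as a list
payload = json.dumps({"embedding": embedding.tolist()})
restored = json.loads(payload)["embedding"]
print(type(restored))  # <class 'list'>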

llm-complete-guide/steps/synthetic_data.py

Lines changed: 26 additions & 15 deletions
@@ -14,16 +14,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List, Annotated
+from typing import Annotated
+import logging
+import json

 import pandas as pd
 from datasets import Dataset
 from huggingface_hub import create_repo
 from litellm import completion
 from structures import Document
-from zenml import step, ArtifactConfig
+from zenml import ArtifactConfig, step
 from zenml.client import Client

+logger = logging.getLogger(__name__)
+
 LOCAL_MODEL = "ollama/mixtral"


@@ -52,31 +56,37 @@ def generate_question(chunk: str, local: bool = False) -> str:

 @step
 def generate_questions_from_chunks(
-    docs_with_embeddings: List[Document],
+    docs_with_embeddings: str,
     local: bool = False,
+    logging_interval: int = 10,
 ) -> Annotated[str, ArtifactConfig(name="synthetic_questions")]:
     """Generate questions from chunks.

     Args:
+        docs_with_embeddings: JSON string containing a list of Document objects with embeddings.
         local: Whether to run the pipeline with a local LLM.

     Returns:
         JSON string containing a list of documents with generated questions added.
     """
-    client = Client()
-    docs_with_embeddings = client.get_artifact_version(
-        name_id_or_prefix="documents_with_embeddings"
-    ).load()
-    for doc in docs_with_embeddings:
+    document_list = [
+        Document(**doc) for doc in json.loads(docs_with_embeddings)
+    ]
+
+    for i, doc in enumerate(document_list, 1):
         doc.generated_questions = [generate_question(doc.page_content, local)]
+        if i % logging_interval == 0:
+            logger.info(
+                f"Progress: {i}/{len(document_list)} documents processed"
+            )
+            logger.info(
+                f"Generated question for document {i}: {doc.generated_questions[0]}"
+            )

-    assert all(doc.generated_questions for doc in docs_with_embeddings)
+    assert all(doc.generated_questions for doc in document_list)

     # Convert List[Document] to DataFrame
-    df = pd.DataFrame([doc.__dict__ for doc in docs_with_embeddings])
-
-    # Convert numpy arrays to lists
-    df["embedding"] = df["embedding"].apply(lambda x: x.tolist())
+    df = pd.DataFrame([doc.__dict__ for doc in document_list])

     # upload the parquet file to a private dataset on the huggingface hub
     client = Client()
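
Besides parsing its JSON input instead of loading the documents_with_embeddings artifact via Client(), the rewritten loop adds interval-based progress logging controlled by the new logging_interval parameter. A standalone sketch of that pattern, with a dummy work function standing in for generate_question:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def process_all(items: list, logging_interval: int = 10) -> None:
    # Log every `logging_interval` items rather than on each iteration
    for i, item in enumerate(items, 1):
        _ = item.upper()  # dummy stand-in for generate_question
        if i % logging_interval == 0:
            logger.info(f"Progress: {i}/{len(items)} items processed")

process_all([f"chunk-{n}" for n in range(25)])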
@@ -86,14 +96,15 @@ def generate_questions_from_chunks(
         "zenml/rag_qa_embedding_questions",
         token=hf_token,
         exist_ok=True,
-        private=True,
         repo_type="dataset",
     )

+    # add an extra `__pydantic_initialised__` column to the dataframe
+    df["__pydantic_initialised__"] = True
+
     dataset = Dataset.from_pandas(df)
     dataset.push_to_hub(
         repo_id="zenml/rag_qa_embedding_questions",
-        private=True,
         token=hf_token,
         create_pr=True,
     )
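
The __pydantic_initialised__ column appears to be the private flag that pydantic v1 dataclasses set once initialisation has run, so adding it makes each uploaded row match the __dict__ of a fully initialised Document. Consumers that reload the dataset and rebuild plain objects may want to strip such dunder keys first; a hedged sketch (this rebuild helper is illustrative, not part of the commit):

def strip_private_keys(row: dict) -> dict:
    # Drop bookkeeping columns such as `__pydantic_initialised__`
    return {k: v for k, v in row.items() if not k.startswith("__")}

row = {
    "page_content": "What is ZenML?",
    "embedding": [0.1, 0.2],
    "__pydantic_initialised__": True,
}
print(strip_private_keys(row))  # only the public Document fields remain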
