@@ -6,42 +6,54 @@
 
 INPUT_FILENAME = "./data/city_wikipedia_summaries.csv"
 EXPORT_FILENAME = "./data/city_wikipedia_summaries_with_embeddings.parquet"
-TOKENIZER = 'sentence-transformers/all-MiniLM-L6-v2'
-MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
+TOKENIZER = "sentence-transformers/all-MiniLM-L6-v2"
+MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+
 
 def mean_pooling(model_output, attention_mask):
-    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
-    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+    token_embeddings = model_output[
+        0
+    ]  # First element of model_output contains all token embeddings
+    input_mask_expanded = (
+        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    )
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
+        input_mask_expanded.sum(1), min=1e-9
+    )
+
 
 def run_model(sentences, tokenizer, model):
-    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+    encoded_input = tokenizer(
+        sentences, padding=True, truncation=True, return_tensors="pt"
+    )
     # Compute token embeddings
     with torch.no_grad():
         model_output = model(**encoded_input)
 
-    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+    sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
     sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
     return sentence_embeddings
 
+
 def score_data() -> None:
     if EXPORT_FILENAME not in os.listdir():
         print("scored data not found...generating embeddings...")
         df = pd.read_csv(INPUT_FILENAME)
         tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
         model = AutoModel.from_pretrained(MODEL)
-        embeddings = run_model(df['Wiki Summary'].tolist(), tokenizer, model)
+        embeddings = run_model(df["Wiki Summary"].tolist(), tokenizer, model)
         print(embeddings)
-        print('shape = ', df.shape)
-        df['Embeddings'] = list(embeddings.detach().cpu().numpy())
+        print("shape = ", df.shape)
+        df["Embeddings"] = list(embeddings.detach().cpu().numpy())
         print("embeddings generated...")
-        df['event_timestamp'] = pd.to_datetime('today')
+        df["event_timestamp"] = pd.to_datetime("today")
         df["item_id"] = df.index
         print(df.head())
         df.to_parquet(EXPORT_FILENAME, index=False)
         print("...data exported. job complete")
     else:
         print("scored data found...skipping generating embeddings.")
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     score_data()
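
For reference, a minimal sketch of how the reformatted helpers could be sanity-checked from a REPL in which the definitions above have already been executed (the constants are the ones defined in this file; the sentences are illustrative only):

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
model = AutoModel.from_pretrained(MODEL)

# run_model tokenizes, mean-pools the token embeddings, and L2-normalizes the rows,
# so cosine similarity between two rows reduces to a dot product.
embeddings = run_model(
    ["Paris is the capital of France.", "Tokyo is the largest city in Japan."],
    tokenizer,
    model,
)
print(embeddings.shape)  # torch.Size([2, 384]) for all-MiniLM-L6-v2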