
Commit c19e1b7

linted
1 parent 031967c

7 files changed: +685 −23 lines

module_4_rag/README.md

Lines changed: 7 additions & 1 deletion

@@ -25,15 +25,21 @@ Which will output data to `data/city_wikipedia_summaries_with_embeddings.parquet`
 
 Next we'll need to do some Feast work and move the data into a repo created by
 Feast.
+
 ## Feast
 
 To get started, make sure to have Feast installed and PostGreSQL.
 
 First run
 ```bash
-feast apply
+cp ./data feature_repo/
 ```
 
+And then open the `module_4.ipynb` notebook and follow those instructions.
+
+It will walk you through a trivial tutorial to retrieve the top `k` most similar
+documents using PGVector.
+
 # Overview
 
 The overview is relatively simple, the goal is to define an architecture
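
The notebook's retrieval step isn't shown in this diff, but the README now points at PGVector for top-`k` similarity search. For orientation, here is a minimal sketch of such a query; the table and column names, the 384-dimension width (what all-MiniLM-L6-v2 outputs), and the connection string are all assumptions for illustration, not the schema Feast actually creates in the notebook.

```python
# Hedged sketch of a top-k PGVector similarity query. Table/column names and
# the DSN below are assumptions; the notebook's Feast setup defines the real schema.
import psycopg2


def top_k_similar(query_embedding, k=5):
    # pgvector accepts a text literal like "[0.1,0.2,...]" cast to ::vector
    vector_literal = "[" + ",".join(str(x) for x in query_embedding) + "]"
    conn = psycopg2.connect("dbname=feast user=postgres")  # assumed DSN
    with conn, conn.cursor() as cur:
        # `<->` is pgvector's L2-distance operator; `<=>` would give cosine distance
        cur.execute(
            """
            SELECT item_id, wiki_summary
            FROM city_summaries
            ORDER BY embedding <-> %s::vector
            LIMIT %s
            """,
            (vector_literal, k),
        )
        return cur.fetchall()
```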

module_4_rag/batch_score_documents.py

Lines changed: 24 additions & 12 deletions

@@ -6,42 +6,54 @@
 
 INPUT_FILENAME = "./data/city_wikipedia_summaries.csv"
 EXPORT_FILENAME = "./data/city_wikipedia_summaries_with_embeddings.parquet"
-TOKENIZER = 'sentence-transformers/all-MiniLM-L6-v2'
-MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
+TOKENIZER = "sentence-transformers/all-MiniLM-L6-v2"
+MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+
 
 def mean_pooling(model_output, attention_mask):
-    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
-    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+    token_embeddings = model_output[
+        0
+    ]  # First element of model_output contains all token embeddings
+    input_mask_expanded = (
+        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    )
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
+        input_mask_expanded.sum(1), min=1e-9
+    )
 
 
 def run_model(sentences, tokenizer, model):
-    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+    encoded_input = tokenizer(
+        sentences, padding=True, truncation=True, return_tensors="pt"
+    )
     # Compute token embeddings
     with torch.no_grad():
        model_output = model(**encoded_input)
 
-    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+    sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
     sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
     return sentence_embeddings
 
+
 def score_data() -> None:
     if EXPORT_FILENAME not in os.listdir():
         print("scored data not found...generating embeddings...")
         df = pd.read_csv(INPUT_FILENAME)
         tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
         model = AutoModel.from_pretrained(MODEL)
-        embeddings = run_model(df['Wiki Summary'].tolist(), tokenizer, model)
+        embeddings = run_model(df["Wiki Summary"].tolist(), tokenizer, model)
         print(embeddings)
-        print('shape = ', df.shape)
-        df['Embeddings'] = list(embeddings.detach().cpu().numpy())
+        print("shape = ", df.shape)
+        df["Embeddings"] = list(embeddings.detach().cpu().numpy())
         print("embeddings generated...")
-        df['event_timestamp'] = pd.to_datetime('today')
+        df["event_timestamp"] = pd.to_datetime("today")
         df["item_id"] = df.index
         print(df.head())
         df.to_parquet(EXPORT_FILENAME, index=False)
         print("...data exported. job complete")
     else:
         print("scored data found...skipping generating embeddings.")
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     score_data()
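
Taken together, `mean_pooling` and `run_model` turn raw sentences into L2-normalized sentence embeddings. A minimal usage sketch, run from the `module_4_rag/` directory so the import resolves; the example sentence is ours, not from the repo:

```python
# Usage sketch for the functions in batch_score_documents.py as reformatted above.
from transformers import AutoModel, AutoTokenizer

from batch_score_documents import MODEL, TOKENIZER, run_model

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
model = AutoModel.from_pretrained(MODEL)

# Returns one L2-normalized embedding per input sentence
embeddings = run_model(["Paris is the capital of France."], tokenizer, model)
print(embeddings.shape)  # all-MiniLM-L6-v2 yields 384-dimensional vectors
```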

module_4_rag/feature_repo/features.py

Lines changed: 2 additions & 1 deletion

@@ -2,7 +2,8 @@
 
 from feast import (
     FeatureView,
-    Field, FileSource,
+    Field,
+    FileSource,
 )
 from feast.data_format import ParquetFormat
 from feast.types import Float32, Array
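
This diff only touches the import list, but those names are the pieces of a Feast feature view over the exported parquet file. A hedged sketch of how they typically fit together; the view name and schema below are illustrative assumptions, not necessarily what `features.py` actually defines:

```python
# Hedged sketch: a FeatureView over the parquet file exported by
# batch_score_documents.py. The view name and field list are assumptions.
from feast import FeatureView, Field, FileSource
from feast.data_format import ParquetFormat
from feast.types import Array, Float32

source = FileSource(
    file_format=ParquetFormat(),
    path="./data/city_wikipedia_summaries_with_embeddings.parquet",
    timestamp_field="event_timestamp",  # column added by score_data()
)

city_embeddings = FeatureView(
    name="city_embeddings",  # hypothetical name
    schema=[Field(name="Embeddings", dtype=Array(Float32))],
    source=source,
)
```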

module_4_rag/generate_random_questions.py

Lines changed: 16 additions & 4 deletions

@@ -1,7 +1,18 @@
 import csv
 import random
 
-topics = ["science", "history", "technology", "mathematics", "geography", "literature", "sports", "art", "music", "cinema"]
+topics = [
+    "science",
+    "history",
+    "technology",
+    "mathematics",
+    "geography",
+    "literature",
+    "sports",
+    "art",
+    "music",
+    "cinema",
+]
 
 # Define a pattern for generating questions
 question_patterns = [
@@ -14,7 +25,7 @@
     "How does {} affect our daily lives?",
     "What are the future prospects of {}?",
     "What are the major challenges in {} today?",
-    "How can one get started with {}?"
+    "How can one get started with {}?",
 ]
 
 # Generate a list of 50 random questions
@@ -28,13 +39,14 @@
 
 def main():
     # Define the file path
-    file_path = './random_questions.csv'
+    file_path = "./random_questions.csv"
 
     # Write the questions to a CSV file
-    with open(file_path, 'w', newline='') as file:
+    with open(file_path, "w", newline="") as file:
         writer = csv.writer(file)
         writer.writerow(["Question"])  # Writing header
         writer.writerows(questions)
 
+
 if __name__ == "__main__":
     main()
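
The loop that actually builds `questions` falls between the hunks shown here; judging by the comment "Generate a list of 50 random questions", it presumably pairs each pattern with a random topic via `str.format`. A plausible sketch, not the repo's exact code:

```python
# Hypothetical shape of the elided generation loop. Each question is wrapped
# in a list because csv.writer.writerows expects one row per question.
questions = [
    [random.choice(question_patterns).format(random.choice(topics))]
    for _ in range(50)
]
```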
