Skip to content

Commit e959053

Browse files
updated to export fields with appropriate name
Signed-off-by: Francisco Javier Arceo <[email protected]>
1 parent 8271955 commit e959053

File tree

2 files changed

+22
-2
lines changed

2 files changed

+22
-2
lines changed

module_4_rag/batch_score_documents.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,20 +59,40 @@ def score_data() -> None:
5959

6060
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
6161
model = AutoModel.from_pretrained(MODEL)
62-
embeddings = run_model(df["Wiki Summary"].tolist(), tokenizer, model)
62+
embeddings = run_model(df["Sentence Chunks"].tolist(), tokenizer, model)
6363
print("embeddings generated...")
64+
df["id"] = [i for i in range(len(df))]
6465
df["Embeddings"] = list(embeddings.detach().cpu().numpy())
6566
df["event_timestamp"] = pd.to_datetime("today")
6667
df["item_id"] = df.index
68+
df = _rename_df(df)
6769

6870
df.to_parquet(EXPORT_FILENAME, index=False)
6971
print("...data exported. Job complete")
7072
else:
73+
df = pd.read_parquet(EXPORT_FILENAME)
7174
print("Scored data found... skipping generating embeddings.")
72-
75+
7376
print("preview of data:")
7477
print(df.head().T)
7578

7679

80+
def _rename_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with export-ready column names and order.

    Column names are normalized by replacing spaces with underscores and
    lowercasing; the normalized ``embeddings`` column is then exposed as
    ``vector``. Only the export columns listed below are kept, in that
    fixed order.

    The input frame is left untouched: renaming happens on a new frame
    instead of assigning to ``df.columns`` / using ``inplace=True``, so
    callers that keep a reference to the original do not see its columns
    change underneath them.

    Args:
        df: Frame whose (string) column names normalize to at least the
            export columns.

    Returns:
        A new DataFrame containing exactly the export columns.

    Raises:
        KeyError: If any export column is missing after renaming.
    """
    export_columns = [
        "id",
        "item_id",
        "event_timestamp",
        "state",
        "wiki_summary",
        "sentence_chunks",
        "vector",
    ]

    def _export_name(column: str) -> str:
        # Normalize first, then apply the embeddings -> vector alias, which
        # mirrors the original two-step rename in a single pass.
        snake = column.replace(" ", "_").lower()
        return "vector" if snake == "embeddings" else snake

    # DataFrame.rename without inplace=True returns a new frame.
    renamed = df.rename(columns=_export_name)
    return renamed[export_columns]
95+
96+
7797
if __name__ == "__main__":
7898
score_data()
Binary file not shown.

0 commit comments

Comments
 (0)