build_vector_store.py
# ==============================================================================
# Phase 2, Task 2: Build the Final Vector Store
# ==============================================================================
#
# **Objective:**
# This script loads the embeddings, assigns the human-readable archetype labels,
# and builds the final FAISS vector store for the RAG application.
#
# **Instructions:**
# 1. **CRITICAL:** Update the `ARCHETYPE_MAP` dictionary below with the
# interpretations you decided on from the last step.
# 2. Install the new dependency: pip install faiss-cpu
# 3. Run the script. It will create a 'vector_store' folder containing
# your final, ready-to-use AI knowledge base.
#
# ==============================================================================

import os
import pandas as pd
import numpy as np
import faiss
import pickle

# --- Configuration ---
# ! ================== ACTION REQUIRED ================== !
# Update this dictionary with your interpretations of the clusters.
# Based on your output, I've filled in my suggestions:
ARCHETYPE_MAP = {
    0: "Core Engineering & Business",
    1: "Data Science & AI/ML",
    2: "General CS & Software Development",
    3: "Full-Stack Web Development",
    4: "Core Engineering (Mechanical/Civil)"
}
# ! ===================================================== !

# Input files from the previous steps
csv_input_path = os.path.join('processed_data', 'resume_clustered_data.csv')
embeddings_input_path = os.path.join('processed_data', 'resume_embeddings.npy')
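# The clustered CSV is expected to provide the columns this script reads:
# 'embedding_id', 'file_name', 'cleaned_text', and 'archetype_cluster'
# (all produced by the earlier pipeline steps).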

# Output directory for the final vector store
store_output_dir = 'vector_store'
faiss_index_path = os.path.join(store_output_dir, 'srm_resumes.index')
metadata_path = os.path.join(store_output_dir, 'srm_resumes.pkl')
# ---------------------

def build_and_save_store(df, embeddings):
    """
    Builds the FAISS index and saves it along with the metadata.
    """
    print("--- Building Final Vector Store ---")

    # Check that every cluster ID in the data has a label in ARCHETYPE_MAP
    if not all(cluster in ARCHETYPE_MAP for cluster in df['archetype_cluster'].unique()):
        print("❌ Error: Not all cluster IDs in your data are present in the ARCHETYPE_MAP.")
        print("Please check the map in the configuration section.")
        return

    # 1. Apply the human-readable labels
    df['archetype_label'] = df['archetype_cluster'].map(ARCHETYPE_MAP)
    print("Applied human-readable labels to clusters.")

    # 2. Get the dimensionality of the embeddings
    d = embeddings.shape[1]

    # 3. Create the FAISS index
    print(f"Creating FAISS index with {embeddings.shape[0]} vectors of dimension {d}...")
    index = faiss.IndexFlatL2(d)     # Exact search using L2 (Euclidean) distance
    index = faiss.IndexIDMap(index)  # Maps each vector to our own embedding_id
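
    # A possible variant (a sketch, not used in this pipeline): sentence
    # embeddings are often compared by cosine similarity. To rank by cosine
    # instead of L2 distance, L2-normalize the vectors and switch to an
    # inner-product index:
    #
    #   xb = embeddings.astype('float32')
    #   faiss.normalize_L2(xb)  # normalizes each row in place
    #   index = faiss.IndexIDMap(faiss.IndexFlatIP(d))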

    # 4. Add vectors to the index
    # We need to map our DataFrame's 'embedding_id' to the vectors.
    # This ensures the index ID matches our CSV's ID.
    index_ids = df['embedding_id'].values.astype('int64')
    index.add_with_ids(embeddings.astype('float32'), index_ids)
    print(f"Successfully added {index.ntotal} vectors to the index.")

    # 5. Create the metadata
    # This is a list of dictionaries, where each dict is the "document"
    # our LLM will see. We store the text and the label.
    metadata = []
    for _, row in df.iterrows():
        metadata.append({
            'file_name': row['file_name'],
            'text': row['cleaned_text'],
            'archetype': row['archetype_label']
        })

    # 6. Save the index and metadata to disk
    os.makedirs(store_output_dir, exist_ok=True)
    print(f"Saving FAISS index to '{faiss_index_path}'...")
    faiss.write_index(index, faiss_index_path)
    print(f"Saving metadata to '{metadata_path}'...")
    with open(metadata_path, 'wb') as f:
        pickle.dump(metadata, f)

    print("\n--- ✅ Phase 2 Complete! ---")
    print("Your vector store is built and ready for the application.")
    print(f"Files created in '{store_output_dir}':")
    print(f"  - {os.path.basename(faiss_index_path)} (The AI vector database)")
    print(f"  - {os.path.basename(metadata_path)} (The resume text & labels)")

def main():
    """
    Main function to run the vector store creation process.
    """
    # Check for input files
    if not os.path.exists(csv_input_path) or not os.path.exists(embeddings_input_path):
        print("❌ Error: Input files from the clustering step were not found.")
        print("Please run 'cluster_and_visualize.py' first.")
        return

    print("Loading data and embeddings...")
    df = pd.read_csv(csv_input_path)
    embeddings = np.load(embeddings_input_path)
    build_and_save_store(df, embeddings)

# --- Run the script ---
if __name__ == "__main__":
    main()