-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvectorize_resumes.py
More file actions
96 lines (80 loc) · 3.83 KB
/
vectorize_resumes.py
File metadata and controls
96 lines (80 loc) · 3.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# ==============================================================================
# Phase 1, Task 3: Vectorization (Embedding)
# ==============================================================================
#
# **Objective:**
# This script converts the cleaned text of each resume into a numerical vector
# (embedding) using a sentence-transformer model.
#
# **Instructions:**
# 1. Place this script in your 'Placement-Project' root folder.
# 2. Ensure the 'resume_cleaned_data.csv' file exists in 'processed_data'.
# 3. Install the new required libraries by running this in your terminal:
# pip install sentence-transformers torch numpy
# 4. Run the script. It will create two new files: 'resume_embeddings.npy'
# and 'resume_embedded_data.csv'.
# 5. NOTE: The first time you run this, it will download the ML model
# (approx. 90MB), so an internet connection is required.
#
# ==============================================================================
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
# --- Configuration ---
# All files for this step live in one folder, relative to the working directory.
_DATA_DIR = 'processed_data'

# Input: the cleaned resume text produced by the previous step.
input_csv_path = os.path.join(_DATA_DIR, 'resume_cleaned_data.csv')
# Output: the final CSV written by this script.
output_csv_path = os.path.join(_DATA_DIR, 'resume_embedded_data.csv')
# Output: the NumPy array file holding the embeddings.
embeddings_output_path = os.path.join(_DATA_DIR, 'resume_embeddings.npy')
# Name of the pre-trained sentence-transformer model to load.
model_name = 'all-MiniLM-L6-v2'
# ---------------------
def main(input_path=None, output_path=None, embeddings_path=None):
    """
    Main function to run the vectorization process.

    Reads the cleaned resume CSV, discards rows whose 'cleaned_text' is missing
    or blank, encodes the remaining texts with a sentence-transformer model,
    saves the embeddings as a .npy file, and writes the surviving rows to a CSV
    with an extra 'embedding_id' column holding each row's position in the
    saved embeddings array.

    Args:
        input_path: CSV file to read. Defaults to the module-level
            `input_csv_path`.
        output_path: CSV file to write. Defaults to the module-level
            `output_csv_path`.
        embeddings_path: .npy file to write. Defaults to the module-level
            `embeddings_output_path`.

    Returns:
        None. If the input file is missing, lacks a 'cleaned_text' column, or
        has no usable text, an error is printed and the function returns
        before loading the model or writing any file.
    """
    # Fall back to the module-level configuration for anything not supplied.
    if input_path is None:
        input_path = input_csv_path
    if output_path is None:
        output_path = output_csv_path
    if embeddings_path is None:
        embeddings_path = embeddings_output_path

    print("--- Starting Resume Vectorization Process ---")

    # Check if the input file exists
    if not os.path.exists(input_path):
        print(f"❌ Error: The input file '{input_path}' was not found.")
        print("Please run the 'clean_text.py' script first.")
        return

    print(f"Reading data from '{input_path}'...")
    df = pd.read_csv(input_path)

    if 'cleaned_text' not in df.columns:
        print("❌ Error: 'cleaned_text' column not found in the CSV.")
        return

    # Drop rows where cleaned_text is missing. Copy so later column
    # assignments act on an independent frame, not a view of the parsed data.
    df = df.dropna(subset=['cleaned_text']).copy()
    # pandas parses an all-blank or all-numeric column as a numeric dtype, and
    # the .str accessor raises AttributeError on those. Casting to str keeps
    # the blank-text check (and the encoder input) working for such files.
    df['cleaned_text'] = df['cleaned_text'].astype(str)
    # Drop rows whose text is empty or only whitespace.
    df = df[df['cleaned_text'].str.strip() != ''].copy()

    if df.empty:
        print("❌ Error: No valid text to process after cleaning up empty rows.")
        return

    print(f"Loading sentence-transformer model: '{model_name}'...")
    print("(This may take a moment and will download the model on the first run)")
    model = SentenceTransformer(model_name)

    # Convert the cleaned text column to a list for the model
    texts_to_embed = df['cleaned_text'].tolist()

    print(f"Generating embeddings for {len(texts_to_embed)} resumes...")
    # The model's encode function can show a progress bar
    embeddings = model.encode(texts_to_embed, show_progress_bar=True)
    print(f"\nEmbeddings generated successfully. Shape: {embeddings.shape}")

    # Make sure the destination folders exist before writing anything.
    for target in (embeddings_path, output_path):
        parent_dir = os.path.dirname(target)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Save the embeddings array to a .npy file for efficient loading
    print(f"Saving embeddings to '{embeddings_path}'...")
    np.save(embeddings_path, embeddings)

    # Add an index to the dataframe that corresponds to the row in the numpy
    # array. Texts were encoded in dataframe order, so position == array row.
    df['embedding_id'] = range(len(df))

    # Save the updated dataframe with the mapping
    print(f"Saving updated data with embedding IDs to '{output_path}'...")
    df.to_csv(output_path, index=False)

    print("\n--- ✅ Process Complete! ---")
    print("You can now proceed to the 'Clustering with K-Means' step.")
    print("\nTwo files were created:")
    print(f"1. {embeddings_path} - Contains the numerical vectors.")
    print(f"2. {output_path} - Your data with a new 'embedding_id' column.")
# --- Script entry point ---
# Start the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()