# BioSearch-CPP/IndexAndEmbeddingsGeneration.py at main · shreyas463/BioSearch-CPP · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import pickle
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

# MongoDB connection setup
# Client for the MongoDB server at localhost:27017; all collections below
# live in the `CPP_Biology` database.
client = MongoClient('mongodb://localhost:27017/')
db = client['CPP_Biology']
# Source collection: fetch_documents reads the `faculty_info` text from it.
faculty_collection = db['FacultyInfo']
# Destination collections: each one is emptied and repopulated by the
# corresponding store_* function in this file.
inverted_index_collection = db['InvertedIndex']
embeddings_collection = db['Embeddings']

# File path for saving the trained TF-IDF vectorizer
# (relative path, so it is resolved against the current working directory).
TFIDF_PKL_FILE = "tfidf_vectorizer.pkl"


def generate_index_and_store_embeddings():
    """
    Generates and stores an inverted index and TF-IDF embeddings in MongoDB.

    Pipeline: fetch the documents, fit the TF-IDF model, pickle the fitted
    vectorizer, build the inverted index, then write the index and the
    per-document TF-IDF vectors to their collections.

    If no document with usable text is found, nothing is fitted or written:
    fitting a TF-IDF model on an empty corpus raises an "empty vocabulary"
    error, so the function reports the situation and returns early, leaving
    any previously stored index and embeddings untouched.
    """
    documents, doc_ids = fetch_documents()
    if not documents:
        # Guard before fitting: an empty corpus cannot produce a vocabulary.
        print("No documents with text were found; nothing to index.")
        return

    vectorizer, tfidf_matrix, terms = create_tfidf_matrix(documents)
    save_vectorizer(vectorizer)
    inverted_index = build_inverted_index(tfidf_matrix, terms, doc_ids)
    store_inverted_index(inverted_index)
    store_document_embeddings(tfidf_matrix, doc_ids)


def fetch_documents():
    """
    Reads the text and ID of every usable record in the faculty collection.

    Only the `faculty_info` and `_id` fields are requested. Records whose
    `faculty_info` is missing or empty are skipped, so both returned lists
    stay aligned index-for-index.

    Returns:
        documents (list): `faculty_info` values of the kept records.
        doc_ids (list): Matching `_id` values converted to strings.
    """
    projection = {"faculty_info": 1, "_id": 1}
    # Keep (text, id) pairs together so the two output lists cannot drift apart.
    usable = [
        (record.get('faculty_info'), str(record['_id']))
        for record in faculty_collection.find({}, projection)
        if record.get('faculty_info')
    ]
    documents = [text for text, _ in usable]
    doc_ids = [identifier for _, identifier in usable]
    return documents, doc_ids


def create_tfidf_matrix(documents):
    """
    Fits a TF-IDF model on the documents and vectorizes them in one pass.

    The model drops English stop words and uses unigrams, bigrams and
    trigrams as features.

    Args:
        documents (list): List of document texts.
    Returns:
        vectorizer (TfidfVectorizer): The fitted TF-IDF vectorizer.
        tfidf_matrix (sparse matrix): One row per document, one column per term.
        terms (array): Feature names from the fitted model, in column order.
    """
    tfidf_model = TfidfVectorizer(ngram_range=(1, 3), stop_words='english')
    doc_term_matrix = tfidf_model.fit_transform(documents)
    return tfidf_model, doc_term_matrix, tfidf_model.get_feature_names_out()


def save_vectorizer(vectorizer):
    """
    Pickles the fitted TF-IDF vectorizer to `TFIDF_PKL_FILE` for later reuse.

    Progress is reported on stdout before and after the write.

    Args:
        vectorizer (TfidfVectorizer): Fitted TF-IDF vectorizer.
    """
    print("Saving vectorizer...")
    # Binary mode is required for pickle output.
    with open(TFIDF_PKL_FILE, 'wb') as pickle_file:
        pickle.dump(vectorizer, pickle_file)
    print(f"Vectorizer saved to {TFIDF_PKL_FILE}.")


def build_inverted_index(tfidf_matrix, terms, doc_ids):
    """
    Builds an inverted index from the TF-IDF matrix.

    Only the entries actually stored in the matrix are visited, instead of
    probing every (document, term) cell individually, so the work is
    proportional to the number of non-zero scores rather than
    documents x terms.

    Args:
        tfidf_matrix (sparse matrix): TF-IDF matrix for the documents
            (rows are documents, columns are terms). A dense 2-D array is
            accepted as well.
        terms (list): Terms from the TF-IDF model, in column order.
        doc_ids (list): Document IDs, in row order.
    Returns:
        inverted_index (defaultdict): Maps each term that has at least one
            positive score to a list of
            {"document_id": ..., "tfidf_score": ...} entries, ordered by
            document row. Scores are plain Python floats.
    """
    # Local import: scipy is not imported at module level in this file.
    from scipy.sparse import csc_matrix

    # Column-compressed layout makes "all documents for one term" a cheap slice.
    columns = csc_matrix(tfidf_matrix)
    if not columns.has_sorted_indices:
        # Work on a sorted copy so postings come out in ascending row order
        # without reordering the caller's matrix in place.
        columns = columns.sorted_indices()
    indptr, row_indices, values = columns.indptr, columns.indices, columns.data

    inverted_index = defaultdict(list)
    for term_idx, term in enumerate(terms):
        start, end = indptr[term_idx], indptr[term_idx + 1]
        postings = [
            {"document_id": doc_ids[doc_idx], "tfidf_score": score}
            for doc_idx, score in zip(
                row_indices[start:end].tolist(), values[start:end].tolist()
            )
            if score > 0  # explicitly stored zeros are not postings
        ]
        if postings:
            inverted_index[term] = postings
    return inverted_index


def store_inverted_index(inverted_index):
    """
    Stores the inverted index in MongoDB.

    The collection is emptied first, then one record per term is written in
    the form {"term": <term>, "documents": <postings list>}. All records are
    sent with a single bulk insert rather than one insert call per term.

    Args:
        inverted_index (defaultdict): Inverted index mapping terms to documents and scores.
    """
    records = [
        {"term": term, "documents": docs}
        for term, docs in inverted_index.items()
    ]
    inverted_index_collection.delete_many({})
    # insert_many rejects an empty list, so skip the write for an empty index.
    if records:
        inverted_index_collection.insert_many(records)
    print("Inverted index has been stored in MongoDB.")


def store_document_embeddings(tfidf_matrix, doc_ids):
    """
    Stores document embeddings (TF-IDF vectors) in MongoDB.

    The collection is emptied first, then one record per document is written
    in the form {"document_id": <id>, "tfidf": <dense list of scores>}.
    Rows are densified one at a time, so peak memory is a single dense
    vector rather than the whole documents x vocabulary matrix.

    Args:
        tfidf_matrix (sparse matrix): TF-IDF matrix for the documents.
        doc_ids (list): List of document IDs.
    """
    # CSR supports cheap row slicing; this returns the same matrix when the
    # input is already in CSR form.
    row_matrix = tfidf_matrix.tocsr()
    embeddings_collection.delete_many({})
    for doc_idx, doc_id in enumerate(doc_ids):
        dense_row = row_matrix[doc_idx].toarray().ravel()
        embeddings_collection.insert_one({
            "document_id": doc_id,
            "tfidf": dense_row.tolist()
        })
    print("Document TF-IDF embeddings have been stored in MongoDB.")


def main():
    """
    Script entry point.

    Delegates to `generate_index_and_store_embeddings`, which builds the
    inverted index and the TF-IDF embeddings and writes both to MongoDB.
    """
    generate_index_and_store_embeddings()


# Entry point of the script: run `main` only when this file is executed
# directly, not when it is imported as a module by another script.
if __name__ == "__main__":
    main()