-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinverted_index.py
More file actions
135 lines (97 loc) · 3.59 KB
/
inverted_index.py
File metadata and controls
135 lines (97 loc) · 3.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from bs4 import BeautifulSoup
from pymongo import MongoClient
import re
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
'''
Parser file that creates an inverted index of each term and their respective documents
of the Civil Engineering faculty pages
'''
def connectDataBase():
    '''
    Connects to the project MongoDB instance.

    Returns
    -------
    pymongo.database.Database or None
        Handle to the "pages" database, or None if client creation failed.
        NOTE(review): MongoClient is typically lazy, so a down server may
        not raise here — confirm whether callers need a ping/server check.
    '''
    DB_NAME = "pages"
    DB_HOST = "localhost"
    DB_PORT = 27017
    try:
        client = MongoClient(host=DB_HOST, port=DB_PORT)
        db = client[DB_NAME]
        return db
    except Exception as e:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not swallowed; surface the cause instead of hiding it.
        print(f"Database not connected successfully: {e}")
        return None
# Module-level database handles: connect once at import time and bind the
# two collections this script uses (crawled pages in, inverted index out).
db = connectDataBase()
pages = db.pages
# NOTE(review): this module-level name is shadowed by a local dict of the
# same name inside parse_text_pages; only the collection handle is shared.
inverted_index = db.invertedIndex
def parse_text_pages(col, index, url_pattern):
    '''
    Parsing function that stores term objects into the invertedIndex collection.

    Builds an in-memory mapping of term -> ordered list of unique document
    ids from the 'blurb' and 'accolades' divs of each matching page, then
    persists it via add_term_object.

    Parameters
    ----------
    col: MongoDB collection
        The collection of all of the previously crawled pages
        that are already stored in the database
    index: MongoDB collection
        The collection to store all of the parsed terms and
        a list of their documents present in
    url_pattern: dict
        A MongoDB query dict used against the pages collection to
        extract only the pages of the faculty members
    '''
    faculty = list(col.find(url_pattern))
    inverted_index = {}
    for member in faculty:
        # Renamed from `id`, which shadowed the builtin.
        doc_id = member['_id']
        bs = BeautifulSoup(member['html'], 'html.parser')
        # Only the main blurb and the accolades sidebar are indexed.
        sections = bs.find_all('div', class_='blurb')
        sections.extend(bs.find_all('div', class_='accolades'))
        for cell in sections:
            for token in filter_text(cell):
                # Keep each posting list ordered and free of duplicates.
                postings = inverted_index.setdefault(token, [])
                if doc_id not in postings:
                    postings.append(doc_id)
    add_term_object(index, inverted_index)
def add_term_object(col, term_dictionary):
    '''
    Adds a term object (dictionary) to the collection,
    where the object is the term, and a list of its documents

    Parameters
    ----------
    col: MongoDB collection
        Database collection for objects to be stored in
    term_dictionary: dict
        A dictionary mapping each term to the list of
        documents it is present in
    '''
    # Build all documents up front and write them in a single round trip;
    # the previous per-term insert_one() cost one network call per term.
    term_objects = [
        {"term": term, "documents": documents}
        for term, documents in term_dictionary.items()
    ]
    # insert_many raises on an empty document list, so guard it.
    if term_objects:
        col.insert_many(term_objects)
def filter_text(html_cell):
    '''
    Uses stopwords, lemmatizing, and regular expression
    filtering to clean the text to be ready to
    store in the inverted index

    Parameters
    ----------
    html_cell: BeautifulSoup tag object
        A BeautifulSoup tag inputted to extract the text
        and filter it fully for the inverted index

    Returns
    -------
    list of str
        Lowercased, lemmatized tokens with stopwords removed.
    '''
    # Cache the stopword set and lemmatizer on the function object:
    # stopwords.words() reads the corpus from disk, so rebuilding both
    # on every call (once per HTML cell) is wasteful.
    if not hasattr(filter_text, "_stop_words"):
        filter_text._stop_words = set(stopwords.words('english'))
        filter_text._lemmatizer = WordNetLemmatizer()
    stop_words = filter_text._stop_words
    lemmatizer = filter_text._lemmatizer
    # NOTE(review): '\n' becomes a space but '\t' is deleted outright,
    # which can fuse words separated only by tabs — confirm intent.
    text = html_cell.get_text().replace(u'\xa0', '').replace('\n', ' ').replace('\t', '')
    # Strip URLs first so scheme/host fragments don't survive the
    # punctuation pass below as bogus tokens.
    text = re.sub(r'https?://[^\s,]+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()
    # Lemmatize before stopword removal to preserve the original pipeline
    # order (filtering first would change which tokens survive).
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in text.split()]
    return [word for word in lemmatized_tokens if word not in stop_words]
# Query matching only faculty profile pages under https://www.cpp.edu/faculty/
faculty_url_pattern = {'url': {'$regex': r"^https:\/\/www\.cpp\.edu\/faculty\/.*"}}
# Script entry point: build the inverted index from the crawled faculty
# pages and persist it to the invertedIndex collection.
parse_text_pages(pages, inverted_index, faculty_url_pattern)