-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinverted_index.py
More file actions
135 lines (97 loc) · 3.59 KB
/
inverted_index.py
File metadata and controls
135 lines (97 loc) · 3.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from bs4 import BeautifulSoup
from pymongo import MongoClient
import re
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
'''
Parser file that creates an inverted index of each term and their respective documents
of the Civil Engineering faculty pages
'''
def connectDataBase():
    '''
    Connects to the project MongoDB instance.

    Returns
    -------
    pymongo.database.Database or None
        Handle to the "pages" database, or None if client creation failed.
        NOTE(review): MongoClient is typically lazy, so a down server may
        not raise here — confirm whether callers need a ping/server check.
    '''
    DB_NAME = "pages"
    DB_HOST = "localhost"
    DB_PORT = 27017
    try:
        client = MongoClient(host=DB_HOST, port=DB_PORT)
        db = client[DB_NAME]
        return db
    except Exception as e:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not swallowed; surface the cause instead of hiding it.
        print(f"Database not connected successfully: {e}")
        return None
# Module-level database handles: connect once at import time and bind the
# two collections this script uses (crawled pages in, inverted index out).
db = connectDataBase()
pages = db.pages
# NOTE(review): this module-level name is shadowed by a local dict of the
# same name inside parse_text_pages; only the collection handle is shared.
inverted_index = db.invertedIndex
def parse_text_pages(col, index, url_pattern):
    '''
    Parsing function that stores term objects into the invertedIndex collection.

    Builds an in-memory mapping of term -> ordered list of unique document
    ids from the 'blurb' and 'accolades' divs of each matching page, then
    persists it via add_term_object.

    Parameters
    ----------
    col: MongoDB collection
        The collection of all of the previously crawled pages
        that are already stored in the database
    index: MongoDB collection
        The collection to store all of the parsed terms and
        a list of their documents present in
    url_pattern: dict
        A MongoDB query dict used against the pages collection to
        extract only the pages of the faculty members
    '''
    faculty = list(col.find(url_pattern))
    inverted_index = {}
    for member in faculty:
        # Renamed from `id`, which shadowed the builtin.
        doc_id = member['_id']
        bs = BeautifulSoup(member['html'], 'html.parser')
        # Only the main blurb and the accolades sidebar are indexed.
        sections = bs.find_all('div', class_='blurb')
        sections.extend(bs.find_all('div', class_='accolades'))
        for cell in sections:
            for token in filter_text(cell):
                # Keep each posting list ordered and free of duplicates.
                postings = inverted_index.setdefault(token, [])
                if doc_id not in postings:
                    postings.append(doc_id)
    add_term_object(index, inverted_index)
def add_term_object(col, term_dictionary):
    '''
    Adds a term object (dictionary) to the collection,
    where the object is the term, and a list of its documents

    Parameters
    ----------
    col: MongoDB collection
        Database collection for objects to be stored in
    term_dictionary: dict
        A dictionary mapping each term to the list of
        documents it is present in
    '''
    # Build all documents up front and write them in a single round trip;
    # the previous per-term insert_one() cost one network call per term.
    term_objects = [
        {"term": term, "documents": documents}
        for term, documents in term_dictionary.items()
    ]
    # insert_many raises on an empty document list, so guard it.
    if term_objects:
        col.insert_many(term_objects)
def filter_text(html_cell):
    '''
    Uses stopwords, lemmatizing, and regular expression
    filtering to clean the text to be ready to
    store in the inverted index

    Parameters
    ----------
    html_cell: BeautifulSoup tag object
        A BeautifulSoup tag inputted to extract the text
        and filter it fully for the inverted index

    Returns
    -------
    list of str
        Lowercased, lemmatized tokens with stopwords removed.
    '''
    # Cache the stopword set and lemmatizer on the function object:
    # stopwords.words() reads the corpus from disk, so rebuilding both
    # on every call (once per HTML cell) is wasteful.
    if not hasattr(filter_text, "_stop_words"):
        filter_text._stop_words = set(stopwords.words('english'))
        filter_text._lemmatizer = WordNetLemmatizer()
    stop_words = filter_text._stop_words
    lemmatizer = filter_text._lemmatizer
    # NOTE(review): '\n' becomes a space but '\t' is deleted outright,
    # which can fuse words separated only by tabs — confirm intent.
    text = html_cell.get_text().replace(u'\xa0', '').replace('\n', ' ').replace('\t', '')
    # Strip URLs first so scheme/host fragments don't survive the
    # punctuation pass below as bogus tokens.
    text = re.sub(r'https?://[^\s,]+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()
    # Lemmatize before stopword removal to preserve the original pipeline
    # order (filtering first would change which tokens survive).
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in text.split()]
    return [word for word in lemmatized_tokens if word not in stop_words]
# Query matching only faculty profile pages under https://www.cpp.edu/faculty/
faculty_url_pattern = {'url': {'$regex': r"^https:\/\/www\.cpp\.edu\/faculty\/.*"}}
# Script entry point: build the inverted index from the crawled faculty
# pages and persist it to the invertedIndex collection.
parse_text_pages(pages, inverted_index, faculty_url_pattern)