-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
78 lines (69 loc) · 2.4 KB
/
preprocessing.py
File metadata and controls
78 lines (69 loc) · 2.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
# nltk.download("punkt_tab")
# nltk.download("wordnet")
# nltk.download("stopwords")
# uncomment the lines above if running this program for the first time
# (they fetch the tokenizer model, WordNet data, and stop-word corpus)
l = WordNetLemmatizer()  # shared lemmatizer instance used by lemmatize()
stop_words = set(stopwords.words('english'))  # English stop words, as a set for O(1) membership tests
# tokenize
def tokenizer(p):
    """
    Lower-case a raw paragraph and split it into NLTK word tokens.

    Example: "Hi you!" --> ["hi", "you"]  (punctuation tokens may also appear)
    """
    lowered = p.lower()
    return word_tokenize(lowered)
# Lemmatize
def lemmatize(w):
    """
    Reduce a single word to its WordNet lemma via the shared lemmatizer.

    Example: "competing" --> "compete"
    """
    return l.lemmatize(w)
def lemmatize_count(p, res=None):  # p is a raw paragraph
    """
    Tokenize a raw paragraph, drop stop words / punctuation / domain noise,
    lemmatize the survivors, and tally lemma frequencies.

    "Hi you!" --> ["hi", "you", "!"] --> ["hi", "you"] --> {"hi": 1, "you": 1}

    Parameters:
        p: raw paragraph string.
        res: optional dict of existing counts to accumulate into; a fresh
             dict is created when omitted.

    Returns:
        The counts dict mapping lemma -> frequency.
    """
    # Bug fix: the original never initialized res when it defaulted to None,
    # so calling lemmatize_count(p) raised TypeError on `ld in res`.
    if res is None:
        res = {}
    q = tokenizer(p)
    junk = {"!", ",", ".", ":", ";", "...", "engineer", "full-stack", "frontend",
            "backend", "software", "year", "years", "python", "javascript",
            "develop", "development", "with", "5+", "application", "of", "solution",
            "design", "developing", "cloud", "platform", "ability", "project", "system",
            "deliver", "user", "web", "java", "react"} | stop_words
    for tok in q:
        if tok in junk:
            continue
        ld = lemmatize(tok)
        # Bug fix: the original's `else` branch inserted junk lemmas with a
        # count of 1; a lemma that lands in junk must be skipped entirely.
        if ld in junk:
            continue
        res[ld] = res.get(ld, 0) + 1
    return res
def get_first_k_elements(dictionary, k):
    """
    Return a new dict holding the first k key/value pairs of `dictionary`
    in insertion order (all pairs when k exceeds the dict's size).

    Raises:
        TypeError: if `dictionary` is not a dict.
        ValueError: if `k` is not a non-negative integer.
    """
    if not isinstance(dictionary, dict):
        raise TypeError("Input must be a dictionary.")
    if not isinstance(k, int) or k < 0:
        raise ValueError("k must be a non-negative integer.")
    # Walk the items in order, stopping once k pairs have been copied.
    result = {}
    for index, (key, value) in enumerate(dictionary.items()):
        if index == k:
            break
        result[key] = value
    return result
# visualization
def bar_plot(d):
    """
    Render a grey bar chart of word frequencies and display it.

    Parameters:
        d: dict mapping word -> frequency; keys become the x-axis labels.
    """
    words = list(d.keys())
    counts = list(d.values())
    plt.bar(words, counts, color="grey")
    plt.xlabel("Word")
    plt.ylabel("Frequency")
    plt.title("Most Frequent Words in Descriptions")
    plt.xticks(rotation=30)  # tilt labels so longer words stay readable
    plt.show()
if __name__ == "__main__":
    # Read n paragraphs from stdin, accumulate lemma frequencies across all
    # of them, then plot the (at most) 10 most frequent words.
    n = int(input("num of queries: "))
    res = {}
    for _ in range(n):  # loop index was unused; `_` makes that explicit
        p = input("paragraph: ")
        res = lemmatize_count(p, res)
    # Rebuild the dict ordered by descending frequency (insertion order is
    # preserved, so slicing the first k entries yields the top-k words).
    ordered = sorted(res, key=res.get, reverse=True)
    s = {key: res[key] for key in ordered}
    limit = min(10, len(s))  # idiomatic replacement for the conditional expression
    bar_plot(get_first_k_elements(s, limit))