-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtfidf.py
More file actions
67 lines (62 loc) · 2.14 KB
/
tfidf.py
File metadata and controls
67 lines (62 loc) · 2.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import numpy as np
import pandas as pd
import nltk
import re
from collections import Counter
import sys
# nltk.download('punkt')
# nltk.download('stopwords')
def encode(text):
    """Turn raw text into a list of lowercase, stopword-free tokens.

    Steps: delete every character that is not an ASCII letter, a digit or
    whitespace; lowercase and strip the result; tokenize it with NLTK's
    ``word_tokenize``; drop tokens found in NLTK's English stopword list.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in their order of appearance (duplicates kept).

    Note:
        Presumably requires the NLTK 'punkt' and 'stopwords' data packages to
        be installed -- confirm against the deployment environment.
    """
    # A set gives O(1) membership tests in the filter below.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    # Remove all non-alphanumeric characters. The fourth positional parameter
    # of re.sub is `count`, not `flags`: passing `re.I|re.A` there capped the
    # number of removals at 258 and never applied the flags. The character
    # class already lists both letter cases, so no flags are needed and the
    # matching semantics stay exactly as before, minus the accidental cap.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Lowercase, then trim leading/trailing whitespace.
    text = text.lower().strip()
    # Split the cleaned text into word tokens.
    tokens = nltk.tokenize.word_tokenize(text)
    # Keep only the tokens that are NOT stopwords.
    return [token for token in tokens if token not in stopwords]
def tf(text):
    """Compute the relative frequency of every token in ``text``.

    The text is tokenized with ``encode``; each distinct token is then mapped
    to (number of occurrences) / (total number of tokens).

    Args:
        text: The string to analyse.

    Returns:
        A ``Counter`` mapping token -> relative frequency (float). It is empty
        when ``encode`` yields no tokens.
    """
    words = encode(text)
    total = float(len(words))
    occurrences = Counter(words)
    # Rebuild the Counter with each raw count scaled by the token total.
    return Counter({word: count / total for word, count in occurrences.items()})
def idf(text):
    """Compute a log-inverse-frequency weight for every token in ``text``.

    The text is tokenized with ``encode``; each distinct token is mapped to
    ``log(total number of tokens / occurrences of that token)``.

    NOTE(review): the counts are token occurrences inside this single text,
    not the number of documents containing the token, so this is not a
    corpus-level IDF -- confirm that this is the intended definition.

    Args:
        text: The string to analyse.

    Returns:
        A dict mapping token -> weight, as computed by ``np.log``.
    """
    words = encode(text)
    total = len(words)
    # Counter keeps first-occurrence order, so the dict order is unchanged.
    return {
        word: np.log(total / float(count))
        for word, count in Counter(words).items()
    }
def tfidf(text):
    """Compute an L2-normalized tf-idf weight for every token in ``text``.

    Each token's weight is ``tf(text)[token] * idf(text)[token]``; the whole
    vector is then divided by its Euclidean norm so that it has unit length.

    Args:
        text: The string to analyse.

    Returns:
        A dict mapping token -> normalized weight. When the norm is zero (no
        tokens, or every raw weight is 0) the raw weights are returned as-is
        instead of dividing by zero.
    """
    _tf = tf(text)
    _idf = idf(text)
    # Build the complete raw vector first: normalizing entries while the
    # vector is still being filled divides by the norm of a partial (and
    # partly normalized) vector, which never yields a unit-length result.
    weights = {token: _tf[token] * _idf[token] for token in _tf}
    norm = np.linalg.norm(list(weights.values()))
    if norm == 0:
        # Nothing to scale; avoids 0/0 -> NaN.
        return weights
    return {token: weight / norm for token, weight in weights.items()}
# Example
# With no command-line argument (sys.argv holds only the script name), use a
# built-in sample text; otherwise read the text from the file named by the
# first argument.
if len(sys.argv) == 1:
    d = "The quick brown fox jumped over the lazy dog. The dog slept over the verandah."
else:
    # Context manager so the file handle is closed as soon as it is read.
    with open(sys.argv[1]) as f:
        d = f.read()
# Show the input text, then its tf-idf weights.
print(d)
d = tfidf(d)
print(d)