forked from carsonprindle/team_survivor
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclean.py
More file actions
70 lines (55 loc) · 2.14 KB
/
clean.py
File metadata and controls
70 lines (55 loc) · 2.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# Cleaning program
# Read the labelled training data.
import pandas as pd

# quoting=3 is the numeric value of csv.QUOTE_NONE, i.e. quote characters
# in the file get no special treatment while parsing.
train = pd.read_csv(
    "labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

# Relic of the movie-review example: selects which review to read.
rev = 32

# Import BeautifulSoup
from bs4 import BeautifulSoup
import re

# Build the BeautifulSoup object for that single movie review, parsed
# with the html5lib parser.
example1 = BeautifulSoup(train["review"][rev], "html5lib")
# Function returning the list of lower-case words contained in a string
def just_words(input):
    """Return the lower-cased alphabetic words found in ``input``.

    Every character other than an ASCII letter is treated as a separator,
    so digits and punctuation never end up inside a word.

    Args:
        input: Text to split. The name shadows the builtin inside this
            function; it is kept so that keyword callers keep working.

    Returns:
        List of words in order of appearance, duplicates included.
    """
    # Replace anything that is not a letter with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", input)
    # split() without arguments collapses runs of whitespace, so the padding
    # introduced above never yields empty strings.
    return letters_only.lower().split()
# Just the words from the input
in_words = just_words(example1.get_text())

# Get stopwords
import nltk
from nltk.corpus import stopwords  # Import the stop word list

# Build the stopword set once, up front: written inside the comprehension's
# condition it would be rebuilt for every single input word.
stop_words = set(stopwords.words("english"))
# Create new list of input words without stopwords using list comprehension
trim = [w for w in in_words if w not in stop_words]

# Import collections to count the number of duplicates
import collections
count = collections.Counter(trim)
#print (count)
# Function taking two word lists; returns a dictionary mapping each word of the first list to 1 if it also appears in the second list, else 0
def match(list, comp_list):
    """Flag each word of ``list`` by whether it occurs in ``comp_list``.

    Only the two arguments are consulted, so the result does not depend on
    any module-level state.

    Args:
        list: Iterable of words to flag. The name shadows the builtin
            inside this function; it is kept so that keyword callers keep
            working.
        comp_list: Collection of words to compare against.

    Returns:
        A dict mapping every distinct word of ``list`` to 1 if it appears
        in ``comp_list`` and 0 otherwise. Repeated words collapse into a
        single key.
    """
    return {w: 1 if w in comp_list else 0 for w in list}
# Comparison Text
comp = "faithful adaptation, humerous, this is the best, down with the bourgeois"
comp_words = just_words(comp)

# Dictionary with input compared to comparison list with 1's and 0's
# (from here on ``comp`` refers to this dictionary, not the text above).
comp = match(trim, comp_words)
print(comp)

# Print just the 1's and 0's
flags = list(comp.values())
print(flags)

# Counter of similarities
counter = collections.Counter(flags)
print(counter.values())

# Fraction (0.0 to 1.0) of the flagged words that appeared in the compared
# text. When there are no words at all the result is 0.0 rather than a
# division by zero.
total = counter[0] + counter[1]
sim = counter[1] / total if total else 0.0
print("Percent Similarity: " + str(sim))