-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcollocation_make_table.py
More file actions
109 lines (74 loc) · 3.35 KB
/
collocation_make_table.py
File metadata and controls
109 lines (74 loc) · 3.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python3
"""
This program finds collocations in a corpus of text. It can find
the collocations of keywords you enter manually.
"""
#Import packages
from nltk.tokenize import word_tokenize
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures
from nltk.corpus import stopwords
import pandas as pd
# Load the cleaned corpus from CSV; the 'word' column holds the corpus text
tbl = pd.read_csv('/Users/codyotoole/Desktop/Planetary:OneHealth/Health/text/oh_2020_clean.csv')
# read_csv turns empty cells (and tokens such as "NA" or "null") into NaN
# floats and may parse numeric tokens as numbers. Drop the missing values and
# cast the rest to str so that the join below cannot raise TypeError.
text = tbl['word'].dropna().astype(str).tolist()
text = '\n'.join(text)
# Remove every non-ASCII character before tokenizing
data = "".join(i for i in text if ord(i) < 128)
# Tokenize data
tokens = word_tokenize(data)
# Module-level accumulators that get_keyword_collocations appends to:
# co-occurrence counts, association scores, collocate words and raw word pairs
freq = []
score = []
collocate = []
r = []
def get_keyword_collocations(tokens, keyword, windowsize=10, numresults=35):
    """Find the top collocates of ``keyword`` in a tokenized corpus.

    Word pairs that co-occur within ``windowsize`` tokens are collected with
    NLTK's ``BigramCollocationFinder``. Pairs containing an English stopword
    or a one-character token are discarded, and only pairs that include
    ``keyword`` are kept. The remaining pairs are ranked by Student's t-score
    and the best ``numresults`` of them are recorded.

    Args:
        tokens: List of word tokens making up the corpus.
        keyword: Word whose collocates are wanted (exact, case-sensitive match).
        windowsize: Co-occurrence window size handed to the finder.
        numresults: Maximum number of collocations to record.

    Returns:
        A list of ``(collocate, frequency, score)`` tuples for this keyword,
        highest score first.

    Side effects:
        Appends this call's results to the module-level lists ``r`` (the raw
        word pairs), ``collocate``, ``freq`` and ``score``, and prints the
        keyword together with its total count in ``tokens``.
    """
    # Association measures used to score each candidate pair
    bigram_measures = BigramAssocMeasures()
    # Finder that counts every word pair occurring within the window
    finder = BigramCollocationFinder.from_words(tokens, window_size=windowsize)

    # A set keeps the per-token stopword test O(1) instead of scanning a list
    ignored_words = set(stopwords.words('english'))
    finder.apply_word_filter(lambda w: len(w) < 2 or w.lower() in ignored_words)
    finder.apply_freq_filter(1)
    # Filters remove the pairs they return True for, so this drops every pair
    # that does not contain the keyword
    finder.apply_ngram_filter(lambda *w: keyword not in w)

    # Rank by T-score and keep the best results.
    # list of all possible measures: .raw_freq, .pmi, .likelihood_ratio, .chi_sq, .phi_sq, .fisher, .student_t, .mi_like, .poisson_stirling, .jaccard, .dice
    results = finder.score_ngrams(bigram_measures.student_t)[:numresults]

    print("Top collocations for ", str(keyword), ":")
    print('total occurences of'+' '+keyword+':'+' ',tokens.count(keyword))

    # Build the rows from THIS call's results only. Iterating the shared list
    # `r` would re-add the collocates of earlier keywords on every later call
    # and leave `collocate` longer than `freq` and `score`.
    rows = []
    for ngram, ngram_score in results:
        first, second = ngram
        # The collocate is whichever member of the pair is not the keyword
        partner = first if first != keyword else second
        # ngram_fd maps each pair to its count, so look it up directly
        rows.append((partner, finder.ngram_fd[ngram], ngram_score))

    # Publish to the module-level accumulators that the rest of the script reads
    r.extend(ngram for ngram, _ in results)
    for partner, count, ngram_score in rows:
        collocate.append(partner)
        freq.append(count)
        score.append(ngram_score)

    return rows
# Keywords to analyse; edit this list to look up other words
words_of_interest = ["public"]
# Run the collocation search once per keyword; each call appends its results
# to the module-level lists collocate, freq and score
for word in words_of_interest:
    get_keyword_collocations(tokens, word)
# Assemble the collected columns into a table and write it out as CSV
df = pd.DataFrame(data={
    'collocate': collocate,
    'frequency': freq,
    'score': score,
})
df.to_csv('/Users/codyotoole/Desktop/oh_public_2020.csv')