This repository was archived by the owner on Jul 17, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy path: seed_database.py
More file actions
98 lines (88 loc) · 3.56 KB
/
seed_database.py
File metadata and controls
98 lines (88 loc) · 3.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from flask_sqlalchemy import SQLAlchemy
import re
from models import *
import nltk
from nltk.corpus import cmudict
import curses
from curses.ascii import isdigit
d = cmudict.dict()
def scrubText(text, punctuation):
    # Split every line on the given punctuation mark, flatten the pieces
    # into one sequence, and drop the empty fragments the splits produce.
    fragments = []
    for line in text:
        fragments.extend(line.split(punctuation))
    return filter(None, fragments)
def parseIntoProbabilityHash(text, existingHash):
    """Tally adjacent-word pairs from every string in *text* into existingHash.

    Each string is first stripped of punctuation (apostrophes and whitespace
    kept), then split into words; build_hash counts each consecutive pair.
    Returns the mutated existingHash.
    """
    # BUG FIX: the stripped strings were computed but never used -- the
    # original split the raw `text` instead, defeating the "failsafe".
    # r"" also replaces the Python-2-only ur"" prefix, and the plain loop
    # avoids the IndexError the old `[...][0]` raised when text was empty.
    strippedStrings = [re.sub(r"[^\w\d'\s]+", '', string) for string in text]
    for section in strippedStrings:
        build_hash(existingHash, section.split())
    return existingHash
def build_hash(existingHash, listToFormat):
    # Tally every adjacent pair of words whose lowercase forms both appear
    # in the CMU pronouncing dictionary `d`; counts accumulate in
    # existingHash, which is also returned for convenience.
    lowered = [word.lower() for word in listToFormat]
    for first, second in zip(lowered, lowered[1:]):
        if first in d and second in d:
            format_hash(existingHash, first + " " + second)
    return existingHash
def format_hash(existingHash, twoWordString):
    # Bump the tally for this word pair, starting from zero on first sight.
    existingHash[twoWordString] = existingHash.get(twoWordString, 0) + 1
    return existingHash
def createUnigram(unigramSourcePair, count):
    # Persist one word-pair row: the pair arrives as "word1 word2" and is
    # split on the single space separating the two words.
    parts = unigramSourcePair.split(" ")
    row = Unigram(word1=parts[0], word2=parts[1], count=count)
    db.session.add(row)
    db.session.commit()
def unicodetoascii(text):
    """Replace common "smart" punctuation with plain ASCII look-alikes.

    Accepts raw UTF-8 bytes or already-decoded text, translates curly
    quotes, dashes, primes, superscript operators, and e-acute to ASCII,
    and returns the result encoded as ASCII with anything unmapped dropped.
    """
    # Keys are Unicode codepoints written directly, replacing the original's
    # fragile `'\xe2\x80\x99'.decode('utf-8')` construction, which was
    # Python-2-only and silently carried two duplicate (dead) entries for
    # U+201C and U+2014.
    uni2ascii = {
        0x2019: ord("'"),   # right single quotation mark
        0x201C: ord('"'),   # left double quotation mark
        0x201D: ord('"'),   # right double quotation mark
        0x201E: ord('"'),   # double low-9 quotation mark
        0x201F: ord('"'),   # double high-reversed-9 quotation mark
        0x00E9: ord('e'),   # e with acute accent
        0x2013: ord('-'),   # en dash
        0x2012: ord('-'),   # figure dash
        0x2014: ord('-'),   # em dash
        0x2018: ord("'"),   # left single quotation mark
        0x201B: ord("'"),   # single high-reversed-9 quotation mark
        0x2010: ord('-'),   # hyphen
        0x2011: ord('-'),   # non-breaking hyphen
        0x2032: ord("'"),   # prime
        0x2033: ord("'"),   # double prime
        0x2034: ord("'"),   # triple prime
        0x2035: ord("'"),   # reversed prime
        0x2036: ord("'"),   # reversed double prime
        0x2037: ord("'"),   # reversed triple prime
        0x207A: ord("+"),   # superscript plus
        0x207B: ord("-"),   # superscript minus
        0x207C: ord("="),   # superscript equals
        0x207D: ord("("),   # superscript left parenthesis
        0x207E: ord(")"),   # superscript right parenthesis
    }
    # Decode only when given raw bytes, so already-decoded text also works.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    return text.translate(uni2ascii).encode('ascii', 'ignore')
files = ['example_poetry/your_text_file.txt']
def seedDatabase(files):
    """Read each text file, count adjacent word pairs, and persist them.

    Every file is flattened to one long string, repeatedly split apart on
    sentence punctuation, tallied into a pair-count hash, and each pair is
    written to the database via createUnigram.
    """
    hashed_haikus = {}
    for txtfile in files:
        # BUG FIX: the original opened the file and never closed it; the
        # context manager guarantees the handle is released.
        with open(txtfile, "r") as haikuFile:
            haikus = haikuFile.readlines()
        haikus = [line.replace("\n", "") for line in haikus]
        haikus = [unicodetoascii(line) for line in haikus]
        haikus = [" ".join(haikus)]
        # Progressively split the text on every punctuation mark, dropping
        # the empty fragments each pass produces.
        punctuationList = [".", "?", "!", ":", ";", "(", ")", "/", ","]
        for punctuation in punctuationList:
            haikus = scrubText(haikus, punctuation)
        hashed_haikus = parseIntoProbabilityHash(haikus, hashed_haikus)
    for sourcePair, count in hashed_haikus.items():
        createUnigram(sourcePair, count)
# Module-level entry point: seeding runs on import, as in the original script.
seedDatabase(files)