-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
84 lines (74 loc) · 3.16 KB
/
main.py
File metadata and controls
84 lines (74 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import pickle
# Word list read by main(); one candidate word per line (only 5-letter words are kept).
DEFAULT_WORD_SOURCE_FILENAME = "words.txt"
# Frequency list read by main(); each line is "word # # #..." and the file is
# expected to be sorted with the most common word first.
DEFAULT_WORD_FREQUENCY_FILENAME = "word_frequencies.txt"
# Destination file that main() writes the pickled dictionary to.
DEFAULT_DICTIONARY_OUTPUT_FILENAME = "dictionary.pickle"
def main(word_source_filename=None, word_frequency_filename=None,
         dictionary_output_filename=None):
    """Build the linked word dictionary and pickle it to disk.

    The dictionary is a hash map that doubles as a doubly linked list. Each
    key is a five-letter word from the word list. Each value is a character
    position map: its keys are the unique letters of the word and its values
    are the sets of indices where that letter appears.

    example: "books":
        'b' -> {0}
        'o' -> {1, 2}
        'k' -> {3}
        's' -> {4}

    Every position map additionally holds the keys "prev" and "next", which
    name the word's neighbors in the chain. So for the example of "books"
    above, the value of dictionary["books"] could end up being:
        'b' -> {0}
        'o' -> {1, 2}
        'k' -> {3}
        's' -> {4}
        'prev' -> "rugby"
        'next' -> "roman"

    The chain follows the order of the frequency file (most common word
    first); words that do not appear in the frequency file are appended
    afterwards in word-list order. The head of the chain has "prev" == ''
    and the tail has "next" == ''. The head word itself is stored under the
    extra key "FIRST_WORD" ('' when no word was loaded).

    The number of dictionary entries (including "FIRST_WORD") is printed and
    the dictionary is pickled to the output file.

    Args:
        word_source_filename: path of the word list, one word per line.
            Defaults to DEFAULT_WORD_SOURCE_FILENAME.
        word_frequency_filename: path of the frequency list, whose lines look
            like "word # # #...". Defaults to DEFAULT_WORD_FREQUENCY_FILENAME.
        dictionary_output_filename: path the pickle is written to. Defaults
            to DEFAULT_DICTIONARY_OUTPUT_FILENAME.

    Raises:
        OSError: if an input file cannot be read or the output file cannot
            be written.
    """
    # Defaults are resolved here rather than in the signature so the module
    # constants are only looked up when a path is actually omitted.
    if word_source_filename is None:
        word_source_filename = DEFAULT_WORD_SOURCE_FILENAME
    if word_frequency_filename is None:
        word_frequency_filename = DEFAULT_WORD_FREQUENCY_FILENAME
    if dictionary_output_filename is None:
        dictionary_output_filename = DEFAULT_DICTIONARY_OUTPUT_FILENAME

    with open(word_source_filename, 'r') as source_file:
        dictionary = _build_position_maps(source_file)

    with open(word_frequency_filename, 'r') as frequencies_file:
        ordered_words = _order_by_frequency(frequencies_file, dictionary)

    # Link first, then add the sentinel key, so "FIRST_WORD" is never
    # mistaken for a word while the chain is being built.
    dictionary["FIRST_WORD"] = _link_words(dictionary, ordered_words)

    print(len(dictionary))
    with open(dictionary_output_filename, 'wb') as output:
        pickle.dump(dictionary, output, protocol=pickle.HIGHEST_PROTOCOL)


def _build_position_maps(lines):
    """Return {word: position map} for every five-letter word in *lines*."""
    dictionary = {}
    for line in lines:
        word = line.strip()
        # only add words with 5 letters
        if len(word) != 5:
            continue
        # positions map: letter -> {positions}
        positions = {}
        for index, letter in enumerate(word):
            positions.setdefault(letter, set()).add(index)
        # chain links, filled in later by _link_words
        positions["prev"] = ''
        positions["next"] = ''
        dictionary[word] = positions
    return dictionary


def _order_by_frequency(lines, dictionary):
    """Return all dictionary words: frequency-file order first, the rest after."""
    ordered_words = []
    seen = set()
    for line in lines:
        # format is: word # # #...  (only care about the word)
        fields = line.split()
        if not fields:
            continue
        word = fields[0]
        # A word repeated in the frequency file keeps its first (most common)
        # position; linking it a second time would create a cycle.
        if word in dictionary and word not in seen:
            seen.add(word)
            ordered_words.append(word)
    # not every word in the dictionary is in the frequency file, so append
    # the remaining words in the order they were loaded
    ordered_words.extend(word for word in dictionary if word not in seen)
    return ordered_words


def _link_words(dictionary, ordered_words):
    """Chain *ordered_words* through "prev"/"next"; return the head word or ''."""
    for prev_word, word in zip(ordered_words, ordered_words[1:]):
        dictionary[prev_word]["next"] = word
        dictionary[word]["prev"] = prev_word
    return ordered_words[0] if ordered_words else ''
# Build and pickle the dictionary only when this file is run as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()