forked from auberonedu/ramblebot
-
Notifications
You must be signed in to change notification settings - Fork 27
Expand file tree
/
Copy pathUnigramWordPredictor.java
More file actions
165 lines (148 loc) · 5.83 KB
/
UnigramWordPredictor.java
File metadata and controls
165 lines (148 loc) · 5.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
/**
* A class for predicting the next word in a sequence using a unigram model.
* The model is trained on input text and maps each word to a list of
* words that directly follow it in the text.
*/
public class UnigramWordPredictor implements WordPredictor {
  /** Maps each word to the list of words observed immediately after it in the training text. */
  private Map<String, List<String>> neighborMap;

  /** Tokenizer used to split training text into word/punctuation tokens. */
  private Tokenizer tokenizer;

  /**
   * Constructs a UnigramWordPredictor with the specified tokenizer.
   *
   * @param tokenizer the tokenizer used to process the input text
   */
  public UnigramWordPredictor(Tokenizer tokenizer) {
    this.tokenizer = tokenizer;
  }

  /**
   * Trains the predictor using the text provided by the Scanner.
   * The method tokenizes the text and builds a map where each word
   * is associated with a list of words that immediately follow it
   * in the text. The resultant map is stored in the neighborMap
   * instance variable.
   *
   * For example:
   * If the input text is: "The cat sat. The cat slept. The dog barked."
   * After tokenizing, the tokens would be:
   * ["the", "cat", "sat", ".", "the", "cat", "slept", ".", "the", "dog", "barked", "."]
   *
   * The resulting map (neighborMap) would be:
   * {
   *   "the" -> ["cat", "cat", "dog"],
   *   "cat" -> ["sat", "slept"],
   *   "sat" -> ["."],
   *   "."   -> ["the", "the"],
   *   "slept" -> ["."],
   *   "dog" -> ["barked"],
   *   "barked" -> ["."]
   * }
   *
   * The order of the map and the order of each list is not important.
   *
   * @param scanner the Scanner to read the training text from
   */
  public void train(Scanner scanner) {
    List<String> tokens = tokenizer.tokenize(scanner);
    Map<String, List<String>> prepNeighborMap = new HashMap<>();

    // Pair each token with its successor. computeIfAbsent creates the follower
    // list on first sight, replacing the containsKey/get/put dance; since the
    // list returned is the one stored in the map, mutating it in place is enough
    // (the original's re-put of the same list reference was redundant).
    for (int i = 0; i < tokens.size() - 1; i++) {
      String current = tokens.get(i);
      String next = tokens.get(i + 1);
      if (current == null || next == null) {
        // Defensive: skip a null token rather than aborting all remaining
        // training (the original 'break' discarded every pair after the first
        // null, silently truncating the model).
        continue;
      }
      prepNeighborMap.computeIfAbsent(current, k -> new ArrayList<>()).add(next);
    }

    this.neighborMap = prepNeighborMap;
  }

  /**
   * Predicts the next word based on the given context.
   * The prediction is made by randomly selecting from all words
   * that follow the last word in the context in the training data.
   *
   * For example:
   * If the input text is: "The cat sat. The cat slept. The dog barked."
   *
   * The resulting map (neighborMap) would be:
   * {
   *   "the" -> ["cat", "cat", "dog"],
   *   "cat" -> ["sat", "slept"],
   *   "sat" -> ["."],
   *   "."   -> ["the", "the"],
   *   "slept" -> ["."],
   *   "dog" -> ["barked"],
   *   "barked" -> ["."]
   * }
   *
   * When predicting the next word given a context, the predictor selects a word
   * based on the observed frequencies in the training data. For example:
   *
   * - If the last word in the context is "the", the next word is randomly chosen
   *   from ["cat", "cat", "dog"]: "cat" has a 2/3 probability of being selected
   *   and "dog" a 1/3 probability, reflecting the original distribution of words
   *   following "the" in the text.
   *
   * - If the last word in the context is "cat", the next word is randomly chosen
   *   from ["sat", "slept"], giving each an equal 1/2 probability.
   *
   * - If the last word in the context is ".", the next word is always "the",
   *   since it's the only observed follower.
   *
   * - If the last word in the context is "dog", the next word is "barked" because
   *   "barked" is the only word that follows "dog" in the training data.
   *
   * @param context a list of words representing the current context
   * @return the predicted next word, or null if no prediction can be made
   *         (untrained model, empty/null context, or a last word with no
   *         recorded followers)
   */
  public String predictNextWord(List<String> context) {
    // Honor the documented contract: return null instead of throwing when no
    // prediction is possible (the original NPE'd on unseen words, an empty
    // context, or an untrained model).
    if (neighborMap == null || context == null || context.isEmpty()) {
      return null;
    }

    // get(size()-1) is equivalent to getLast() without requiring Java 21.
    List<String> followers = neighborMap.get(context.get(context.size() - 1));
    if (followers == null || followers.isEmpty()) {
      return null;
    }

    // Uniform pick over the follower multiset reproduces the observed
    // frequencies (duplicates weight the distribution).
    int randIndex = (int) (Math.random() * followers.size());
    return followers.get(randIndex);
  }

  /**
   * Returns a copy of the neighbor map. The neighbor map is a mapping
   * from each word to a list of words that have followed it in the training data.
   *
   * You do not need to modify this method for your project.
   *
   * @return a copy of the neighbor map
   */
  public Map<String, List<String>> getNeighborMap() {
    Map<String, List<String>> copy = new HashMap<>();
    for (Map.Entry<String, List<String>> entry : neighborMap.entrySet()) {
      // Copy each list so callers cannot mutate the model through the result.
      List<String> newList = new ArrayList<>(entry.getValue());
      copy.put(entry.getKey(), newList);
    }
    return copy;
  }
}