|
| 1 | +package main |
| 2 | + |
| 3 | +import ( |
| 4 | + "fmt" |
| 5 | + "io/ioutil" |
| 6 | + "log" |
| 7 | + "strings" |
| 8 | + "time" |
| 9 | +) |
| 10 | + |
// debugMode is a user-adjustable switch for the diagnostic output.
// Set to true to print various extra stuff out (slows down the execution).
// Set to false for quick and quiet execution.
const debugMode bool = true
| 15 | + |
| 16 | +// Implementation of Basic Aho-Corasick algorithm (Prefix based). |
| 17 | +// Searches for a set of strings (patterns.txt) in text.txt. |
| 18 | +func main() { |
| 19 | + patFile, err := ioutil.ReadFile("patterns.txt") |
| 20 | + if err != nil { |
| 21 | + log.Fatal(err) |
| 22 | + } |
| 23 | + textFile, err := ioutil.ReadFile("text.txt") |
| 24 | + if err != nil { |
| 25 | + log.Fatal(err) |
| 26 | + } |
| 27 | + patterns := strings.Split(string(patFile), " ") |
| 28 | + fmt.Printf("\nRunning: Basic Aho-Corasick algorithm.\n\n") |
| 29 | + if debugMode == true { |
| 30 | + fmt.Printf("Searching for %d patterns/words:\n", len(patterns)) |
| 31 | + } |
| 32 | + for i := 0; i < len(patterns); i++ { |
| 33 | + if len(patterns[i]) > len(textFile) { |
| 34 | + log.Fatal("There is a pattern that is longer than text! Pattern number:", i+1) |
| 35 | + } |
| 36 | + if debugMode == true { |
| 37 | + fmt.Printf("%q ", patterns[i]) |
| 38 | + } |
| 39 | + } |
| 40 | + if debugMode == true { |
| 41 | + fmt.Printf("\n\nIn text (%d chars long): \n%q\n\n", len(textFile), textFile) |
| 42 | + } |
| 43 | + ahoCorasick(string(textFile), patterns) |
| 44 | +} |
| 45 | + |
| 46 | +// Function performing the Basic Aho-Corasick alghoritm. |
| 47 | +// Finds and prints occurences of each pattern. |
| 48 | +func ahoCorasick(t string, p []string) { |
| 49 | + startTime := time.Now() |
| 50 | + occurences := make(map[int][]int) |
| 51 | + ac, f, s := buildAc(p) |
| 52 | + if debugMode == true { |
| 53 | + fmt.Printf("\n\nAC:\n\n") |
| 54 | + } |
| 55 | + current := 0 |
| 56 | + for pos := 0; pos < len(t); pos++ { |
| 57 | + if debugMode == true { |
| 58 | + fmt.Printf("Position: %d, we read: %c", pos, t[pos]) |
| 59 | + } |
| 60 | + for getTransition(current, t[pos], ac) == -1 && s[current] != -1 { |
| 61 | + current = s[current] |
| 62 | + } |
| 63 | + if getTransition(current, t[pos], ac) != -1 { |
| 64 | + current = getTransition(current, t[pos], ac) |
| 65 | + fmt.Printf(" (Continue) \n") |
| 66 | + } else { |
| 67 | + current = 0 |
| 68 | + if debugMode == true { |
| 69 | + fmt.Printf(" (FAIL) \n") |
| 70 | + } |
| 71 | + } |
| 72 | + _, ok := f[current] |
| 73 | + if ok { |
| 74 | + for i := range f[current] { |
| 75 | + if p[f[current][i]] == getWord(pos-len(p[f[current][i]])+1, pos, t) { //check for word match |
| 76 | + if debugMode == true { |
| 77 | + fmt.Printf("Occurence at position %d, %q = %q\n", pos-len(p[f[current][i]])+1, p[f[current][i]], p[f[current][i]]) |
| 78 | + } |
| 79 | + newOccurences := intArrayCapUp(occurences[f[current][i]]) |
| 80 | + occurences[f[current][i]] = newOccurences |
| 81 | + occurences[f[current][i]][len(newOccurences)-1] = pos - len(p[f[current][i]]) + 1 |
| 82 | + } |
| 83 | + } |
| 84 | + } |
| 85 | + } |
| 86 | + elapsed := time.Since(startTime) |
| 87 | + fmt.Printf("\n\nElapsed %f secs\n", elapsed.Seconds()) |
| 88 | + for key, value := range occurences { //prints all occurences of each pattern (if there was at least one) |
| 89 | + fmt.Printf("\nThere were %d occurences for word: %q at positions: ", len(value), p[key]) |
| 90 | + for i := range value { |
| 91 | + fmt.Printf("%d", value[i]) |
| 92 | + if i != len(value)-1 { |
| 93 | + fmt.Printf(", ") |
| 94 | + } |
| 95 | + } |
| 96 | + fmt.Printf(".") |
| 97 | + } |
| 98 | + return |
| 99 | +} |
| 100 | + |
| 101 | +// Functions that builds Aho Corasick automaton. |
| 102 | +func buildAc(p []string) (acToReturn map[int]map[uint8]int, f map[int][]int, s []int) { |
| 103 | + acTrie, stateIsTerminal, f := constructTrie(p) |
| 104 | + s = make([]int, len(stateIsTerminal)) //supply function |
| 105 | + i := 0 //root of acTrie |
| 106 | + acToReturn = acTrie |
| 107 | + s[i] = -1 |
| 108 | + if debugMode == true { |
| 109 | + fmt.Printf("\n\nAC construction: \n") |
| 110 | + } |
| 111 | + for current := 1; current < len(stateIsTerminal); current++ { |
| 112 | + o, parent := getParent(current, acTrie) |
| 113 | + down := s[parent] |
| 114 | + for stateExists(down, acToReturn) && getTransition(down, o, acToReturn) == -1 { |
| 115 | + down = s[down] |
| 116 | + } |
| 117 | + if stateExists(down, acToReturn) { |
| 118 | + s[current] = getTransition(down, o, acToReturn) |
| 119 | + if stateIsTerminal[s[current]] == true { |
| 120 | + stateIsTerminal[current] = true |
| 121 | + f[current] = arrayUnion(f[current], f[s[current]]) //F(Current) <- F(Current) union F(S(Current)) |
| 122 | + if debugMode == true { |
| 123 | + fmt.Printf(" f[%d] set to: ", current) |
| 124 | + for i := range f[current] { |
| 125 | + fmt.Printf("%d\n", f[current][i]) |
| 126 | + } |
| 127 | + } |
| 128 | + } |
| 129 | + } else { |
| 130 | + s[current] = i //initial state? |
| 131 | + } |
| 132 | + } |
| 133 | + if debugMode == true { |
| 134 | + fmt.Printf("\nsupply function: \n") |
| 135 | + for i := range s { |
| 136 | + fmt.Printf("\ns[%d]=%d", i, s[i]) |
| 137 | + } |
| 138 | + fmt.Printf("\n\n") |
| 139 | + for i, j := range f { |
| 140 | + fmt.Printf("f[%d]=", i) |
| 141 | + for k := range j { |
| 142 | + fmt.Printf("%d\n", j[k]) |
| 143 | + } |
| 144 | + } |
| 145 | + } |
| 146 | + return acToReturn, f, s |
| 147 | +} |
| 148 | + |
| 149 | +// Function that constructs Trie as an automaton for a set of reversed & trimmed strings. |
| 150 | +func constructTrie(p []string) (trie map[int]map[uint8]int, stateIsTerminal []bool, f map[int][]int) { |
| 151 | + trie = make(map[int]map[uint8]int) |
| 152 | + stateIsTerminal = make([]bool, 1) |
| 153 | + f = make(map[int][]int) |
| 154 | + state := 1 |
| 155 | + if debugMode == true { |
| 156 | + fmt.Printf("\n\nTrie construction: \n") |
| 157 | + } |
| 158 | + createNewState(0, trie) |
| 159 | + for i := 0; i < len(p); i++ { |
| 160 | + current := 0 |
| 161 | + j := 0 |
| 162 | + for j < len(p[i]) && getTransition(current, p[i][j], trie) != -1 { |
| 163 | + current = getTransition(current, p[i][j], trie) |
| 164 | + j++ |
| 165 | + } |
| 166 | + for j < len(p[i]) { |
| 167 | + stateIsTerminal = boolArrayCapUp(stateIsTerminal) |
| 168 | + createNewState(state, trie) |
| 169 | + stateIsTerminal[state] = false |
| 170 | + createTransition(current, p[i][j], state, trie) |
| 171 | + current = state |
| 172 | + j++ |
| 173 | + state++ |
| 174 | + } |
| 175 | + if stateIsTerminal[current] { |
| 176 | + newArray := intArrayCapUp(f[current]) |
| 177 | + newArray[len(newArray)-1] = i |
| 178 | + f[current] = newArray //F(Current) <- F(Current) union {i} |
| 179 | + if debugMode == true { |
| 180 | + fmt.Printf(" and %d", i) |
| 181 | + } |
| 182 | + } else { |
| 183 | + stateIsTerminal[current] = true |
| 184 | + f[current] = []int{i} //F(Current) <- {i} |
| 185 | + if debugMode == true { |
| 186 | + fmt.Printf("\n%d is terminal for word number %d", current, i) |
| 187 | + } |
| 188 | + } |
| 189 | + } |
| 190 | + return trie, stateIsTerminal, f |
| 191 | +} |
| 192 | + |
// contains reports whether the slice of ints s holds the value e.
//
// Author: Mostafa, http://stackoverflow.com/a/10485970
func contains(s []int, e int) bool {
	for idx := 0; idx < len(s); idx++ {
		if s[idx] == e {
			return true
		}
	}
	return false
}
| 206 | + |
// getWord returns the part of the text t that spans the byte positions begin
// to end, both inclusive. It returns "" when that range is empty or does not
// lie completely inside t.
func getWord(begin, end int, t string) string {
	if begin < 0 || end >= len(t) || begin > end {
		return ""
	}
	return t[begin : end+1]
}
| 218 | + |
// intArrayCapUp returns a copy of old that is one element longer; the added
// last element is 0. The new length is based on len(old), so spare capacity
// of old does not end up as extra elements. old itself is not modified.
func intArrayCapUp(old []int) []int {
	grown := make([]int, len(old)+1)
	copy(grown, old) // copy(dst, src)
	return grown
}
| 226 | + |
// boolArrayCapUp returns a copy of old that is one element longer; the added
// last element is false. The new length is based on len(old), so spare
// capacity of old does not end up as extra elements. old itself is not
// modified.
func boolArrayCapUp(old []bool) []bool {
	grown := make([]bool, len(old)+1)
	copy(grown, old) // copy(dst, src)
	return grown
}
| 234 | + |
// arrayUnion returns the elements of to, followed by those elements of from
// that are not present yet, each of them added once and in the order of their
// first appearance in from. Neither argument is modified. The result is nil
// when both arguments are empty.
func arrayUnion(to, from []int) (concat []int) {
	seen := make(map[int]struct{}, len(to)+len(from))
	for _, v := range to {
		seen[v] = struct{}{}
	}
	// concat starts out nil, so append copies to instead of aliasing it.
	concat = append(concat, to...)
	for _, v := range from {
		if _, dup := seen[v]; dup {
			continue
		}
		seen[v] = struct{}{}
		concat = append(concat, v)
	}
	return concat
}
| 246 | + |
// getParent looks for a transition of the automaton at that ends in state and
// returns the byte it is labelled with together with the state it starts in.
// It is meant for tries, where a state has only one parent. When no
// transition ends in state, it returns 0, 0.
func getParent(state int, at map[int]map[uint8]int) (uint8, int) {
	for from := range at {
		for label, to := range at[from] {
			if to != state {
				continue
			}
			return label, from
		}
	}
	return 0, 0 // no transition leads to state
}
| 259 | + |
| 260 | +// Automaton function for creating a new state 'state'. |
| 261 | +func createNewState(state int, at map[int]map[uint8]int) { |
| 262 | + at[state] = make(map[uint8]int) |
| 263 | + if debugMode == true { |
| 264 | + fmt.Printf("\ncreated state %d", state) |
| 265 | + } |
| 266 | +} |
| 267 | + |
| 268 | +// Creates a transition for function σ(state,letter) = end. |
| 269 | +func createTransition(fromState int, overChar uint8, toState int, at map[int]map[uint8]int) { |
| 270 | + at[fromState][overChar] = toState |
| 271 | + if debugMode == true { |
| 272 | + fmt.Printf("\n σ(%d,%c)=%d;", fromState, overChar, toState) |
| 273 | + } |
| 274 | +} |
| 275 | + |
// getTransition returns the ending state of the transition
// σ(fromState,overChar) in the automaton at, or -1 if there is none: because
// fromState is -1, is not a state of at, or has no transition over overChar.
func getTransition(fromState int, overChar uint8, at map[int]map[uint8]int) (toState int) {
	if fromState == -1 {
		return -1 // -1 stands for "no state" and never has transitions
	}
	// Indexing the nil map of a missing state is safe and reports !ok.
	toState, ok := at[fromState][overChar]
	if !ok {
		return -1
	}
	return toState
}
| 287 | + |
// stateExists reports whether 'state' is a state of the automaton at, i.e.
// whether at holds a non-nil set of transitions for it. The state -1 stands
// for "no state" and never exists.
func stateExists(state int, at map[int]map[uint8]int) bool {
	transitions, ok := at[state]
	return ok && state != -1 && transitions != nil
}
0 commit comments