|
| 1 | +#!/usr/bin/python |
| 2 | +# -*- coding: UTF-8 -*- |
| 3 | + |
| 4 | +from trie_ import Node, Trie |
| 5 | +from queue import Queue |
| 6 | + |
| 7 | + |
class ACNode(Node):
    """Trie node carrying the extra fields used by the Aho-Corasick automaton.

    Attributes are documented inline in ``__init__``; both are filled in by
    ``build_failure_pointer`` once the trie has been populated.
    """

    def __init__(self, c: str):
        super().__init__(c)
        # Number of characters on the path from the root to this node
        # (0 until build_failure_pointer assigns it).
        self.length = 0
        # Failure link: the node to fall back to on a mismatch
        # (None until build_failure_pointer assigns it).
        self.fail = None

    def insert_child(self, c: str):
        """Attach a new ``ACNode`` for character *c* as a child of this node."""
        child = ACNode(c)
        self._insert_child(child)
| 16 | + |
| 17 | + |
class ACTrie(Trie):
    """Trie whose root is an ``ACNode`` so that it can carry failure links."""

    def __init__(self):
        # NOTE(review): Trie.__init__ is intentionally not invoked here, exactly
        # as before; only the root attribute is set up.
        root_node = ACNode(None)
        self.root = root_node
| 21 | + |
| 22 | + |
def ac_automata(main: str, ac_trie: ACTrie) -> list:
    """Find every occurrence in *main* of the patterns stored in *ac_trie*.

    The failure pointers of *ac_trie* are (re)built on every call before the
    scan starts.

    Args:
        main: the text to scan.
        ac_trie: trie holding the patterns to look for.

    Returns:
        A list of ``(start, end)`` tuples with inclusive indices into *main*,
        in the order the matches are discovered; the matched text of an entry
        is ``main[start:end + 1]``.
    """
    root = ac_trie.root
    build_failure_pointer(ac_trie)

    matches = []
    state = root
    for pos, ch in enumerate(main):
        # On a mismatch, fall back along the failure links until some node
        # can consume ``ch`` or we are back at the root.
        while state != root and not state.has_child(ch):
            state = state.fail

        if not state.has_child(ch):
            # Only reachable at the root: no pattern continues with ``ch``.
            continue

        state = state.get_child(ch)

        # Every node on the failure chain of the new state is a suffix of
        # what has been matched so far; report those that end a pattern.
        candidate = state
        while candidate != root:
            if candidate.is_ending_char:
                matches.append((pos - candidate.length + 1, pos))
            candidate = candidate.fail

    return matches
| 43 | + |
| 44 | + |
def build_failure_pointer(ac_trie: ACTrie) -> None:
    """Set the ``fail`` link and ``length`` of every node in *ac_trie*.

    Nodes are visited breadth-first from the root, so by the time a node is
    processed the failure links of its parent and of all shallower nodes are
    already in place.

    * ``root.fail`` is set to ``None``.
    * Each child's ``length`` is its parent's ``length`` plus one.
    * A direct child of the root fails to the root.  Any other node fails to
      the child (for the same character) of the first node on its parent's
      failure chain that has such a child, or to the root if none has.

    Args:
        ac_trie: the trie to update in place.

    Returns:
        None.
    """
    # A plain deque is enough for a single-threaded breadth-first walk;
    # queue.Queue would take a lock on every put/get for no benefit.
    from collections import deque

    root = ac_trie.root
    root.fail = None

    pending = deque([root])
    while pending:
        parent = pending.popleft()
        for child in parent.children:
            child.length = parent.length + 1
            if parent == root:
                child.fail = root
            else:
                # Same idea as KMP: walk the parent's failure chain until a
                # node that can be extended by this child's character.
                fallback = parent.fail
                while fallback != root and not fallback.has_child(child.data):
                    fallback = fallback.fail

                # Either the walk stopped at the root, or it stopped at a
                # node that has a child for ``child.data``.
                if fallback.has_child(child.data):
                    child.fail = fallback.get_child(child.data)
                else:
                    child.fail = root
            pending.append(child)
| 73 | + |
| 74 | + |
def mask_ranges(text: str, ranges: list) -> str:
    """Return *text* with every given index range overwritten by ``'*'``.

    Masking is done by position, so only the characters inside the ranges
    change; other occurrences of the same substring elsewhere in *text* are
    left alone, and overlapping ranges are handled naturally.

    Args:
        text: the string to mask.
        ranges: iterable of ``(start, end)`` tuples with inclusive indices.

    Returns:
        A new string of the same length as *text*.
    """
    chars = list(text)
    for start, end in ranges:
        # Size the replacement from the actual slice so the overall length
        # can never change, even for a range that runs past the end.
        covered = len(chars[start:end + 1])
        chars[start:end + 1] = ['*'] * covered
    return ''.join(chars)


if __name__ == '__main__':
    # Small demo: censor a fixed word list in a sample sentence.
    ac_trie = ACTrie()
    ac_trie.gen_tree(['fuck', 'shit', 'TMD', '傻叉'])

    print('--- ac automata ---')
    m_str = 'fuck you, what is that shit, TMD你就是个傻叉傻叉傻叉叉'
    print('original str  : {}'.format(m_str))

    filter_range_list = ac_automata(m_str, ac_trie)
    # Mask by index range rather than with str.replace(): replace() rewrites
    # every occurrence of the slice's text, which can corrupt unrelated parts
    # of the string once earlier matches have introduced '*' characters.
    str_filtered = mask_ranges(m_str, filter_range_list)

    print('after filtered: {}'.format(str_filtered))
0 commit comments