@@ -1,36 +1,29 @@
 # -*- coding: utf-8 -*-
+#
+# SPDX-License-Identifier: LicenseRef-scancode-public-domain
+# See https://github.com/nexB/license-expression for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
 """
-Aho-Corasick string search algorithm.
+Aho-Corasick string search algorithm in pure Python

 Original Author: Wojciech Muła, [email protected]
 WWW : http://0x80.pl
 License : public domain

-Modified for use in the license_expression library:
+This is the pure Python Aho-Corasick automaton from pyahocorasick modified for
+use in the license_expression library for advanced tokenization:
+
 - add support for unicode strings.
 - case insensitive search using sequence of words and not characters
 - improve returned results with the actual start,end and matched string.
 - support returning non-matched parts of a string
 """
-
-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import unicode_literals
-
 from collections import deque
 from collections import OrderedDict
 import logging
 import re

-# Python 2 and 3 support
-try:
-    # Python 2
-    unicode
-    str = unicode  # NOQA
-except NameError:
-    # Python 3
-    unicode = str  # NOQA
-
 TRACE = False

 logger = logging.getLogger(__name__)
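For context, here is a minimal usage sketch of the tokenization behavior the
docstring above describes. The import path and sample strings are illustrative
assumptions, not part of this diff; Trie, add(), and iter() appear in the hunks
below, and make_automaton() is assumed from the pyahocorasick API this module
ports.

    # Hedged sketch: word-based, case-insensitive matching with unmatched parts.
    from license_expression._pyahocorasick import Trie  # assumed import path

    trie = Trie()
    trie.add('mit license')  # with no explicit value, the string itself is stored
    trie.make_automaton()    # build the automaton before searching

    # 'mit license' matches 'MIT License' because matching is case insensitive
    # and works on sequences of words rather than characters.
    for tok in trie.iter('The MIT License applies', include_unmatched=True):
        print(tok)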
@@ -109,7 +102,7 @@ def add(self, tokens_string, value=None): |
         provided value, typically a Token object. If a value is not provided,
         the tokens_string is used as value.

-        A tokens_string is any unicode string. It will be tokenized when added
+        A tokens_string is any string. It will be tokenized when added
         to the Trie.
         """
         if self._converted:
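A short sketch of add() as documented in this hunk: the value defaults to the
tokens_string itself, and any object can be attached instead. The license
strings here are made up for illustration; the self._converted guard presumably
rejects additions once the trie has been converted to an automaton (its body
falls outside this hunk).

    trie = Trie()
    trie.add('apache 2.0')  # value defaults to the string 'apache 2.0'
    trie.add('gpl 2.0', value=('LICENSE-KEY', 'GPL-2.0'))  # any object as value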
@@ -326,7 +319,12 @@ def iter(self, tokens_string, include_unmatched=False, include_space=False): |
                 if include_unmatched:
                     n = len(token_string)
                     start_pos = end_pos - n + 1
-                    tok = Token(start_pos, end_pos, tokens_string[start_pos: end_pos + 1], None)
+                    tok = Token(
+                        start=start_pos,
+                        end=end_pos,
+                        string=tokens_string[start_pos: end_pos + 1],
+                        value=None
+                    )
                     if TRACE:
                         logger_debug(' unmatched tok:', tok)
                     yield tok
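To make the new keyword-argument construction concrete, here is an illustrative
sketch of what iter() yields for unmatched words, assuming Token exposes the
start/end/string/value fields used above. The sample text and positions are
made up, but they follow the inclusive start/end arithmetic in this hunk.

    trie = Trie()
    trie.add('gpl')
    trie.make_automaton()

    tokens = list(trie.iter('not gpl', include_unmatched=True))
    # the unmatched word is yielded with value=None, e.g.:
    #   Token(start=0, end=2, string='not', value=None)
    # and the matched word carries its stored value:
    #   Token(start=4, end=6, string='gpl', value='gpl')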