Refactor required phrase spans collection

pombredanne · pombredanne · commit 451fffb86fdb · 2024-10-08T17:39:24.000+02:00
* This is best moved into tokenize and is also used in models.
* The code is simpler as it does not collect the texts, only the spans
  which is what is used for indexing, matching and required phrases
  tagging.

Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
diff --git a/src/licensedcode/tokenize.py b/src/licensedcode/tokenize.py
@@ -10,10 +10,11 @@
 
 import re
 
-from collections import defaultdict
 from binascii import crc32
+from collections import defaultdict
 from itertools import islice
 
+from licensedcode.spans import Span
 from licensedcode.stopwords import STOPWORDS
 from textcode.analysis import numbered_text_lines
 
@@ -118,6 +119,101 @@ def required_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
             yield token
 
 
+def get_existing_required_phrase_spans(text):
+    """
+    Return a list of token position Spans, one for each {{tagged}} required
+    phrase found in ``text``.
+
+    This materializes the ``get_phrase_spans`` generator as a list. An
+    InvalidRuleRequiredPhrase exception is raised if the {{ }} markup is
+    nested, empty or unbalanced.
+
+    For example:
+
+    >>> text = 'This is enclosed in {{double curly braces}}'
+    >>> #       0    1  2        3    4      5     6
+    >>> x = get_existing_required_phrase_spans(text)
+    >>> assert x == [Span(4, 6)], x
+
+    >>> text = 'This is {{enclosed}} a  {{double curly braces}} or not'
+    >>> #       0    1    2          SW   3      4     5        6  7
+    >>> x = get_existing_required_phrase_spans(text)
+    >>> assert x == [Span(2), Span(3, 5)], x
+
+    >>> text = 'This {{is}} enclosed a  {{double curly braces}} or not'
+    >>> #       0    1      2        SW   3      4     5        6  7
+    >>> x = get_existing_required_phrase_spans(text)
+    >>> assert x == [Span([1]), Span([3, 4, 5])], x
+
+    >>> text = '{{AGPL-3.0  GNU Affero General Public License v3.0}}'
+    >>> #         0    1 2  3   4      5       6      7       8  9
+    >>> x = get_existing_required_phrase_spans(text)
+    >>> assert x == [Span(0, 9)], x
+
+    >>> assert get_existing_required_phrase_spans('{This}') == []
+
+    >>> def check_exception(text):
+    ...     try:
+    ...         return get_existing_required_phrase_spans(text)
+    ...     except InvalidRuleRequiredPhrase:
+    ...         pass
+
+    The helper above returns None when the exception is raised, so the
+    malformed inputs below are expected to print nothing, while well-formed
+    inputs print the returned list:
+
+    >>> check_exception('This {{is')
+    >>> check_exception('This }}is')
+    >>> check_exception('{{This }}is{{')
+    >>> check_exception('This }}is{{')
+    >>> check_exception('{{}}')
+    >>> check_exception('{{This is')
+    >>> check_exception('{{This is{{')
+    >>> check_exception('{{This is{{ }}')
+    >>> check_exception('{{{{This}}}}')
+    >>> check_exception('}}This {{is}}')
+    >>> check_exception('This }} {{is}}')
+    >>> check_exception('{{This}}')
+    [Span(0)]
+    >>> check_exception('{This}')
+    []
+    >>> check_exception('{{{This}}}')
+    [Span(0)]
+    """
+    return list(get_phrase_spans(text))
+
+
+class InvalidRuleRequiredPhrase(Exception):
+    """
+    Raised when the {{ }} required phrase markup of a text is invalid: nested,
+    empty or with unbalanced braces.
+    """
+    pass
+
+
+
+def get_phrase_spans(text):
+    """
+    Yield a token position Span for each {{tagged}} required phrase found in
+    ``text``.
+
+    Positions count only the regular tokens yielded by
+    ``required_phrase_tokenizer``: the opening and closing brace marker
+    tokens do not consume a position.
+
+    Raise an InvalidRuleRequiredPhrase exception, with a message and ``text``
+    as arguments, if a required phrase is nested in another one, is empty,
+    has closing braces without opening braces, or is still open at the end
+    of ``text``. As this is a generator, the exception is raised while
+    iterating and not when calling this function.
+    """
+    ipos = 0
+    in_required_phrase = False
+    current_phrase_positions = []
+    for token in required_phrase_tokenizer(text):
+        if token == REQUIRED_PHRASE_OPEN:
+            if in_required_phrase:
+                raise InvalidRuleRequiredPhrase('Invalid rule with nested required phrase {{ {{ braces', text)
+            in_required_phrase = True
+
+        elif token == REQUIRED_PHRASE_CLOSE:
+            if not in_required_phrase:
+                # closing braces seen with no phrase currently open
+                raise InvalidRuleRequiredPhrase('Invalid rule with dangling required phrase missing opening braces', text)
+            if not current_phrase_positions:
+                raise InvalidRuleRequiredPhrase('Invalid rule with empty required phrase {{}} braces', text)
+            yield Span(current_phrase_positions)
+            current_phrase_positions = []
+            in_required_phrase = False
+
+        else:
+            # a regular token: track its position only inside a phrase, but
+            # always advance the position counter
+            if in_required_phrase:
+                current_phrase_positions.append(ipos)
+            ipos += 1
+
+    # positions are only collected while a phrase is open and are reset on
+    # close, so checking the open flag alone covers all unclosed phrases
+    if in_required_phrase:
+        raise InvalidRuleRequiredPhrase('Invalid rule with dangling required phrase missing final closing braces', text)
+
+
+
+
 def index_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
     """
     Return an iterable of tokens from a rule or query ``text`` using index
diff --git a/tests/licensedcode/test_tokenize.py b/tests/licensedcode/test_tokenize.py
@@ -17,7 +17,10 @@
 
 from commoncode.testcase import FileBasedTesting
 
+from licensedcode.spans import Span
+from licensedcode.tokenize import get_existing_required_phrase_spans
 from licensedcode.tokenize import index_tokenizer
+from licensedcode.tokenize import InvalidRuleRequiredPhrase
 from licensedcode.tokenize import matched_query_text_tokenizer
 from licensedcode.tokenize import ngrams
 from licensedcode.tokenize import query_lines
@@ -406,6 +409,10 @@ def test_index_tokenizer_lines_on_html_like_texts_2(self, regen=REGEN_TEST_FIXTU
         result = [list(index_tokenizer(line)) for _ln, line in lines]
         check_results(result, expected_file, regen=regen)
 
+
+class TestRequirePhraseTokenizer(FileBasedTesting):
+    test_data_dir = TEST_DATA_DIR
+
     def test_required_phrase_tokenizer_on_html_like_texts(self, regen=REGEN_TEST_FIXTURES):
         test_file = self.get_test_loc('tokenize/htmlish.txt')
         expected_file = test_file + '.expected.required_phrase_tokenizer.json'
@@ -521,6 +528,62 @@ def test_required_phrase_tokenizer_ignores_invalid_required_phrase_markup(self):
             'i', 'am', 'afraid'
         ]
 
+    def test_get_existing_required_phrase_spans_returns_spans(self):
+        # Two tagged phrases are expected to give two Spans of token
+        # positions, listed in their order of appearance in the text.
+        text = (
+            'This released software is {{released}} by under {{the MIT license}}. '
+            'Which is a license originating at Massachusetts Institute of Technology (MIT).'
+        )
+
+        spans = get_existing_required_phrase_spans(text)
+        assert spans == [Span(4), Span(7, 9)]
+
+    def test_get_existing_required_phrase_spans_raises_exception_if_markup_is_not_closed(self):
+        # An opening {{ without a matching }} must raise. The generic
+        # Exception raised below is not caught by the except clause, so the
+        # test fails if the call returns normally.
+        text = 'This software is {{released by under the MIT license.'
+        try:
+            list(get_existing_required_phrase_spans(text))
+            raise Exception('Exception should be raised')
+        except InvalidRuleRequiredPhrase:
+            pass
+
+    def test_get_existing_required_phrase_spans_ignores_stopwords_in_positions(self):
+        # NOTE(review): the expected Span(11, 12) presumably relies on some
+        # words of this text (such as "comma") being stopwords that the
+        # tokenizer skips without consuming a position; confirm against the
+        # STOPWORDS set.
+        text = 'The word comma is a stop word so comma does not increase the span position {{MIT license}}.'
+        spans = get_existing_required_phrase_spans(text)
+        assert spans == [Span(11, 12)]
+
+    def test_get_existing_required_phrase_spans_yields_spans_without_stop_words(self):
+        # The tagged phrases contain the extra words "span" and "quot" yet the
+        # expected Spans are the same as without them.
+        # NOTE(review): presumably both are stopwords; confirm against the
+        # STOPWORDS set.
+        text = 'This released software is {{released span}} by under {{the MIT quot license}}.'
+        spans = get_existing_required_phrase_spans(text)
+        assert spans == [Span(4), Span(7, 9)]
+
+    def test_get_existing_required_phrase_spans_does_not_yield_empty_spans(self):
+        # A text with an empty {{}} phrase must raise rather than yield an
+        # empty Span. The generic Exception raised below is not caught by the
+        # except clause, so the test fails if the call returns normally.
+        # NOTE(review): "{{comma}}" presumably also counts as an empty phrase
+        # if "comma" is a stopword; confirm which phrase triggers the error.
+        text = 'This released software {{comma}} is {{}} by under {{the MIT license}}.'
+        try:
+            list(get_existing_required_phrase_spans(text))
+            raise Exception('Exception should be raised')
+        except InvalidRuleRequiredPhrase:
+            pass
+
+    def test_get_existing_required_phrase_spans_only_considers_outer_required_phrase_markup(self):
+        # Triple braces {{{ }}} are expected to behave as a single required
+        # phrase markup and give a single Span.
+        text = 'This released {{{software under the MIT}}} license.'
+        required_phrase_spans = get_existing_required_phrase_spans(text)
+        assert required_phrase_spans == [Span(2, 5)]
+
+    def test_get_existing_required_phrase_spans_ignores_nested_required_phrase_markup(self):
+        # Despite the "ignores" in the test name, a {{ nested inside an open
+        # required phrase is checked here to raise an exception. The generic
+        # Exception raised below is not caught by the except clause, so the
+        # test fails if the call returns normally.
+        text = 'This released {{software {{under the}} MIT}} license.'
+        try:
+            list(get_existing_required_phrase_spans(text))
+            raise Exception('Exception should be raised')
+        except InvalidRuleRequiredPhrase:
+            pass
+
+    def test_get_existing_required_phrase_spans_with_markup(self):
+        # The adjacent string literals are concatenated without separating
+        # spaces, so the HTML anchor markup directly touches the surrounding
+        # words and the {{ }} braces.
+        text = (
+            "Lua is free software distributed under the terms of the"
+            "<A HREF='http://www.opensource.org/licenses/mit-license.html'>{{MIT license}}</A>"
+            "reproduced below;"
+        )
+        assert get_existing_required_phrase_spans(text=text) == [Span(18, 19)]
+
 
 class TestNgrams(FileBasedTesting):
     test_data_dir = TEST_DATA_DIR