Commit 43773a0

Merge pull request #423 from nexB/260-license-text-capture
#260 license text capture
2 parents 9aba333 + 037cbe3

File tree: 14 files changed, +429 −34 lines changed


setup.py

Lines changed: 1 addition & 0 deletions
@@ -74,6 +74,7 @@ def read(*names, **kwargs):
         'bitarray >= 0.8.1, < 1.0.0',
         'intbitset >= 2.3.0, < 3.0',
         'pyahocorasick >= 1.1, < 1.2',
+        'attrs >=16.0, < 17.0',

         # caching
         'zc.lockfile >= 1.0.0, < 2.0.0',
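
The new attrs requirement supports the slotted Token class added to src/licensedcode/match.py below. A minimal sketch of the attr.s pattern relied on here (names illustrative, not part of the patch):

    import attr

    @attr.s(slots=True)
    class Token(object):
        # slots=True avoids a per-instance __dict__, which matters when
        # building one Token per token of a possibly large query text
        value = attr.ib()
        pos = attr.ib(default=-1)

    tok = Token(value=u'GPL')
    tok.pos = 7  # fields stay mutable; only the set of fields is fixed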

src/licensedcode/match.py

Lines changed: 118 additions & 0 deletions
@@ -37,6 +37,7 @@
 from licensedcode import query
 from licensedcode.spans import Span
 from licensedcode import MAX_DIST
+from licensedcode import tokenize

 """
 LicenseMatch data structure and matches merging and filtering routines.
@@ -1084,3 +1085,120 @@ def _debug_print_matched_query_text(match, query, extras=5):
     logger_debug(' MATCHED QUERY TEXT with extras')
     qt, _it = get_texts(new_match, location=query.location, query_string=None, idx=query.idx)
     print(qt)
+
+
+def get_full_matched_text(match, location=None, query_string=None, idx=None,
+                          whole_lines=False,
+                          highlight_matched=u'%s', highlight_not_matched=u'[%s]'):
+    """
+    Yield a stream of unicode strings corresponding to the full matched query
+    text given a query file at `location` or a `query_string`, a `match` and an
+    `idx` index. This contains the full text including punctuation and spaces
+    that do not participate in the match proper.
+
+    Each text token string is interpolated for optional highlighting with the
+    `highlight_matched` format string if matched or with the
+    `highlight_not_matched` format string if not matched.
+    Punctuation is not "highlighted".
+
+    Optionally if `whole_lines` is True, the unmatched parts at the start of
+    the first matched line and at the end of the last matched line are included
+    in the text.
+    """
+    assert idx
+    dictionary_get = idx.dictionary.get
+
+    import attr
+    @attr.s(slots=True)
+    class Token(object):
+        value = attr.ib()
+        line_num = attr.ib()
+        pos = attr.ib(default=-1)
+        is_text = attr.ib(default=False)
+        is_included = attr.ib(default=False)
+        is_matched = attr.ib(default=False)
+        is_known = attr.ib(default=False)
+
+    def _tokenize(location, query_string):
+        """Yield Tokens with pos and line number."""
+        _pos = -1
+        for _line_num, _line in enumerate(query.query_lines(location, query_string, strip=False), 1):
+            for _is_text, _token in tokenize.matched_query_text_tokenizer(_line):
+                _known = _is_text and dictionary_get(_token.lower()) is not None
+                _tok = Token(value=_token, line_num=_line_num, is_text=_is_text, is_known=_known)
+                if _known:
+                    _pos += 1
+                    _tok.pos = _pos
+                yield _tok
+
+    def _filter_unmatched_lines(tokens, _start_line, _end_line):
+        """Skip lines that are not matched."""
+        for token in tokens:
+            if token.line_num < _start_line:
+                continue
+            if token.line_num > _end_line:
+                break
+            yield token
+
+    def _tag_tokens_as_matched(tokens, qspan):
+        """Tag tokens that are matched with is_matched."""
+        for token in tokens:
+            if token.pos != -1 and token.is_known and token.pos in qspan:
+                token.is_matched = True
+            yield token
+
+    def _tag_tokens_as_included_in_whole_lines(tokens, _start_line, _end_line):
+        """Tag all tokens in lines as included."""
+        for token in tokens:
+            if _start_line <= token.line_num <= _end_line:
+                token.is_included = True
+            yield token
+
+    def _tag_tokens_as_included_in_matched_range(tokens, _start, _end):
+        """Tag tokens between start and end as included."""
+        started = False
+        finished = False
+        for token in tokens:
+            if not started and token.pos == _start:
+                started = True
+
+            if started and not finished:
+                token.is_included = True
+
+            yield token
+
+            if token.pos == _end:
+                finished = True
+
+    def _filter_non_included_tokens(tokens):
+        """Skip non-included tokens."""
+        for token in tokens:
+            if token.is_included:
+                yield token
+
+    # Create and process a stream of Tokens
+    tokenized = _tokenize(location, query_string)
+    in_line_range = _filter_unmatched_lines(tokenized, match.start_line, match.end_line)
+    matched = _tag_tokens_as_matched(in_line_range, match.qspan)
+    if whole_lines:
+        included = _tag_tokens_as_included_in_whole_lines(matched, match.start_line, match.end_line)
+    else:
+        included = _tag_tokens_as_included_in_matched_range(matched, match.qspan.start, match.qspan.end)
+    tokens = _filter_non_included_tokens(included)
+
+    # Finally yield strings with eventual highlighting
+    for token in tokens:
+        if token.is_text:
+            if token.is_matched:
+                yield highlight_matched % token.value
+            else:
+                yield highlight_not_matched % token.value
+        else:
+            # punctuation
+            yield token.value
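
A possible way to exercise the new helper, assuming an index built the usual way (the sample path is hypothetical):

    from licensedcode.index import get_index
    from licensedcode.match import get_full_matched_text

    idx = get_index()
    location = 'samples/NOTICE'  # hypothetical query file
    for match in idx.match(location=location):
        # with the defaults, matched tokens pass through unchanged and
        # unmatched tokens are bracketed (highlight_not_matched=u'[%s]')
        print(u''.join(get_full_matched_text(match, location=location, idx=idx)))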

src/licensedcode/tokenize.py

Lines changed: 43 additions & 8 deletions
@@ -40,28 +40,36 @@
 for queries and rules texts.
 """

-
-def query_lines(location=None, query_string=None):
+def query_lines(location=None, query_string=None, strip=True):
     """
     Return an iterable of text lines given a file at `location` or a
-    `query string`. Include empty lines.
+    `query string`. Include empty lines.
     """
     # TODO: OPTIMIZE: tokenizing line by line may be rather slow
     # we could instead get lines and tokens at once in a batch?
     lines = []
     if location:
         lines = text_lines(location, demarkup=False)
     elif query_string:
-        lines = query_string.splitlines(False)
+        if strip:
+            keepends = False
+        else:
+            keepends = True
+        lines = query_string.splitlines(keepends)

     for line in lines:
-        yield line.strip()
+        if strip:
+            yield line.strip()
+        else:
+            yield line


 # Split on whitespace and punctuations: keep only characters and +.
 # Keeping the + is important for licenses name such as GPL2+.

-query_pattern = '[a-zA-Z0-9]+ ?\+|[^!\"#\$%&\'\(\)\*,\-\./:;<=>\?@\[\]\^_`\{\|\}\\\~\s\+\x92\x93\x94”“’–]'
+_letter_or_digit = '[a-zA-Z0-9]+ ?\+'
+_not_punctuation = '[^!\"#\$%&\'\(\)\*,\-\./:;<=>\?@\[\]\^_`\{\|\}\\\~\s\+\x92\x93\x94”“’–]'
+query_pattern = _letter_or_digit + '|' + _not_punctuation
 word_splitter = re.compile('(?:%s)+' % query_pattern, re.UNICODE).findall

 def query_tokenizer(text, lower=True):
@@ -74,6 +82,33 @@ def query_tokenizer(text, lower=True):
     return (token for token in word_splitter(text) if token)


+# Alternate pattern used for matched text collection:
+# collect tokens and non-token texts in two different groups.
+_punctuation = '[!\"#\$%&\'\(\)\*,\-\./:;<=>\?@\[\]\^_`\{\|\}\\\~\s\+\x92\x93\x94”“’–]'
+_text_capture_pattern = '(?P<token>(?:' + query_pattern + ')+)' + '|' + '(?P<punct>' + _punctuation + '+)'
+tokens_and_non_tokens = re.compile(_text_capture_pattern, re.UNICODE).finditer
+
+def matched_query_text_tokenizer(text):
+    """
+    Return an iterable of tokens and non-tokens from a unicode query text,
+    keeping everything (including punctuation, line endings, etc.).
+    The returned iterable contains 2-tuples of:
+    - True if the string is a text token or False if it is not (such as
+      punctuation, spaces, etc.)
+    - the corresponding string.
+    This is used to reconstruct the matched query text accurately.
+    """
+    if not text:
+        return
+    for match in tokens_and_non_tokens(text):
+        if not match:
+            continue
+        mgd = match.groupdict()
+        token = mgd.get('token')
+        punct = mgd.get('punct')
+        if token or punct:
+            yield (True, token) if token else (False, punct)
+
+
 # Template-aware splitter, keeping a templated part {{anything}} as a token.
 # This splitter yields plain token strings or double braces-enclosed strings
 # {{something}} for templates. curly barces are otherwise treated as punctuation.
@@ -140,7 +175,7 @@ def ngrams(iterable, ngram_length):
     []

     This also works with arrays or tuples:
-
+
     >>> from array import array
     >>> list(ngrams(array(b'h', [1,2,3,4,5]), 2))
     [(1, 2), (2, 3), (3, 4), (4, 5)]
@@ -156,7 +191,7 @@ def select_ngrams(ngrams, with_pos=False):
     Return an iterable as a subset of a sequence of ngrams using the hailstorm
     algorithm. If `with_pos` is True also include the starting position for the ngram
     in the original sequence.
-
+
     Definition from the paper: http://www2009.eprints.org/7/1/p61.pdf
     The algorithm first fingerprints every token and then selects a shingle s if
     the minimum fingerprint value of all k tokens in s occurs at the first or the
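
To illustrate the new tokenizer, a quick sketch of the expected shape of its output (the exact tuples below are inferred from the pattern, not verified):

    from licensedcode.tokenize import matched_query_text_tokenizer

    for is_text, s in matched_query_text_tokenizer(u'GPL-2.0 or later!'):
        print((is_text, s))
    # text tokens are flagged True, punctuation runs False, e.g.:
    # (True, u'GPL'), (False, u'-'), (True, u'2'), (False, u'.'), (True, u'0'),
    # (False, u' '), (True, u'or'), (False, u' '), (True, u'later'), (False, u'!')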

src/scancode/api.py

Lines changed: 7 additions & 1 deletion
@@ -103,7 +103,7 @@ def get_urls(location):
 DEJACODE_LICENSE_URL = 'https://enterprise.dejacode.com/urn/urn:dje:license:{}'


-def get_licenses(location, min_score=0, diag=False):
+def get_licenses(location, min_score=0, include_text=False, diag=False):
     """
     Yield dictionaries of license data detected in the file at location.

@@ -115,12 +115,16 @@ def get_licenses(location, min_score=0, diag=False):
     key of the returned mapping.
     """
     from licensedcode.index import get_index
+    from licensedcode.match import get_full_matched_text
     from licensedcode.models import get_licenses as licenses_details

     idx = get_index()
     licenses = licenses_details()

     for match in idx.match(location=location, min_score=min_score):
+        if include_text:
+            matched_text = u''.join(get_full_matched_text(match, location=location,
+                                                          idx=idx, whole_lines=False))
         for license_key in match.rule.licenses:
             lic = licenses.get(license_key)
             result = OrderedDict()
@@ -146,6 +150,8 @@ def get_licenses(location, min_score=0, diag=False):
             matched_rule['matched_length'] = match.ilen()
             matched_rule['match_coverage'] = match.coverage()
             matched_rule['rule_relevance'] = match.rule.relevance
+            if include_text:
+                result['matched_text'] = matched_text
             yield result
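
With the new flag, API callers can request the reconstructed text per detection; a sketch (the scanned path is hypothetical):

    from scancode.api import get_licenses

    for detection in get_licenses('samples/zlib/zlib.h', include_text=True):
        # each detection mapping now carries a 'matched_text' entry
        print(detection.get('matched_text'))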

src/scancode/cli.py

Lines changed: 29 additions & 10 deletions
@@ -271,6 +271,8 @@ def validate_formats(ctx, param, value):
 @click.option('-i', '--info', is_flag=True, default=False, help='Include information such as size, type, etc.')
 @click.option('--license-score', is_flag=False, default=0, type=int, show_default=True,
               help='Do not return license matches with scores lower than this score. A number between 0 and 100.')
+@click.option('--license-text', is_flag=True, default=False,
+              help='Include the detected licenses matched text. Has no effect unless --license is requested.')

 @click.option('-f', '--format', is_flag=False, default='json', show_default=True, metavar='<style>',
               help=('Set <output_file> format <style> to one of the standard formats: %s '
@@ -289,14 +291,14 @@ def validate_formats(ctx, param, value):
 @click.option('--max-memory', is_flag=False, default=DEFAULT_MAX_MEMORY, type=int, show_default=True, help='Stop scanning a file if scanning requires more than a maximum amount of memory in megabytes.')

 def scancode(ctx, input, output_file, copyright, license, package,
-             email, url, info, license_score, format,
+             email, url, info, license_score, license_text, format,
              verbose, quiet, processes,
              diag, timeout, max_memory,
              *args, **kwargs):
     """scan the <input> file or directory for origin clues and license and save results to the <output_file>.

     The scan results are printed to stdout if <output_file> is not provided.
-    Error and progress is printed to stderr.
+    Error and progress is printed to stderr.
     """
     possible_scans = [copyright, license, package, email, url, info]
     # Default scan when no options is provided
@@ -307,9 +309,22 @@ def scancode(ctx, input, output_file, copyright, license, package,

     scans_cache_class = get_scans_cache_class()
     try:
-        files_count, results = scan(input, copyright, license, package, email, url, info, license_score,
-                                    verbose, quiet, processes, scans_cache_class,
-                                    diag, timeout, max_memory)
+        files_count, results = scan(input_path=input,
+                                    copyright=copyright,
+                                    license=license,
+                                    package=package,
+                                    email=email,
+                                    url=url,
+                                    info=info,
+                                    license_score=license_score,
+                                    license_text=license_text,
+                                    verbose=verbose,
+                                    quiet=quiet,
+                                    processes=processes,
+                                    timeout=timeout, max_memory=max_memory,
+                                    diag=diag,
+                                    scans_cache_class=scans_cache_class,
+                                    )
         if not quiet:
             echo_stderr('Saving results.', fg='green')
             save_results(files_count, results, format, input, output_file)
@@ -323,10 +338,14 @@ def scancode(ctx, input, output_file, copyright, license, package,
     # ctx.exit(rc)


-def scan(input_path, copyright=True, license=True, package=True,
-         email=False, url=False, info=True, license_score=0,
-         verbose=False, quiet=False, processes=1, scans_cache_class=None,
-         diag=False, timeout=DEFAULT_TIMEOUT, max_memory=DEFAULT_MAX_MEMORY):
+def scan(input_path,
+         copyright=True, license=True, package=True,
+         email=False, url=False, info=True,
+         license_score=0, license_text=False,
+         verbose=False, quiet=False,
+         processes=1, timeout=DEFAULT_TIMEOUT, max_memory=DEFAULT_MAX_MEMORY,
+         diag=False,
+         scans_cache_class=None):
     """
     Return a tuple of (file_count, indexing_time, scan_results) where
     scan_results is an iterable. Run each requested scan proper: each individual file
@@ -337,7 +356,7 @@ def scan(input_path, copyright=True, license=True, package=True,
     scan_summary = OrderedDict()
     scan_summary['scanned_path'] = input_path
     scan_summary['processes'] = processes
-    get_licenses_with_score = partial(get_licenses, min_score=license_score, diag=diag)
+    get_licenses_with_score = partial(get_licenses, min_score=license_score, include_text=license_text, diag=diag)

     # note: "flag and function" expressions return the function if flag is True
     # note: the order of the scans matters to show things in logical order
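
The option can be exercised end to end by driving the Click command in-process; a sketch with illustrative paths:

    from click.testing import CliRunner
    from scancode.cli import scancode

    runner = CliRunner()
    # --license-text has no effect unless --license is also requested
    result = runner.invoke(scancode,
                           ['--license', '--license-text', 'samples/', 'results.json'])
    print(result.exit_code)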

src/textcode/analysis.py

Lines changed: 4 additions & 3 deletions
@@ -160,9 +160,10 @@ def as_unicode(line):
     return s


-def remove_verbatim_line_endings(s):
+def remove_verbatim_cr_lf_tab_chars(s):
     """
-    Return a string removing verbatim, escaped line endings (such as \n).
+    Return a string replacing with a space any verbatim but escaped line
+    endings and tabs (such as a literal \n or \r \t).
     """
     if not s:
         return s
@@ -179,7 +180,7 @@ def unicode_text_lines(location):
     if T.contains_text:
         with open(location, 'rbU') as f:
             for line in f:
-                yield remove_verbatim_line_endings(as_unicode(line))
+                yield remove_verbatim_cr_lf_tab_chars(as_unicode(line))


 def unicode_text(location):
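
The hunk shows only the rename and docstring; a rough sketch of the behavior the new name and docstring describe, stated as an assumption since the function body is not part of this diff:

    import re

    # assumed: literal escaped sequences \n, \r and \t appearing verbatim
    # in the text are each replaced by a single space
    _verbatim_chars = re.compile(r'\\[nrt]')

    def remove_verbatim_cr_lf_tab_chars(s):
        if not s:
            return s
        return _verbatim_chars.sub(' ', s)

    print(remove_verbatim_cr_lf_tab_chars('line one\\nline two'))
    # -> 'line one line two'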

0 commit comments