3737from licensedcode .spans import Span
3838from licensedcode .stopwords import STOPWORDS
3939from licensedcode .tokenize import matched_query_text_tokenizer
40+ from licensedcode .tokenize import query_tokenizer
4041
4142
4243"""
@@ -532,7 +533,7 @@ def itokens_hash(self, idx):
532533
533534 # FIXME: this should be done for all the matches found in a given scanned
534535 # location at once to avoid reprocessing many times the original text
535- def matched_text (self , whole_lines = False ,
536+ def matched_text (self , whole_lines = False , highlight = True ,
536537 highlight_matched = u'%s' , highlight_not_matched = u'[%s]' ,
537538 _usecache = True ):
538539 """
@@ -558,6 +559,7 @@ def matched_text(self, whole_lines=False,
558559 query_string = query .query_string ,
559560 idx = query .idx ,
560561 whole_lines = whole_lines ,
562+ highlight = highlight ,
561563 highlight_matched = highlight_matched ,
562564 highlight_not_matched = highlight_not_matched , _usecache = _usecache )
563565 ).rstrip ()
@@ -1384,15 +1386,16 @@ def _log(_matches, _discarded, msg):
13841386@attr .s (slots = True , frozen = True )
13851387class Token (object ):
13861388 """
1387- Used to represent a token in collected matched texts and SPDX identifiers.
1389+ Used to represent a token in collected query-side matched texts and SPDX
1390+ identifiers.
13881391 """
13891392 # original text value for this token.
13901393 value = attr .ib ()
13911394 # line number, one-based
13921395 line_num = attr .ib ()
13931396 # absolute position for known tokens, zero-based. -1 for unknown tokens
13941397 pos = attr .ib (default = - 1 )
1395- # False if this is punctuation
1398+ # False if this is punctuation or spaces
13961399 is_text = attr .ib (default = False )
13971400 # True if part of a match
13981401 is_matched = attr .ib (default = False )
@@ -1405,6 +1408,8 @@ def tokenize_matched_text(location, query_string, dictionary, _cache={}):
14051408 Return a list of Token objects with pos and line number collected from the
14061409 file at `location` or the `query_string` string. `dictionary` is the index
14071410 mapping of tokens to token ids.
1411+
1412+ NOTE: the _cache={} arg IS A GLOBAL by design.
14081413 """
14091414 key = location , query_string
14101415 cached = _cache .get (key )
@@ -1425,26 +1430,52 @@ def _tokenize_matched_text(location, query_string, dictionary):
14251430 """
14261431 pos = - 1
14271432 for line_num , line in query .query_lines (location , query_string , strip = False ):
1433+ if TRACE_MATCHED_TEXT_DETAILS :
1434+ logger_debug (' _tokenize_matched_text:' ,
1435+ 'line_num:' , line_num ,
1436+ 'line:' , line )
1437+
14281438 for is_text , token_str in matched_query_text_tokenizer (line ):
1429- known = token_str .lower () in dictionary
1439+ if TRACE_MATCHED_TEXT_DETAILS :
1440+ logger_debug (' is_text:' , is_text , 'token_str:' , repr (token_str ))
1441+ known = False
1442+ if token_str and token_str .strip ():
1443+ # we retokenize using the query tokenizer
1444+ tokenized = list (query_tokenizer (token_str ))
1445+ if tokenized :
1446+ assert len (tokenized ) == 1 , repr ((is_text , token_str , tokenized ))
1447+ tokenized = tokenized [0 ]
1448+ known = tokenized in dictionary
1449+
14301450 if known :
14311451 pos += 1
14321452 p = pos
14331453 else :
14341454 p = - 1
1435- yield Token (
1455+
1456+ tok = Token (
14361457 value = token_str ,
14371458 line_num = line_num ,
14381459 is_text = is_text ,
14391460 is_known = known ,
14401461 pos = p )
14411462
1463+ if TRACE_MATCHED_TEXT_DETAILS :
1464+ logger_debug (' token:' , tok )
1465+ yield tok
1466+
14421467
14431468def reportable_tokens (tokens , match_qspan , start_line , end_line , whole_lines = False ):
14441469 """
1445- Yield Tokens from an iterable of `tokens` that are inside a `match_qspan`
1446- matched Span starting at `start_line` and ending at `end_line`. Known
1447- matched tokens are tagged as is_matched=True.
1470+ Yield Tokens from a `tokens` iterable of Token objects (built from a query-
1471+ side scanned file or string) that are inside a `match_qspan` matched Span
1472+ starting at `start_line` and ending at `end_line`. If whole_lines is True,
1473+ also yield unmatched Tokens that are before and after the match and on the
1474+ first and last line of a match (unless the lines are very long text lines or
1475+ the match is from binary content.)
1476+
1477+ As a side effect, known tokens that are matched are tagged as
1478+ is_matched=True.
14481479
14491480 If `whole_lines` is True, any token within matched lines range is included.
14501481 Otherwise, a token is included if its position is within the matched
@@ -1475,7 +1506,12 @@ def reportable_tokens(tokens, match_qspan, start_line, end_line, whole_lines=Fal
14751506 tok = attr .evolve (tok , is_matched = True )
14761507 is_included = True
14771508 if TRACE_MATCHED_TEXT_DETAILS :
1478- logger_debug (' tok.is_matched = True' )
1509+ logger_debug (' tok.is_matched = True' , 'match_qspan:' , match_qspan )
1510+ else :
1511+ if TRACE_MATCHED_TEXT_DETAILS :
1512+ logger_debug (' unmatched token: tok.is_matched = False' ,
1513+ 'match_qspan:' , match_qspan ,
1514+ 'tok.pos in match_qspan:' , tok .pos in match_qspan )
14791515
14801516 if whole_lines :
14811517 # we only work on matched lines so no need to test further
@@ -1527,7 +1563,7 @@ def reportable_tokens(tokens, match_qspan, start_line, end_line, whole_lines=Fal
15271563def get_full_matched_text (
15281564 match , location = None , query_string = None , idx = None ,
15291565 whole_lines = False ,
1530- highlight_matched = u'%s' , highlight_not_matched = u'[%s]' ,
1566+ highlight = True , highlight_matched = u'%s' , highlight_not_matched = u'[%s]' ,
15311567 stopwords = STOPWORDS , _usecache = True ):
15321568 """
15331569 Yield unicode strings corresponding to the full matched query text
@@ -1541,11 +1577,11 @@ def get_full_matched_text(
15411577 matched line and the end of the last matched lines are also included in the
15421578 returned text.
15431579
1544- Each token is interpolated for "highlighting" and emphasis with the
1545- `highlight_matched` format string for matched tokens or to the
1546- `highlight_not_matched` for tokens not matched. The default is to enclose an
1547- unmatched token sequence in [] square brackets. Punctuation is not
1548- highlighted.
1580+ If `highlight` is True, each token is formatted for "highlighting" and
1581+ emphasis with the `highlight_matched` format string for matched tokens or
1582+ with the `highlight_not_matched` format string for unmatched tokens. The
1583+ default is to enclose an unmatched token sequence in [] square brackets.
1584+ Punctuation and stopwords are not highlighted.
15491585 """
15501586 if TRACE_MATCHED_TEXT :
15511587 logger_debug ('get_full_matched_text: match:' , match )
@@ -1564,7 +1600,7 @@ def get_full_matched_text(
15641600 tokens = list (tokens )
15651601 logger_debug ('get_full_matched_text: tokens:' )
15661602 for t in tokens :
1567- print (t )
1603+ print (' ' , t )
15681604
15691605 tokens = reportable_tokens (
15701606 tokens , match .qspan , match .start_line , match .end_line , whole_lines = whole_lines )
@@ -1576,16 +1612,21 @@ def get_full_matched_text(
15761612 print (t )
15771613
15781614 if TRACE_MATCHED_TEXT :
1579- logger_debug ('get_full_matched_text: highlight_matched:' , highlight_matched , 'highlight_not_matched:' , highlight_not_matched )
1615+ logger_debug (
1616+ 'get_full_matched_text: highlight_matched:' , highlight_matched ,
1617+ 'highlight_not_matched:' , highlight_not_matched )
15801618
15811619 # Finally yield strings, with highlighting applied if requested
15821620 for token in tokens :
15831621 val = token .value
1584- if token .is_text and val .lower () not in stopwords :
1585- if token .is_matched :
1586- yield highlight_matched % val
1587- else :
1588- yield highlight_not_matched % val
1589- else :
1590- # we do not highlight punctuation..
1622+ if not highlight :
15911623 yield val
1624+ else :
1625+ if token .is_text and val .lower () not in stopwords :
1626+ if token .is_matched :
1627+ yield highlight_matched % val
1628+ else :
1629+ yield highlight_not_matched % val
1630+ else :
1631+ # we do not highlight punctuation..
1632+ yield val
0 commit comments