2525if os .environ .get ('SCANCODE_DEBUG_COPYRIGHT_DEEP' ):
2626 TRACE_DEEP = 1
2727
28+ TRACE_TOK = False or os .environ .get ('SCANCODE_DEBUG_COPYRIGHT_TOKEN' , False )
29+
2830
2931# Tracing flags
3032def logger_debug (* args ):
3133 pass
3234
3335
34- if TRACE or TRACE_DEEP :
36+ if TRACE or TRACE_DEEP or TRACE_TOK :
3537 import logging
3638
3739 logger = logging .getLogger (__name__ )
@@ -98,7 +100,18 @@ def detect_copyrights_from_lines(numbered_lines, copyrights=True, holders=True,
98100 """
99101 detector = CopyrightDetector ()
100102
101- for candidates in candidate_lines (numbered_lines ):
103+ candidate_lines_groups = candidate_lines (numbered_lines )
104+ if TRACE :
105+ candidate_lines_groups = list (candidate_lines_groups )
106+ logger_debug (
107+ f'detect_copyrights_from_lines: ALL groups of candidate '
108+ f'lines collected: { len (candidate_lines_groups )} ' )
109+
110+ for candidates in candidate_lines_groups :
111+ if TRACE :
112+ from pprint import pformat
113+ can = pformat (candidates , width = 160 )
114+ logger_debug (f' detect_copyrights_from_lines: processing candidates group:\n { can } ' )
102115
103116 detections = detector .detect (
104117 numbered_lines = candidates ,
@@ -109,13 +122,17 @@ def detect_copyrights_from_lines(numbered_lines, copyrights=True, holders=True,
109122 include_allrights = include_allrights
110123 )
111124
125+ if TRACE :
126+ detections = list (detections )
127+ logger_debug (f' detect_copyrights_from_lines: { detections } ' )
128+
112129 for detection in detections :
113130 # tuple of type, string, start, end
114131 yield detection
132+
115133 if time () > deadline :
116134 break
117135
118-
119136################################################################################
120137# DETECTION PROPER
121138################################################################################
@@ -149,8 +166,15 @@ def detect(self, numbered_lines,
149166 numbered_lines = list (numbered_lines )
150167 start_line = numbered_lines [0 ][0 ]
151168 end_line = numbered_lines [- 1 ][0 ]
169+
170+ if TRACE : logger_debug (f'CopyrightDetector:numbered_lines: { numbered_lines } ' )
171+
152172 tokens = self .get_tokens (numbered_lines )
153173
174+ if TRACE :
175+ tokens = list (tokens )
176+ logger_debug (f'CopyrightDetector:tokens: { tokens } ' )
177+
154178 if not tokens :
155179 return
156180
@@ -242,9 +266,9 @@ def get_tokens(self, numbered_lines):
242266 tokens_append = tokens .append
243267
244268 for _line_number , line in numbered_lines :
245- if TRACE : logger_debug (' get_tokens: bare line: ' + repr (line ))
269+ if TRACE_TOK : logger_debug (' get_tokens: bare line: ' + repr (line ))
246270 line = prepare_text_line (line )
247- if TRACE : logger_debug (' get_tokens:preped line: ' + repr (line ))
271+ if TRACE_TOK : logger_debug (' get_tokens:preped line: ' + repr (line ))
248272 for tok in splitter (line ):
249273 # strip trailing single quotes and ignore empties
250274 tok = tok .strip ("' " )
@@ -254,7 +278,9 @@ def get_tokens(self, numbered_lines):
254278 tok = tok .lstrip ('@' ).strip ()
255279 if tok and tok not in (':' ,):
256280 tokens_append (tok )
257- if TRACE : logger_debug (' get_tokens:tokens: ' + repr (tokens ))
281+ if TRACE_TOK :
282+ logger_debug (' get_tokens:tokens: ' + repr (tokens ))
283+ logger_debug (' get_tokens: ALL tokens collected' )
258284 return tokens
259285
260286 @classmethod
@@ -442,7 +468,7 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
442468
443469 # CamELCaseeXXX is typically JUNK such as code variable names
444470 # AzaAzaaaAz BBSDSB002923,
445- (r'^([A-Z][a-z]+){3,}[A-Z]+[0-9]*,?$' , 'JUNK' ),
471+ (r'^([A-Z][a-z]+){3,20 }[A-Z]+[0-9]*,?$' , 'JUNK' ),
446472
447473 # multiple parens (at least two (x) groups) is a sign of junk
448474 # such as in (1)(ii)(OCT
@@ -501,7 +527,7 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
501527 (r'^Last$' , 'JUNK' ),
502528 (r'^[Rr]eleased?$' , 'JUNK' ),
503529 (r'^[Cc]opyrighting$' , 'JUNK' ),
504- (r'^Authori .*$' , 'JUNK' ),
530+ (r'^[Aa]uthori .*$' , 'JUNK' ),
505531 (r'^such$' , 'JUNK' ),
506532 (r'^[Aa]ssignments?[.,]?$' , 'JUNK' ),
507533 (r'^[Bb]uild$' , 'JUNK' ),
@@ -805,7 +831,7 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
805831 (r'^Assignment' , 'NN' ),
806832 (r'^Atomic$' , 'NN' ),
807833 (r'^Attribution$' , 'NN' ),
808- (r'^Authored $' , 'NN' ),
834+ (r'^[Aa]uthored $' , 'NN' ),
809835 (r'^Baslerstr\.?$' , 'NN' ),
810836 (r'^BSD$' , 'NN' ),
811837 (r'^BUT$' , 'NN' ),
@@ -1446,8 +1472,8 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
14461472 ############################################################################
14471473
14481474 # email start-at-end: <sebastian.classen at freenet.ag>: <EMAIL_START> <AT> <EMAIL_END>
1449- (r'^<([a-zA-Z]+[a-zA-Z\.]){3, }$' , 'EMAIL_START' ),
1450- (r'^[a-zA-Z\.]{2,}>$' , 'EMAIL_END' ),
1475+ (r'^<([a-zA-Z]+[a-zA-Z\.]){2,5 }$' , 'EMAIL_START' ),
1476+ (r'^[a-zA-Z\.]{2,5 }>$' , 'EMAIL_END' ),
14511477
14521478 # a .sh shell script is NOT an email.
14531479 (r'^.*\.sh\.?$' , 'JUNK' ),
@@ -3037,9 +3063,9 @@ def is_end_of_statement(chars_only_line):
30373063
30383064def candidate_lines (numbered_lines ):
30393065 """
3040- Yield lists of candidate lines where each list element is a tuple of
3041- (line number, line text) given an iterable of numbered_lines as tuples of
3042- (line number, line text) .
3066+ Yield groups of candidate lines as list where each list element is a tuple
3067+ of (line number, line text) given an iterable of numbered_lines as tuples
3068+ of (line number, line text) .
30433069
30443070 A candidate line is a line of text that may contain copyright statements.
30453071 A few lines before and after a candidate line are also included.
@@ -3082,7 +3108,6 @@ def candidate_lines(numbered_lines):
30823108 previous_chars = chars_only
30833109 if TRACE : logger_debug (' candidate_lines: line is candidate' )
30843110
3085-
30863111 elif 's>' in line :
30873112 # this is for debian-style <s></s> copyright name tags
30883113 # the state is now "in copyright"
0 commit comments