@@ -298,14 +298,13 @@ def detect(self,
298298 non_holder_labels_mini = frozenset ([
299299 'COPY' ,
300300 'YR-RANGE' , 'YR-AND' , 'YR' , 'YR-PLUS' , 'BARE-YR' ,
301- 'HOLDER' , 'AUTHOR' ,
302301 'IS' , 'HELD' ,
303302 ])
304303
305304 non_authors_labels = frozenset ([
306305 'COPY' ,
307306 'YR-RANGE' , 'YR-AND' , 'YR' , 'YR-PLUS' , 'BARE-YR' ,
308- 'HOLDER ' , 'AUTHOR ' ,
307+ 'AUTH ' , 'AUTH2' , 'HOLDER ' ,
309308 'IS' , 'HELD' ,
310309 ])
311310
@@ -322,10 +321,9 @@ def detect(self,
322321 copyrght = build_detection_from_node (
323322 node = tree_node ,
324323 cls = CopyrightDetection ,
325- ignores = non_copyright_labels ,
324+ ignored_labels = non_copyright_labels ,
326325 include_copyright_allrights = include_copyright_allrights ,
327326 refiner = refine_copyright ,
328- junk = COPYRIGHTS_JUNK ,
329327 )
330328
331329 if TRACE or TRACE_DEEP :
@@ -340,7 +338,7 @@ def detect(self,
340338 holder = build_detection_from_node (
341339 node = tree_node ,
342340 cls = HolderDetection ,
343- ignores = non_holder_labels ,
341+ ignored_labels = non_holder_labels ,
344342 refiner = refine_holder ,
345343 )
346344
@@ -351,7 +349,7 @@ def detect(self,
351349 holder = build_detection_from_node (
352350 node = tree_node ,
353351 cls = HolderDetection ,
354- ignores = non_holder_labels_mini ,
352+ ignored_labels = non_holder_labels_mini ,
355353 refiner = refine_holder ,
356354 )
357355
@@ -365,9 +363,8 @@ def detect(self,
365363 author = build_detection_from_node (
366364 node = tree_node ,
367365 cls = AuthorDetection ,
368- ignores = non_authors_labels ,
366+ ignored_labels = non_authors_labels ,
369367 refiner = refine_author ,
370- junk = AUTHORS_JUNK ,
371368 )
372369
373370 if author :
@@ -385,15 +382,21 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split):
385382 We perform a simple tokenization on spaces, tabs and some punctuation: =;
386383 """
387384 for start_line , line in numbered_lines :
385+ pos = 0
386+
388387 if TRACE_TOK :
389388 logger_debug (' get_tokens: bare line: ' + repr (line ))
390389
390+ # if not line.strip():
391+ # yield Token(value="\n", label="EMPTY_LINE", start_line=start_line, pos=pos)
392+ # pos += 1
393+ # continue
394+
391395 line = prepare_text_line (line )
392396
393397 if TRACE_TOK :
394398 logger_debug (' get_tokens: preped line: ' + repr (line ))
395399
396- pos = 0
397400 for tok in splitter (line ):
398401 # strip trailing quotes+comma
399402 if tok .endswith ("'," ):
@@ -406,7 +409,7 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split):
406409 .strip ()
407410 )
408411
409- # the tokenizer allows a sinble colon or dot to be atoken and we discard these
412+ # the tokenizer allows a single colon or dot to be a token and we discard these
410413 if tok and tok not in ':.' :
411414 yield Token (value = tok , start_line = start_line , pos = pos )
412415 pos += 1
@@ -475,42 +478,45 @@ class AuthorDetection(Detection):
475478 end_line = attr .ib ()
476479
477480
481+ def filter_tokens (node , ignored_labels = frozenset ()):
482+ """
483+ Yield the leaf tokens of the parse Tree ``node``, skipping any node (and its whole subtree) whose label is in the ``ignored_labels`` set.
484+ Tokens are yielded depth-first, in left-to-right leaf order: each subtree is fully traversed before its next sibling.
485+ """
486+ for token in node :
487+ if token .label in ignored_labels :
488+ continue
489+ if isinstance (token , Tree ):
490+ yield from filter_tokens (token , ignored_labels = ignored_labels )
491+ else :
492+ yield token
493+
494+
478495def build_detection_from_node (
479496 node ,
480497 cls ,
481- ignores = frozenset (),
498+ ignored_labels = frozenset (),
482499 include_copyright_allrights = False ,
483500 refiner = None ,
484- junk = frozenset (),
485- junk_patterns = frozenset (),
486501):
487502 """
488503 Return a ``cls`` Detection object from a pygmars.tree.Tree ``node`` with a
489504 space-normalized string value or None.
490505
491- Filter ``node`` Tokens with a type found in the ``ignores `` set of ignorable
506+ Filter out ``node`` Tokens whose label is found in the ``ignored_labels`` set of ignorable
492507 token types.
493508
494509 For copyright detection, include trailing "All rights reserved" if
495510 ``include_copyright_allrights`` is True.
496511
497512 Apply the ``refiner`` callable function to the detection string.
498-
499- Return None if the value exists in the ``junk`` strings set or is matched by
500- any of the regex in the ``junk_patterns`` set.
501513 """
502514 include_copyright_allrights = (
503515 cls == CopyrightDetection
504516 and include_copyright_allrights
505517 )
506518
507- if ignores :
508- leaves = [
509- token for token in node .leaves ()
510- if token .label not in ignores
511- ]
512- else :
513- leaves = node .leaves ()
519+ leaves = list (filter_tokens (node , ignored_labels = ignored_labels ))
514520
515521 if include_copyright_allrights :
516522 filtered = leaves
@@ -545,7 +551,7 @@ def build_detection_from_node(
545551 if refiner :
546552 node_string = refiner (node_string )
547553
548- if node_string and not is_junk_copyryright (node_string ):
554+ if node_string and not is_junk_copyright (node_string ):
549555 start_line = filtered [0 ].start_line
550556 end_line = filtered [- 1 ].start_line
551557
@@ -1370,6 +1376,8 @@ def build_detection_from_node(
13701376 (r'^Bugfixes?$' , 'NN' ),
13711377 (r'^Likes?$' , 'NN' ),
13721378 (r'^STA$' , 'NN' ),
1379+ (r'^Page$' , 'NN' ),
1380+ (r'^Todo/Under$' , 'JUNK' ),
13731381
13741382 (r'^Interrupt$' , 'NN' ),
13751383 (r'^cleanups?$' , 'JUNK' ),
@@ -2071,7 +2079,7 @@ def build_detection_from_node(
20712079 (r'^\$?date-of-document$' , 'YR' ),
20722080
20732081 # cardinal numbers
2074- (r'^-?[0-9]+(.[0-9]+)?\. ?$' , 'CD' ),
2082+ (r'^-?[0-9]+(.[0-9]+)?[\.,] ?$' , 'CD' ),
20752083
20762084 ############################################################################
20772085 # All caps and proper nouns
@@ -2239,6 +2247,8 @@ def build_detection_from_node(
22392247 YR-RANGE: {<YR-AND>+} #70
22402248 YR-RANGE: {<YR-RANGE>+ <DASH|TO> <YR-RANGE>+} #71
22412249 YR-RANGE: {<YR-RANGE>+ <DASH>?} #72
2250+ # Copyright (c) 1999, 2000, 01, 03, 06 Ralf Baechle
2251+ YR-RANGE: {<YR-RANGE> <CD>+} #72.2
22422252
22432253 CD: {<BARE-YR>} #bareyear
22442254
@@ -3178,7 +3188,7 @@ def build_detection_from_node(
31783188 # the Initial Developer. All Rights Reserved.
31793189 COPYRIGHT: {<PORTIONS> <AUTH2> <INITIALDEV> <IS> <COPY|COPYRIGHT2>+ <YR-RANGE>? <INITIALDEV>} #2609.1
31803190
3181- # Portions created by the Initial Developer are Copyright (C)
3191+ # Portions created by the Initial Developer are Copyright (C)
31823192 # the Initial Developer. All Rights Reserved.
31833193 # and
31843194 # Portions created by the Initial Developer are Copyright (C) 2002
@@ -3576,7 +3586,7 @@ def refine_names(s, prefixes):
35763586COPYRIGHTS_JUNK_PATTERN_MATCHERS = [re .compile (p , re .IGNORECASE ).match for p in COPYRIGHTS_JUNK ]
35773587
35783588
3579- def is_junk_copyryright (s , patterns = COPYRIGHTS_JUNK_PATTERN_MATCHERS ):
3589+ def is_junk_copyright (s , patterns = COPYRIGHTS_JUNK_PATTERN_MATCHERS ):
35803590 """
35813591 Return True if the string ``s`` matches any junk patterns.
35823592 """
0 commit comments