@@ -12,20 +12,22 @@
 import re
 import string
 import sys
+
+from collections import deque
 from time import time

 import attr
+
+from commoncode.text import toascii
+from commoncode.text import unixlinesep
 from pygmars import lex
 from pygmars import parse
 from pygmars import Token
 from pygmars.tree import Tree

-from commoncode.text import toascii
-from commoncode.text import unixlinesep

 from cluecode import copyrights_hint
-from textcode.markup import strip_debian_markup
-from textcode.markup import strip_markup_text
+from textcode.markup import strip_known_markup_from_text

 # Tracing flags
 TRACE = False or os.environ.get('SCANCODE_DEBUG_COPYRIGHT', False)
@@ -167,10 +169,10 @@ def detect_copyrights_from_lines(
     else:
         detector = DETECTOR

-    candidate_lines_groups = collect_candidate_lines(numbered_lines)
+    candidate_lines_groups = list(collect_candidate_lines(numbered_lines))

     if TRACE or TRACE_TOK:
-        candidate_lines_groups = list(candidate_lines_groups)
+        candidate_lines_groups = candidate_lines_groups
         logger_debug(
             f'detect_copyrights_from_lines: ALL groups of candidate '
             f'lines collected: {len(candidate_lines_groups)}',
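Context for the hunk above: collect_candidate_lines() is a generator, so the commit materializes the groups once up front. A minimal sketch (illustrative only, simplified names, not part of the commit) of why that helps the TRACE branch:

    # Illustrative sketch: len() fails on a raw generator, so the groups are
    # materialized once and then reused for counting and for iteration.
    def count_and_process(numbered_lines, collect_candidate_lines):
        groups = list(collect_candidate_lines(numbered_lines))
        print(f'{len(groups)} groups of candidate lines collected')
        for group in groups:
            ...  # detection would run per group here
        return groups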
@@ -386,6 +388,7 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split):
         if TRACE_TOK:
             logger_debug(' get_tokens: bare line: ' + repr(line))

+        # keep or skip empty lines
         if not line.strip():
             stripped = last_line.lower().strip(string.punctuation)
             if (
@@ -398,11 +401,10 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split):
                 pos += 1
             last_line = ""
             continue
+
         if TRACE_TOK:
             logger_debug(' get_tokens: before preped line: ' + repr(line))

-        # line = prepare_text_line(line)
-
         last_line = line

         if TRACE_TOK:
@@ -801,6 +803,9 @@ def build_detection_from_node(
     (r'^Earth$', 'NN'),
     (r'^Maps/Google$', 'NN'),

+    # verbatime star
+    (r'^\*$', 'JUNK'),
+
     (r'^([A-Z][a-z]+){3,}$', 'JUNK'),

     ############################################################################
@@ -919,6 +924,8 @@ def build_detection_from_node(
     (r'^WARRANTS?$', 'JUNK'),
     (r'^WARRANTYS?$', 'JUNK'),

+    (r'^Row\(', 'JUNK'),
+
     (r'^hispagestyle$', 'JUNK'),
     (r'^Generic$', 'JUNK'),
     (r'^generate-', 'JUNK'),
@@ -1890,6 +1897,8 @@ def build_detection_from_node(
     (r'^(SPRL|srl)[\.,]?$', 'COMP'),
     # Poland
     (r'^(sp\.|o\.o\.)$', 'COMP'),
+    # Eingetragener Kaufmann
+    (r'^(e\.K\.|e\.Kfm\.|e\.Kfr\.)$', 'COMP'),

     # company suffix : AS: this is frequent beyond Norway.
     (r'^AS', 'CAPS'),
@@ -2952,6 +2961,10 @@ def build_detection_from_node(
     # Copyright (C) 1999-2000 VA Linux Systems
     COPYRIGHT: {<COPY> <COPY> <YR-RANGE> <CAPS> <NN|LINUX> <NNP>} #2280-1

+    # Russ Dill <[email protected]> 2001-2003
+    # Rewrited by Vladimir Oleynik <[email protected]> (C) 2003
+    COPYRIGHT: {<NAME-EMAIL> <YR-RANGE> <AUTH2> <BY> <NAME-EMAIL> <COPY> <YR-RANGE>} #22793.5
+
     COPYRIGHT2: {<COPY>+ <NN|CAPS>? <YR-RANGE>+ <PN>*} #2280

     # using #2280 above: Copyright 2018 Developers of the Rand project
@@ -3241,10 +3254,6 @@ def build_detection_from_node(
     # 1995-2003 by Internet Software Consortium
     COPYRIGHT: {<COPYRIGHT> <NN> <YR-RANGE> <BY> <COMPANY> } #1615

-    # Russ Dill <[email protected]> 2001-2003
-    # Rewrited by Vladimir Oleynik <[email protected]> (C) 2003
-    COPYRIGHT: {<NAME-EMAIL> <YR-RANGE> <AUTH2> <BY> <NAME-EMAIL> <COPY> <YR-RANGE>} #22793.5
-
     # portions copyright The Internet Society, Tom Tromey and Red Hat, Inc.
     COPYRIGHT: {<PORTIONS> <COPY> <NN> <NAME>} #157998

@@ -3647,6 +3656,7 @@ def refine_names(s, prefixes):
     r'Copyright \(c\) 2021 Dot',
     r'^\(c\) \(c\) B$',
     r'^\(c\) group$',
+    r'^\(c\) \(c\) A$',
 ]

 # a collection of junk junk matcher callables
@@ -4175,7 +4185,7 @@ def is_end_of_statement(chars_only_line):
 )


-remove_non_chars = re.compile(r'[^a-z0-9]').sub
+remove_non_chars = re.compile(r'[^a-z0-9]', re.IGNORECASE).sub

 has_trailing_year = re.compile(r'(?:19\d\d|20[0-4]\d)+$').findall

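A small illustration (not from the commit) of what adding re.IGNORECASE changes here: remove_non_chars() now keeps uppercase letters instead of stripping them together with punctuation and whitespace.

    import re

    remove_non_chars = re.compile(r'[^a-z0-9]', re.IGNORECASE).sub

    # Uppercase letters are now preserved:
    assert remove_non_chars('', 'Copyright (C) 2002-2024!') == 'CopyrightC20022024'
    # Without the flag, uppercase letters were dropped as "non-chars":
    assert re.compile(r'[^a-z0-9]').sub('', 'Copyright (C) 2002-2024!') == 'opyright20022024'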
@@ -4189,8 +4199,9 @@ def collect_candidate_lines(numbered_lines):
     A candidate line is a line of text that may contain copyright statements.
     A few lines before and after a candidate line are also included.
     """
-    candidates = []
+    candidates = deque()
     candidates_append = candidates.append
+    candidates_clear = candidates.clear

     # used as a state and line counter
     in_copyright = 0
@@ -4216,10 +4227,10 @@ def collect_candidate_lines(numbered_lines):
             candidates_append((ln, prepared,))

             if TRACE:
-                logger_debug(f' collect_candidate_lines: is EOS: yielding candidates\n{candidates!r}\n')
+                logger_debug(f' collect_candidate_lines: is EOS: yielding candidates\n{list(candidates)!r}\n')

-            yield candidates
-            candidates = []
+            yield list(candidates)
+            candidates_clear()
             in_copyright = 0
             previous_chars = None

@@ -4253,35 +4264,35 @@ def collect_candidate_lines(numbered_lines):
             ):

                 if TRACE:
-                    logger_debug(f' collect_candidate_lines: empty: yielding candidates\n{candidates!r}\n')
+                    logger_debug(f' collect_candidate_lines: empty: yielding candidates\n{list(candidates)!r}\n')

-                yield candidates
-                candidates = []
+                yield list(candidates)
+                candidates_clear()
                 in_copyright = 0
                 previous_chars = None

             else:
-                candidates_append((ln, line,))
+                candidates_append((ln, prepared,))
                 # and decrement our state
                 in_copyright -= 1
                 if TRACE:
                     logger_debug(' collect_candidate_lines: line is in copyright')

         elif candidates:
             if TRACE:
-                logger_debug(f' collect_candidate_lines: not in COP: yielding candidates\n{candidates!r}\n')
+                logger_debug(f' collect_candidate_lines: not in COP: yielding candidates\n{list(candidates)!r}\n')

-            yield candidates
-            candidates = []
+            yield list(candidates)
+            candidates_clear()
             in_copyright = 0
             previous_chars = None

     # finally
     if candidates:
         if TRACE:
-            logger_debug(f'collect_candidate_lines: finally yielding candidates\n{candidates!r}\n')
+            logger_debug(f'collect_candidate_lines: finally yielding candidates\n{list(candidates)!r}\n')

-        yield candidates
+        yield list(candidates)

 ################################################################################
 # TEXT PRE PROCESSING
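The three hunks above switch collect_candidate_lines() from rebuilding a Python list per group to a single reusable deque, yielding an independent list snapshot before clearing the buffer. A self-contained sketch of that pattern (simplified grouping rule and invented names, not the actual detector logic):

    from collections import deque

    def group_nonblank(numbered_lines):
        # Simplified stand-in: group consecutive non-blank lines, yielding a
        # list copy of the buffer and reusing the same deque for every group.
        buffer = deque()
        for ln, text in numbered_lines:
            if text.strip():
                buffer.append((ln, text))
            elif buffer:
                yield list(buffer)  # snapshot stays valid after the clear below
                buffer.clear()
        if buffer:
            yield list(buffer)

    lines = [(1, 'Copyright (c) 2003 ACME'), (2, 'All rights reserved.'), (3, ''), (4, 'SPDX-License-Identifier: MIT')]
    assert list(group_nonblank(lines)) == [
        [(1, 'Copyright (c) 2003 ACME'), (2, 'All rights reserved.')],
        [(4, 'SPDX-License-Identifier: MIT')],
    ]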
@@ -4299,12 +4310,29 @@ def collect_candidate_lines(numbered_lines):

 # less common rem comment line prefix in dos
 # less common dnl comment line prefix in autotools am/in
-remove_comment_markers = re.compile(r'^(rem|\@rem|dnl)\s+').sub
+remove_weird_comment_markers = re.compile(r'^(rem|\@rem|dnl)\s+').sub

 # common comment line prefix in man pages
 remove_man_comment_markers = re.compile(r'\."').sub


+def remove_code_comment_markers(s):
+    """
+    Return ``s`` removing code comments such as C and C++ style comment markers and assimilated
+
+    >>> remove_code_comment_markers("\\*#%; /\\/*a*/b/*c\\d#e%f \\*#%; /")
+    'a b c d e f'
+    """
+    return (s
+        .replace('/*', ' ')
+        .replace('*/', ' ')
+        .replace('*', ' ')
+        .replace('#', ' ')
+        .replace('%', ' ')
+        .strip(' \\/*#%;')
+    )
+
+
 def prepare_text_line(line):
     """
     Prepare a text ``line`` for copyright detection.
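A quick usage sketch for the new remove_code_comment_markers() helper added above (sample input invented for illustration, assuming the helper from the hunk is in scope):

    # Comment punctuation is stripped while the statement text is kept.
    line = '/* Copyright (c) 2001 Foo Corp. */'
    assert remove_code_comment_markers(line) == 'Copyright (c) 2001 Foo Corp.'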
@@ -4324,19 +4352,20 @@ def prepare_text_line(line):
         logger_debug(' prepare_text_line: after remove_printf_format_codes: ' + repr(line))

     # less common comment line prefixes
-    line = remove_comment_markers(' ', line)
+    line = remove_weird_comment_markers(' ', line)
     if TRACE_TOK:
-        logger_debug(' prepare_text_line: after remove_comment_markers : ' + repr(line))
+        logger_debug(' prepare_text_line: after remove_weird_comment_markers : ' + repr(line))

     line = remove_man_comment_markers(' ', line)
-
     if TRACE_TOK:
         logger_debug(' prepare_text_line: after remove_man_comment_markers: ' + repr(line))

+    line = remove_code_comment_markers(line)
+    if TRACE_TOK:
+        logger_debug(' prepare_text_line: after remove_code_comment_markers: ' + repr(line))
+
     line = (line
         # C and C++ style comment markers
-        .replace('/*', ' ').replace('*/', ' ')
-        .strip().strip('/*#')
         # in rst
         .replace('|copy|', ' (c) ')
         # un common pipe chars in some ascii art
@@ -4368,6 +4397,11 @@ def prepare_text_line(line):
         .replace('\\XA9', ' (c) ')
         .replace('\\A9', ' (c) ')
         .replace('\\a9', ' (c) ')
+        .replace('<A9>', ' (c) ')
+        .replace('XA9;', ' (c) ')
+        .replace('Xa9;', ' (c) ')
+        .replace('xA9;', ' (c) ')
+        .replace('xa9;', ' (c) ')
         # \xc2 is a Â
         .replace('\xc2', '')
         .replace('\\xc2', '')
@@ -4393,18 +4427,22 @@ def prepare_text_line(line):
         .replace('&amp;', '&')
         .replace('&amp', '&')
         .replace('&gt;', '>')
+        .replace('&#62;', '>')
         .replace('&gt', '>')
         .replace('&lt;', '<')
+        .replace('&#60;', '<')
         .replace('&lt', '<')

         # normalize (possibly repeated) quotes to unique single quote '
         # backticks ` and "
         .replace('`', "'")
         .replace('"', "'")
-        # u nicode prefix in Python strings
+        # u unicode prefix in legacy Python2 strings
         .replace(" u'", " '")
         # see https://github.com/nexB/scancode-toolkit/issues/3667
         .replace('§', " ")
+        # keep http
+        .replace('<http', " http")
     )

     if TRACE_TOK:
@@ -4427,15 +4465,18 @@ def prepare_text_line(line):
         # replace ('
         .replace('("', ' ')
         # some trailing garbage ')
-        .replace(u"')", ' ')
-        .replace(u"],", ' ')
+        .replace("')", ' ')
+        .replace("],", ' ')
     )
     if TRACE_TOK:
         logger_debug(' prepare_text_line: after replacements2: ' + repr(line))

-    line = strip_markup_text(line)
     # note that we do not replace the debian tag by a space: we remove it
-    line = strip_debian_markup(line)
+    # This "Debian" legacy copyright file <s> </s> markup tags seen in
+    # older copyright files. Note we replace by nothing.
+    line = line.replace("</s>", "").replace("<s>", "").replace("<s/>", "")
+
+    line = strip_known_markup_from_text(line)

     if TRACE_TOK:
         logger_debug(' prepare_text_line: after strip_markup: ' + repr(line))
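For illustration (sample string invented here), the inline replacement chain that takes over from the removed strip_debian_markup() call drops the legacy Debian <s>...</s> tags without inserting spaces:

    line = '<s>Copyright 2002-2005 John Doe <jdoe@example.com></s>'
    line = line.replace("</s>", "").replace("<s>", "").replace("<s/>", "")
    assert line == 'Copyright 2002-2005 John Doe <jdoe@example.com>'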
@@ -4455,14 +4496,13 @@ def prepare_text_line(line):
     # normalize to ascii text
     line = toascii(line, translit=True)

+    # remove stars
+    line = line.strip(' *')
+
     # normalize to use only LF as line endings so we can split correctly
     # and keep line endings
     line = unixlinesep(line)

-    # strip verbatim back slash and comment signs again at both ends of a line
-    # FIXME: this is done at the start of this function already
-    line = line.strip('\\/*#%;')
-
     # normalize spaces
     line = ' '.join(line.split())

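Finally, a tiny end-of-pipeline illustration (invented sample line) combining the new star stripping with the existing whitespace normalization shown in the hunk above:

    line = ' *   Copyright   (c)  2003   Example   Project  * '
    line = line.strip(' *')           # drop boxed-comment stars at both ends
    line = ' '.join(line.split())     # collapse runs of whitespace
    assert line == 'Copyright (c) 2003 Example Project'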