Skip to content

Commit ba17dbb

Browse files
committed
Improve copyright detection more
Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 1847eb4 commit ba17dbb

File tree

65 files changed

+193
-173
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

65 files changed

+193
-173
lines changed

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ ply==3.11
5454
publicsuffix2==2.20191221
5555
pyahocorasick==2.1.0
5656
pycparser==2.21
57-
pygmars==0.7.0
57+
pygmars==0.9.0
5858
Pygments==2.13.0
5959
pymaven-patch==0.3.2
6060
pyparsing==3.0.9

setup-mini.cfg

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ install_requires =
100100
plugincode >= 32.0.0
101101
publicsuffix2
102102
pyahocorasick >= 2.0.0
103-
pygmars >= 0.7.0
103+
pygmars >= 0.9.0
104104
pygments
105105
pymaven_patch >= 0.2.8
106106
requests >= 2.7.0
@@ -112,7 +112,7 @@ install_requires =
112112
xmltodict >= 0.11.0
113113
zipp >= 3.0.0; python_version < "3.9"
114114
typecode >= 30.0.1
115-
# typecode[full] >= 30.0.0
115+
# typecode[full] >= 30.0.1
116116
# extractcode[full] >= 31.0.0
117117

118118

@@ -199,6 +199,7 @@ scancode_post_scan =
199199
filter-clues = cluecode.plugin_filter_clues:RedundantCluesFilter
200200
consolidate = summarycode.plugin_consolidate:Consolidator
201201
license-references = licensedcode.licenses_reference:LicenseReference
202+
todo = summarycode.todo:AmbiguousDetectionsToDoPlugin
202203
classify = summarycode.classify_plugin:FileClassifier
203204

204205

setup.cfg

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ install_requires =
100100
plugincode >= 32.0.0
101101
publicsuffix2
102102
pyahocorasick >= 2.0.0
103-
pygmars >= 0.7.0
103+
pygmars >= 0.9.0
104104
pygments
105105
pymaven_patch >= 0.2.8
106106
requests >= 2.7.0
@@ -138,9 +138,9 @@ testing =
138138

139139
docs =
140140
Sphinx == 5.1.0
141-
sphinx-rtd-theme >= 0.5.0
142-
doc8 >= 0.8.1
141+
sphinx_rtd_theme >= 0.5.1
143142
sphinx-reredirects >= 0.1.2
143+
doc8 >= 0.8.1
144144
sphinx-autobuild
145145
sphinx-rtd-dark-mode>=1.3.0
146146
sphinx-copybutton

src/cluecode/copyrights.py

Lines changed: 32 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -869,6 +869,9 @@ def build_detection_from_node(
869869
(r'^Implementation-Vendor$', 'JUNK'),
870870
(r'^dnl$', 'JUNK'),
871871

872+
(r'^as$', 'NN'),
873+
(r'^[Vv]isit$', 'JUNK'),
874+
872875
(r'^rem$', 'JUNK'),
873876
(r'^REM$', 'JUNK'),
874877
(r'^Supports$', 'JUNK'),
@@ -995,6 +998,11 @@ def build_detection_from_node(
995998
(r'^Privacy$', 'JUNK'),
996999
(r'^within$', 'JUNK'),
9971000

1001+
(r'^official$', 'JUNK'),
1002+
(r'^duties$', 'JUNK'),
1003+
(r'^civil$', 'JUNK'),
1004+
(r'^servants?$', 'JUNK'),
1005+
9981006
# various trailing words that are junk
9991007
(r'^Copyleft$', 'JUNK'),
10001008
(r'^LegalCopyright$', 'JUNK'),
@@ -1060,6 +1068,7 @@ def build_detection_from_node(
10601068
# FIXME: may be lowercase instead?
10611069
(r'^Title:?$', 'JUNK'),
10621070
(r'^Debianized-By:?$', 'JUNK'),
1071+
(r'^[Dd]ebianized$', 'JUNK'),
10631072
(r'^Upstream-Maintainer:?$', 'JUNK'),
10641073
(r'^Content', 'JUNK'),
10651074
(r'^Upstream-Author:?$', 'JUNK'),
@@ -1307,7 +1316,7 @@ def build_detection_from_node(
13071316
(r'^DISCLAIMED', 'NN'),
13081317
(r'^Docs?$', 'NN'),
13091318
(r'^DOCUMENTATION', 'NN'),
1310-
(r'^Download', 'NN'),
1319+
(r'^Download', 'JUNK'),
13111320
(r'^DOM$', 'NN'),
13121321
(r'^Do$', 'NN'),
13131322
(r'^DoubleClick$', 'NN'),
@@ -1341,6 +1350,8 @@ def build_detection_from_node(
13411350
(r'^Except', 'NN'),
13421351
(r'^When$', 'NN'),
13431352
# (r'^Owner$', 'NN'),
1353+
(r'^Specifications?$', 'NN'),
1354+
(r'^Final$', 'NN'),
13441355
(r'^Holds$', 'NN'),
13451356
(r'^Image', 'NN'),
13461357
(r'^Supplier', 'NN'),
@@ -1369,7 +1380,7 @@ def build_detection_from_node(
13691380
(r'^GnuPG$', 'NN'),
13701381
(r'^Government.', 'NNP'),
13711382
(r'^OProfile$', 'NNP'),
1372-
(r'^Government', 'NN'),
1383+
(r'^Government$', 'COMP'),
13731384
(r'^Grants?\.?,?$', 'NN'),
13741385
(r'^Header', 'NN'),
13751386
(r'^HylaFAX$', 'NN'),
@@ -1513,7 +1524,7 @@ def build_detection_from_node(
15131524
(r'^Those$', 'NN'),
15141525
(r'^Timer', 'NN'),
15151526
(r'^TODO$', 'NN'),
1516-
(r'^Tool.?$', 'NN'),
1527+
(r'^Tools?.?$', 'NN'),
15171528
(r'^Trademarks?$', 'NN'),
15181529
(r'^True$', 'NN'),
15191530
(r'^TRUE$', 'NN'),
@@ -1563,19 +1574,6 @@ def build_detection_from_node(
15631574
(r'^AM$', 'NN'),
15641575
(r'^PM$', 'NN'),
15651576

1566-
(r'^January$', 'NN'),
1567-
(r'^February$', 'NN'),
1568-
(r'^March$', 'NN'),
1569-
(r'^April$', 'NN'),
1570-
(r'^May$', 'NN'),
1571-
(r'^June$', 'NN'),
1572-
(r'^July$', 'NN'),
1573-
(r'^August$', 'NN'),
1574-
(r'^September$', 'NN'),
1575-
(r'^October$', 'NN'),
1576-
(r'^November$', 'NN'),
1577-
(r'^December$', 'NN'),
1578-
15791577
(r'^Name[\.,]?$', 'NN'),
15801578
(r'^Co-Author[\.,]?$', 'NN'),
15811579
(r'^Author\'s$', 'NN'),
@@ -1584,10 +1582,12 @@ def build_detection_from_node(
15841582
(r'^Convention[\.,]?$', 'NN'),
15851583
(r'^Paris[\.,]?$', 'NN'),
15861584

1587-
# we do not include Jan and Jun that are common enough first names
1588-
(r'^(Feb|Mar|Apr|May|Jul|Aug|Sep|Oct|Nov|Dec)$', 'NN'),
1589-
(r'^(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)$', 'NN'),
1590-
(r'^(Mon|Tue|Wed|Thu|Fri|Sat|Sun)$', 'NN'),
1585+
(r'^([Jj]anuary|[Ff]ebruary|[Mm]arch|[Aa]pril|[Jj]uly|[Aa]ugust|[Ss]eptember|[Oo]ctober|[Nn]ovember|[Dd]ecember)$', 'NN'),
1586+
# we do not include May, Jan and Jun that are common enough first names
1587+
(r'^(Feb|Mar|Apr|Jul|Aug|Sep|Oct|Nov|Dec),?$', 'MONTH'),
1588+
1589+
(r'^([Mm]onday|[Tt]uesday|[Ww]ednesday|[Tt]hursday|[Ff]riday|[Ss]aturday|[Ss]unday),?$', 'DAY'),
1590+
(r'^(Mon|Tue|Wed|Thu|Fri|Sat|Sun|May),?$', 'NN'),
15911591

15921592
# misc words that are not NNs
15931593
# lowercase verbs ending in "ing"
@@ -1862,10 +1862,11 @@ def build_detection_from_node(
18621862
############################################################################
18631863

18641864
# "authors" or "contributors" is interesting, and so a tag of its own
1865+
(r'^[Aa]uthors,$', 'AUTHDOT'),
18651866
(r'^[Aa]uthor$', 'AUTH'),
18661867
(r'^[Aa]uthor\.$', 'AUTHDOT'),
18671868
(r'^[Aa]uthors?\.$', 'AUTHDOT'),
1868-
(r'^[Aa]uthors|author\'$', 'AUTHS'),
1869+
(r'^([Aa]uthors|author\')$', 'AUTHS'),
18691870
(r'^[Aa]uthor\(s\)$', 'AUTHS'),
18701871
(r'^[Aa]uthor\(s\)\.?$', 'AUTHDOT'),
18711872
# as javadoc
@@ -1978,7 +1979,7 @@ def build_detection_from_node(
19781979
############################################################################
19791980

19801981
# rare cases of a trailing + sign on years
1981-
(r'^20[0-1][0-9]\+$', 'YR-PLUS'),
1982+
(r'^20[0-3][0-9]\+$', 'YR-PLUS'),
19821983

19831984
# year or year ranges
19841985
# plain year with various leading and trailing punct
@@ -2007,6 +2008,9 @@ def build_detection_from_node(
20072008

20082009
(r'^(' + _YEAR_DASH_PRESENT + ')+$', 'YR'),
20092010

2011+
# ISO dates as in 2024-12-09
2012+
(r'^' + _YEAR + '-(0?[1-9]|1[012])-(0?[1-9]|[12][0-9]|3[01])$', 'YR'),
2013+
20102014
# 88, 93, 94, 95, 96: this is a pattern mostly used in FSF copyrights
20112015
(r'^[8-9][0-9],$', 'YR'),
20122016

@@ -2176,12 +2180,7 @@ def build_detection_from_node(
21762180
# End of line comments are rule descriptions.
21772181
# One rule per line.
21782182

2179-
USE_MAIN_BRANCH = False or os.environ.get('SCANCODE_COPYRIGHT_USE_MAIN_BRANCH', False)
2180-
2181-
if USE_MAIN_BRANCH:
2182-
from cluecode.copyrightorig import grammar as GRAMMAR
2183-
else:
2184-
GRAMMAR = """
2183+
GRAMMAR = """
21852184
21862185
#######################################
21872186
# YEARS
@@ -3111,7 +3110,7 @@ def build_detection_from_node(
31113110
COPYRIGHT: {<COPYRIGHT> <MAINT> } #83020
31123111
31133112
# copyright its authors
3114-
COPYRIGHT: {<COPY> <NN> <AUTHS>} #83030
3113+
COPYRIGHT: {<COPY> <NN> <AUTHDOT>} #83030
31153114
31163115
# Copyright: 2004-2007 by Internet Systems Consortium, Inc. ("ISC")
31173116
# 1995-2003 by Internet Software Consortium
@@ -3234,7 +3233,7 @@ def build_detection_from_node(
32343233
COPYRIGHT: {<COPY> <BY>? <AUTHOR>+ <YR-RANGE>*} #2800-1
32353234
32363235
COPYRIGHT: {<AUTHOR> <COPYRIGHT2>} #2820
3237-
COPYRIGHT: {<AUTHOR> <YR-RANGE>} #2830
3236+
32383237
# copyrighted by MIT
32393238
COPYRIGHT: {<COPY> <BY> <MIT>} #2840
32403239
@@ -3556,11 +3555,14 @@ def is_junk_copyryright(s, patterns=COPYRIGHTS_JUNK_PATTERN_MATCHERS):
35563555
'author\'',
35573556
'authors,',
35583557
'authorship',
3558+
'maintainer',
3559+
'co-maintainer',
35593560
'or',
35603561
'spdx-filecontributor',
35613562
'</b>',
35623563
'mailto:',
35633564
"name'",
3565+
"a",
35643566
])
35653567
))
35663568

src/licensedcode/data/licenses/d-fsl-1.0-en.LICENSE

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ ignorable_copyrights:
1818
ignorable_holders:
1919
- Ministry of Science and Research, State of North-Rhine Westphalia
2020
ignorable_authors:
21-
- Axel Metzger and Till Jaeger, Institut
21+
- Axel Metzger and Till Jaeger, Institut fur Rechtsfragen der Freien
2222
ignorable_urls:
2323
- http://www.d-fsl.org/
2424
- http://www.fsf.org/licenses/gpl
@@ -462,4 +462,4 @@ This Program may be used by anyone in accordance
462462
with the terms of the German Free Software
463463
License
464464

465-
The License may be obtained under <http://www.d-fsl.org>."
465+
The License may be obtained under <http://www.d-fsl.org>."

src/licensedcode/data/licenses/ietf-trust.LICENSE

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ ignorable_holders:
2121
- IETF Trust
2222
- IETF Trust and the persons identified as the document authors
2323
ignorable_authors:
24-
- IETF Trust
2524
- the IETF Trust
2625
ignorable_urls:
2726
- http://opensource.org/licenses/bsd-license.php
@@ -328,4 +327,4 @@ licensing administrator for the IRTF Document Stream and that these Legal Provis
328327
applied to documents submitted and published in the IRTF Document Stream following
329328
December 28, 2009. Section 4 of these Legal Provisions shall not apply to documents in the
330329
IRTF Document Stream, and all references to Section 4 hereof shall be disregarded with respect
331-
to documents in the IRTF Document Stream.
330+
to documents in the IRTF Document Stream.

src/licensedcode/data/licenses/minpack.LICENSE

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ ignorable_copyrights:
1717
ignorable_holders:
1818
- University of Chicago
1919
ignorable_authors:
20-
- the University of Chicago
20+
- the University of Chicago, as Operator of Argonne National Laboratory
2121
---
2222

2323
Minpack Copyright Notice (1999) University of Chicago. All rights reserved
@@ -70,4 +70,4 @@ PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER
7070
SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT
7171
(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE,
7272
EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE
73-
POSSIBILITY OF SUCH LOSS OR DAMAGES.
73+
POSSIBILITY OF SUCH LOSS OR DAMAGES.

src/licensedcode/data/licenses/qpopper.LICENSE

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ ignorable_holders:
1515
- QUALCOMM Incorporated
1616
- The Regents of the University of California
1717
ignorable_authors:
18-
- Austin Shelton
18+
- Austin Shelton, Information Systems and Technology, University of California, Berkeley
1919
---
2020

2121
Qpopper(tm) is licensed by QUALCOMM Incorporated under the following
@@ -158,4 +158,4 @@ Qpopper(tm) is licensed by QUALCOMM Incorporated under the following
158158
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
159159
POPPER SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE
160160
UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE
161-
MAINTENANCE, SUPPORT, UPDATES, ENCHANCEMENTS, OR MODIFICATIONS.
161+
MAINTENANCE, SUPPORT, UPDATES, ENCHANCEMENTS, OR MODIFICATIONS.

src/licensedcode/data/licenses/sparky.LICENSE

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ ignorable_copyrights:
1313
ignorable_holders:
1414
- The Regents of the University of California
1515
ignorable_authors:
16-
- the UCSF Resource
16+
- the UCSF Resource for Biocomputing, Visualization, and Informatics
1717
---
1818

1919
Copyright, License, and Disclaimer
@@ -48,4 +48,4 @@ OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
4848
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
4949
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
5050
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
51-
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
51+
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

src/licensedcode/data/licenses/us-govt-public-domain.LICENSE

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,13 @@ other_urls:
1313
- https://www.law.cornell.edu/uscode/text/17/105
1414
- https://en.wikipedia.org/wiki/Copyright_status_of_works_by_the_federal_government_of_the_United_States
1515
- https://en.wikipedia.org/wiki/Wikipedia:Public_domain#U.S._government_works
16+
ignorable_copyrights:
17+
- copyright United States Government
18+
ignorable_holders:
19+
- United States Government
1620
---
1721

1822
17 U.S. Code $ 105. Subject matter of copyright: United States Government works
1923

2024
Copyright protection under this title is not available for any work of the
21-
United States Government.
25+
United States Government.

0 commit comments

Comments
 (0)