Skip to content

Commit fea65d3

Browse files
authored
Merge pull request #977 from nexB/better-pos-tagging
Improve copyright POS tagging
2 parents c52aa1e + a342e20 commit fea65d3

File tree

11 files changed

+68
-30
lines changed

11 files changed

+68
-30
lines changed

etc/scripts/testdata/livescan/expected.csv

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ Resource,type,name,base_name,extension,size,date,sha1,md5,mime_type,file_type,pr
1212
/package.json,file,package.json,package,.json,2200,2017-10-03,918376afce796ef90eeda1d6695f2289c90491ac,1f66239a9b850c5e60a9382dbe2162d2,text/plain,"ASCII text, with very long lines",JSON,False,True,False,False,True,False,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1313
/package.json,,,,,,,,,,,,,,,,,,,,,,mit,15.00,MIT License,Permissive,MIT,http://opensource.org/licenses/mit-license.php,http://opensource.org/licenses/mit-license.php,https://enterprise.dejacode.com/urn/urn:dje:license:mit,MIT,https://spdx.org/licenses/MIT,24,24,mit_27.RULE,False,[u'mit'],,,,,,,,,,,,,,,,,,,
1414
/package.json,,,,,,,,,,,,,,,,,,,,,,mit,100.00,MIT License,Permissive,MIT,http://opensource.org/licenses/mit-license.php,http://opensource.org/licenses/mit-license.php,https://enterprise.dejacode.com/urn/urn:dje:license:mit,MIT,https://spdx.org/licenses/MIT,24,24,mit.LICENSE,False,[u'mit'],,,,,,,,,,,,,,,,,,,
15-
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,Copyright (c) 2012 LearnBoost &lt [email protected]&gt,,,,,,,,,,,,,,,,,,
16-
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,,LearnBoost &lt,,,,,,,,,,,,,,,,,
15+
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,Copyright (c) 2012 LearnBoost <[email protected]>,,,,,,,,,,,,,,,,,,
16+
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,,LearnBoost,,,,,,,,,,,,,,,,,
1717
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12,12,,,,,,[email protected],,,,,,,,,,,,,,,,
1818
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,16,16,,,,,,,https://github.com/visionmedia/node-cookie-signature.git,,,,,,,,,,,,,,,
1919
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,27,27,,,,,,,https://github.com/visionmedia/node-cookie-signature/issues,,,,,,,,,,,,,,,

src/cluecode/copyrights.py

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -226,13 +226,13 @@ def detect(location):
226226
(r'^(Send|It|Mac|Support|Confidential|Information|Various|Mouse|Wheel'
227227
r'|Vendor|Commercial|Indemnified|Luxi|These|Several|GnuPG|WPA|Supplicant'
228228
r'|TagSoup|Contact|IA64|Foreign|Data|Atomic|Pentium|Note|Delay|Separa.*|Added'
229-
r'|Glib|Gnome|Gaim|Open|Possible|In|Read|Permissions?|New'
229+
r'|Glib|Gnome|Gaim|Open|Possible|In|Read|Permissions?|New|MIT'
230230
r')$', 'NN'),
231231

232232
# Various non CAPS
233233
(r'^(OR)$', 'NN'),
234234

235-
# Various rare non CAPS but NNP
235+
# Various rare non CAPS but NNP, treated as full names
236236
(r'^(FSF[\.,]?)$', 'NAME'),
237237

238238
# Windows XP
@@ -252,6 +252,9 @@ def detect(location):
252252
r'[Ff]unctionality|bgcolor|F+|Rewrote|Much|remains?,?|Implementation|earlier'
253253
r'|al.|is|laws|url|[Ss]ee)$', 'JUNK'),
254254

255+
# Some mixed case junk
256+
(r'^LastModified$', 'JUNK'),
257+
255258
# Some font names
256259
(r'^Lucida$', 'JUNK'),
257260

@@ -277,8 +280,8 @@ def detect(location):
277280

278281
(r'^\$?LastChangedDate\$?$', 'YR'),
279282

280-
# Misc corner cases
281-
(r'^Software,\',|\(Royal|PARADIGM|nexB|Antill\',$', 'NNP'),
283+
# Misc corner cases that are NNP
284+
(r'^Software,\',|\(Royal|PARADIGM|nexB|okunishinishi|yiminghe|Antill\',$', 'NNP'),
282285

283286
# rarer caps
284287
# EPFL-LRC/ICA
@@ -347,7 +350,8 @@ def detect(location):
347350
(r'^HOLDER\(S\)$', 'JUNK'),
348351
(r'^([Hh]olders?|HOLDERS?)$', 'HOLDER'),
349352

350-
(r'^([Rr]espective)$', 'NN'),
353+
# not NNPs
354+
(r'^([Rr]espective|JavaScript)$', 'NN'),
351355

352356
# affiliates or "and its affiliate(s)."
353357
(r'^[Aa]ffiliate(s|\(s\))?\.?$', 'NNP'),
@@ -495,11 +499,8 @@ def detect(location):
495499
# all CAPS word, all letters including an optional trailing single quote
496500
(r"^[A-Z]{2,}\'?$", 'CAPS'),
497501

498-
# email eventually in parens or brackets. The closing > or ) is optional
499-
(r'[\<\(][a-zA-Z0-9\+_\-\.\%]+(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]*\.[a-zA-Z]{2,5}?[\>\)]?', 'EMAIL'),
500-
501-
# email
502-
(r'[a-zA-Z0-9\+_\-\.\%]+(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]*\.[a-zA-Z]{2,5}?', 'EMAIL'),
502+
# email, optionally wrapped in parens or angle brackets, with optional trailing punctuation
503+
(r'^[\<\(]?[a-zA-Z0-9]+[a-zA-Z0-9\+_\-\.\%]*(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]+\.[a-zA-Z]{2,5}?[\>\)\.\,]*$', 'EMAIL'),
503504

504505
# URLS such as <(http://fedorahosted.org/lohit)>
505506
(r'[<\(]https?:.*[>\)]', 'URL'),
@@ -526,6 +527,7 @@ def detect(location):
526527

527528
# comma as a conjunction
528529
(r'^,$', 'CC'),
530+
529531
# .\" is not a noun
530532
(r'^\.\\\?"?$', 'JUNK'),
531533

@@ -538,7 +540,7 @@ def detect(location):
538540
# communications
539541
(r'communications', 'NNP'),
540542

541-
# Code variable names, snake case
543+
# Code variable names including snake case
542544
(r'^.*(_.*)+$', 'JUNK'),
543545

544546
# nouns (default)
@@ -1793,8 +1795,8 @@ def prepare_text_line(line):
17931795
"""
17941796
Prepare a line of text for copyright detection.
17951797
"""
1796-
re_sub = re.sub
17971798
# FIXME: maintain the original character positions
1799+
re_sub = re.sub
17981800

17991801
# strip whitespace
18001802
line = line.strip()
@@ -1809,13 +1811,10 @@ def prepare_text_line(line):
18091811
# replace ('
18101812
line = line.replace(r'("', ' ')
18111813

1812-
# strip comment markers
1813-
# common comment characters
1814-
line = line.strip('\\/*#%;')
18151814
# uncommon comment line prefix in DOS
18161815
line = re_sub('^rem\s+', ' ', line)
18171816
line = re_sub('^\@rem\s+', ' ', line)
1818-
# un common comment line prefix in autotools am/in
1817+
# less common comment line prefix in autotools am/in
18191818
line = re_sub('^dnl\s+', ' ', line)
18201819
# uncommon comment line prefix in man pages
18211820
line = re_sub('^\.\\\\"', ' ', line)
@@ -1830,7 +1829,9 @@ def prepare_text_line(line):
18301829
line = line.replace('&copy;', ' (c) ')
18311830
line = line.replace('&#169;', ' (c) ')
18321831
line = line.replace('&#xa9;', ' (c) ')
1832+
line = line.replace('&#XA9;', ' (c) ')
18331833
line = line.replace(u'\xa9', ' (c) ')
1834+
line = line.replace(u'\XA9', ' (c) ')
18341835
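# FIXME: \xc2 is likely the UTF-8 lead byte of the copyright sign (0xC2 0xA9) left behind when UTF-8 bytes are decoded as Latin-1; confirm and fix the decoding upstream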
18351836
line = line.replace(u'\xc2', '')
18361837

@@ -1840,6 +1841,16 @@ def prepare_text_line(line):
18401841
line = line.replace(u'&#13;&#10;', ' ')
18411842
line = line.replace(u'&#13;', ' ')
18421843
line = line.replace(u'&#10;', ' ')
1844+
# spaces
1845+
line = line.replace(u'&ensp;', ' ')
1846+
line = line.replace(u'&emsp;', ' ')
1847+
line = line.replace(u'&thinsp;', ' ')
1848+
1849+
# common named entities
1850+
line = line.replace(u'&quot;', '"').replace(u'&#34;', '"')
1851+
line = line.replace(u'&amp;', '&').replace(u'&#38;', '&')
1852+
line = line.replace(u'&gt;', '>').replace(u'&#62;', '>')
1853+
line = line.replace(u'&lt;', '<').replace(u'&#60;', '<')
18431854

18441855
# normalize (possibly repeated) quotes to unique single quote '
18451856
# backticks ` and "
@@ -1849,8 +1860,10 @@ def prepare_text_line(line):
18491860
# quotes to space? but t'so will be wrecked
18501861
# line = line.replace(u"'", ' ')
18511862

1852-
# remove explicit \\n
1863+
# treat explicit CR, LF and tabs as space
18531864
line = line.replace("\\n", ' ')
1865+
line = line.replace("\\r", ' ')
1866+
line = line.replace("\\t", ' ')
18541867

18551868
# remove backslash
18561869
line = line.replace("\\", ' ')
@@ -1906,4 +1919,8 @@ def prepare_text_line(line):
19061919
# why?
19071920
line = lowercase_well_known_word(line)
19081921

1922+
# strip comment markers
1923+
# common comment characters
1924+
line = line.strip('\\/*#%;')
1925+
19091926
return line

src/textcode/analysis.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -247,8 +247,8 @@ def as_unicode(line):
247247

248248
def remove_verbatim_cr_lf_tab_chars(s):
249249
"""
250-
Return a string replacinf by a space any verbatim but escaped line endings and
251-
tabs (such as a literal \n or \r \t).
250+
Return a string replacing by a space any verbatim but escaped line endings
251+
and tabs (such as a literal \n or \r \t).
252252
"""
253253
if not s:
254254
return s
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
\n\nCopyright (c) 2012 LearnBoost &lt;[email protected]&gt;\n\n
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
what:
2+
- copyrights
3+
- holders
4+
- holders_summary
5+
6+
copyrights:
7+
- Copyright (c) 2012 LearnBoost <[email protected]>
8+
holders:
9+
- LearnBoost
10+
holders_summary:
11+
- LearnBoost
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Otherwood (c) 2011 note this implementation is heavily based/inspired from the dictionary implementation
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
what:
2+
- copyrights
3+
copyrights:
4+
- Otherwood (c) 2011

tests/cluecode/data/ics/chromium-chrome-common-extensions-docs-examples-apps-hello-python-httplib2/__init__.py.yml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,12 @@ copyrights:
66
- Copyright 2006, Joe Gregorio contributors Thomas Broyer ([email protected])', James Antill
77
holders:
88
- Joe Gregorio contributors
9-
- Thomas Broyer James Antill
9+
- Thomas Broyer
10+
11+
- James Antill
1012
holders_summary:
1113
- Joe Gregorio contributors
12-
- Thomas Broyer James Antill
14+
- Thomas Broyer
15+
16+
- James Antill
1317
notes: extra trailing contribution should not be detected

tests/cluecode/data/ics/speex-include-speex/speex_types.h.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ what:
33
- holders
44
- holders_summary
55
copyrights:
6-
- (c) COPYRIGHT 1994-2002 by the Xiph.Org Foundation http://www.xiph.org/
6+
- (c) COPYRIGHT 1994-2002 by the Xiph.Org Foundation http://www.xiph.org
77
holders:
88
- the Xiph.Org Foundation
99
holders_summary:

tests/cluecode/data/ics/speex-libspeex/smallft.c.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ what:
33
- holders
44
- holders_summary
55
copyrights:
6-
- (c) COPYRIGHT 1994-2001 by the XIPHOPHORUS Company http://www.xiph.org/
6+
- (c) COPYRIGHT 1994-2001 by the XIPHOPHORUS Company http://www.xiph.org
77
holders:
88
- the XIPHOPHORUS Company
99
holders_summary:

0 commit comments

Comments
 (0)