Skip to content

Commit 7e8c029

Browse files
committed
Improve copyright POS tagging #930
* Minor changes, mostly from scanning several npm packages.
Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent dcb8a19 commit 7e8c029

File tree

4 files changed

+24
-13
lines changed

4 files changed

+24
-13
lines changed

src/cluecode/copyrights.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -226,13 +226,13 @@ def detect(location):
226226
(r'^(Send|It|Mac|Support|Confidential|Information|Various|Mouse|Wheel'
227227
r'|Vendor|Commercial|Indemnified|Luxi|These|Several|GnuPG|WPA|Supplicant'
228228
r'|TagSoup|Contact|IA64|Foreign|Data|Atomic|Pentium|Note|Delay|Separa.*|Added'
229-
r'|Glib|Gnome|Gaim|Open|Possible|In|Read|Permissions?|New'
229+
r'|Glib|Gnome|Gaim|Open|Possible|In|Read|Permissions?|New|MIT'
230230
r')$', 'NN'),
231231

232232
# Various non CAPS
233233
(r'^(OR)$', 'NN'),
234234

235-
# Various rare non CAPS but NNP
235+
# Various rare non CAPS but NNP, treated as full names
236236
(r'^(FSF[\.,]?)$', 'NAME'),
237237

238238
# Windows XP
@@ -252,6 +252,9 @@ def detect(location):
252252
r'[Ff]unctionality|bgcolor|F+|Rewrote|Much|remains?,?|Implementation|earlier'
253253
r'|al.|is|laws|url|[Ss]ee)$', 'JUNK'),
254254

255+
# Some mixed case junk
256+
(r'^LastModified$', 'JUNK'),
257+
255258
# Some font names
256259
(r'^Lucida$', 'JUNK'),
257260

@@ -277,8 +280,8 @@ def detect(location):
277280

278281
(r'^\$?LastChangedDate\$?$', 'YR'),
279282

280-
# Misc corner cases
281-
(r'^Software,\',|\(Royal|PARADIGM|nexB|Antill\',$', 'NNP'),
283+
# Misc corner cases that are NNP
284+
(r'^Software,\',|\(Royal|PARADIGM|nexB|okunishinishi|yiminghe|Antill\',$', 'NNP'),
282285

283286
# rarer caps
284287
# EPFL-LRC/ICA
@@ -347,7 +350,8 @@ def detect(location):
347350
(r'^HOLDER\(S\)$', 'JUNK'),
348351
(r'^([Hh]olders?|HOLDERS?)$', 'HOLDER'),
349352

350-
(r'^([Rr]espective)$', 'NN'),
353+
# not NNPs
354+
(r'^([Rr]espective|JavaScript)$', 'NN'),
351355

352356
# affiliates or "and its affiliate(s)."
353357
(r'^[Aa]ffiliate(s|\(s\))?\.?$', 'NNP'),
@@ -495,11 +499,8 @@ def detect(location):
495499
# all CAPS word, all letters including an optional trailing single quote
496500
(r"^[A-Z]{2,}\'?$", 'CAPS'),
497501

498-
# email eventually in parens or brackets. The closing > or ) is optional
499-
(r'[\<\(][a-zA-Z0-9\+_\-\.\%]+(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]*\.[a-zA-Z]{2,5}?[\>\)]?', 'EMAIL'),
500-
501-
# email
502-
(r'[a-zA-Z0-9\+_\-\.\%]+(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]*\.[a-zA-Z]{2,5}?', 'EMAIL'),
502+
# email eventually in parens or brackets with some trailing punct.
503+
(r'^[\<\(]?[a-zA-Z0-9]+[a-zA-Z0-9\+_\-\.\%]*(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]+\.[a-zA-Z]{2,5}?[\>\)\.\,]*$', 'EMAIL'),
503504

504505
# URLS such as <(http://fedorahosted.org/lohit)>
505506
(r'[<\(]https?:.*[>\)]', 'URL'),
@@ -526,6 +527,7 @@ def detect(location):
526527

527528
# comma as a conjunction
528529
(r'^,$', 'CC'),
530+
529531
# .\" is not a noun
530532
(r'^\.\\\?"?$', 'JUNK'),
531533

@@ -538,7 +540,7 @@ def detect(location):
538540
# communications
539541
(r'communications', 'NNP'),
540542

541-
# Code variable names, snake case
543+
# Code variable names including snake case
542544
(r'^.*(_.*)+$', 'JUNK'),
543545

544546
# nouns (default)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Otherwood (c) 2011 note this implementation is heavily based/inspired from the dictionary implementation
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
what:
2+
- copyrights
3+
copyrights:
4+
- Otherwood (c) 2011

tests/cluecode/data/ics/chromium-chrome-common-extensions-docs-examples-apps-hello-python-httplib2/__init__.py.yml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,12 @@ copyrights:
66
- Copyright 2006, Joe Gregorio contributors Thomas Broyer ([email protected])', James Antill
77
holders:
88
- Joe Gregorio contributors
9-
- Thomas Broyer James Antill
9+
- Thomas Broyer
10+
11+
- James Antill
1012
holders_summary:
1113
- Joe Gregorio contributors
12-
- Thomas Broyer James Antill
14+
- Thomas Broyer
15+
16+
- James Antill
1317
notes: extra trailing contribution should not be detected

0 commit comments

Comments
 (0)