@@ -226,13 +226,13 @@ def detect(location):
226226 (r'^(Send|It|Mac|Support|Confidential|Information|Various|Mouse|Wheel'
227227 r'|Vendor|Commercial|Indemnified|Luxi|These|Several|GnuPG|WPA|Supplicant'
228228 r'|TagSoup|Contact|IA64|Foreign|Data|Atomic|Pentium|Note|Delay|Separa.*|Added'
229- r'|Glib|Gnome|Gaim|Open|Possible|In|Read|Permissions?|New'
229+ r'|Glib|Gnome|Gaim|Open|Possible|In|Read|Permissions?|New|MIT '
230230 r')$' , 'NN' ),
231231
232232 # Various non CAPS
233233 (r'^(OR)$' , 'NN' ),
234234
235- # Various rare non CAPS but NNP
235+ # Various rare non CAPS but NNP, treated as full names
236236 (r'^(FSF[\.,]?)$' , 'NAME' ),
237237
238238 # Windows XP
@@ -252,6 +252,9 @@ def detect(location):
252252 r'[Ff]unctionality|bgcolor|F+|Rewrote|Much|remains?,?|Implementation|earlier'
253253 r'|al.|is|laws|url|[Ss]ee)$' , 'JUNK' ),
254254
255+ # Some mixed case junk
256+ (r'^LastModified$' , 'JUNK' ),
257+
255258 # Some font names
256259 (r'^Lucida$' , 'JUNK' ),
257260
@@ -277,8 +280,8 @@ def detect(location):
277280
278281 (r'^\$?LastChangedDate\$?$' , 'YR' ),
279282
280- # Misc corner cases
281- (r'^Software,\',|\(Royal|PARADIGM|nexB|Antill\',$' , 'NNP' ),
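283+ # Misc corner cases that are NNP. NOTE(review): '^' and '$' bind only to the first and last alternatives, and the space before "Antill" looks unintended - confirm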
284+ (r'^Software,\',|\(Royal|PARADIGM|nexB|okunishinishi|yiminghe| Antill\',$' , 'NNP' ),
282285
283286 # rarer caps
284287 # EPFL-LRC/ICA
@@ -347,7 +350,8 @@ def detect(location):
347350 (r'^HOLDER\(S\)$' , 'JUNK' ),
348351 (r'^([Hh]olders?|HOLDERS?)$' , 'HOLDER' ),
349352
350- (r'^([Rr]espective)$' , 'NN' ),
353+ # not NNPs
354+ (r'^([Rr]espective|JavaScript)$' , 'NN' ),
351355
352356 # affiliates or "and its affiliate(s)."
353357 (r'^[Aa]ffiliate(s|\(s\))?\.?$' , 'NNP' ),
@@ -495,11 +499,8 @@ def detect(location):
495499 # all CAPS word, all letters including an optional trailing single quote
496500 (r"^[A-Z]{2,}\'?$" , 'CAPS' ),
497501
498- # email eventually in parens or brackets. The closing > or ) is optional
499- (r'[\<\(][a-zA-Z0-9\+_\-\.\%]+(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]*\.[a-zA-Z]{2,5}?[\>\)]?' , 'EMAIL' ),
500-
501- # email
502- (r'[a-zA-Z0-9\+_\-\.\%]+(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]*\.[a-zA-Z]{2,5}?' , 'EMAIL' ),
502+ # email, optionally wrapped in parens or angle brackets, with optional trailing punctuation
503+ (r'^[\<\(]?[a-zA-Z0-9]+[a-zA-Z0-9\+_\-\.\%]*(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]+\.[a-zA-Z]{2,5}?[\>\)\.\,]*$' , 'EMAIL' ),
503504
504505 # URLS such as <(http://fedorahosted.org/lohit)>
505506 (r'[<\(]https?:.*[>\)]' , 'URL' ),
@@ -526,6 +527,7 @@ def detect(location):
526527
527528 # comma as a conjunction
528529 (r'^,$' , 'CC' ),
530+
529531 # .\" is not a noun
530532 (r'^\.\\\?"?$' , 'JUNK' ),
531533
@@ -538,7 +540,7 @@ def detect(location):
538540 # communications
539541 (r'communications' , 'NNP' ),
540542
541- # Code variable names, snake case
543+ # Code variable names including snake case
542544 (r'^.*(_.*)+$' , 'JUNK' ),
543545
544546 # nouns (default)
0 commit comments