Skip to content

Commit 02d7add

Browse files
committed
fix #8, add support for chained suffixes. remove support for titles with periods in them, e.g. "Lt.Gen."
the current parser logic can't support both, though it seems fixable
1 parent 1cbd161 commit 02d7add

File tree

5 files changed

+132
-49
lines changed

5 files changed

+132
-49
lines changed

docs/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ Supports 3 different comma placement variations in the input string.
3030

3131
* Title Firstname "Nickname" Middle Middle Lastname Suffix
3232
* Lastname [Suffix], Title Firstname (Nickname) Middle Middle[,] Suffix [, Suffix]
33-
* Title Firstname M Lastname [Suffix], Suffix [, Suffix]
33+
* Title Firstname M Lastname [Suffix], Suffix [Suffix] [, Suffix]
3434

3535
When there is ambiguity that cannot be resolved by a rule-based approach,
3636
HumanName prefers to handle the most common cases correctly. For example,

nameparser/config/suffixes.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
'chfc',
1919
'cfp',
2020
'md',
21+
'mba',
22+
'ma',
2123
'phd',
2224
'mp',
2325
'qc',

nameparser/parser.py

Lines changed: 37 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
from __future__ import unicode_literals
33

44
import logging
5-
from .util import u
6-
from .util import text_type
7-
from .util import lc
8-
from .config import CONSTANTS
9-
from .config import Constants
5+
from nameparser.util import u
6+
from nameparser.util import text_type
7+
from nameparser.util import lc
8+
from nameparser.config import CONSTANTS
9+
from nameparser.config import Constants
1010

1111
# http://code.google.com/p/python-nameparser/issues/detail?id=10
1212
log = logging.getLogger('HumanName')
@@ -264,7 +264,15 @@ def is_prefix(self, piece):
264264

265265
def is_suffix(self, piece):
266266
"""Is in the suffixes set or :py:func:`is_an_initial()`."""
267-
return lc(piece) in self.C.suffixes and not self.is_an_initial(piece)
267+
# suffixes may have periods inside them like "M.D."
268+
return lc(piece).replace('.','') in self.C.suffixes and not self.is_an_initial(piece)
269+
270+
def are_suffixes(self, pieces):
271+
"""Return True if all pieces are suffixes."""
272+
for piece in pieces:
273+
if not self.is_suffix(piece):
274+
return False
275+
return True
268276

269277
def is_rootname(self, piece):
270278
'''Is not a known title, suffix or prefix. Just first, middle, last names.'''
@@ -391,26 +399,24 @@ def parse_full_name(self):
391399
if not self.first:
392400
self.first_list.append(piece)
393401
continue
394-
if (i == len(pieces) - 2) and self.is_suffix(nxt):
402+
if self.are_suffixes(pieces[i+1:]):
395403
self.last_list.append(piece)
396-
self.suffix_list.append(nxt)
404+
self.suffix_list += pieces[i+1:]
397405
break
398406
if not nxt:
399407
self.last_list.append(piece)
400408
continue
401409

402410
self.middle_list.append(piece)
403411
else:
404-
if self.is_suffix(parts[1]):
412+
if self.are_suffixes(parts[1].split(' ')):
405413

406-
# suffix comma: title first middle last [suffix], suffix [, suffix]
414+
# suffix comma: title first middle last [suffix], suffix [suffix] [, suffix]
407415
# parts[0], parts[1:...]
408416

409417
self.suffix_list += parts[1:]
410-
411418
pieces = self.parse_pieces(parts[0].split(' '))
412419
log.debug("pieces: {0}".format(u(pieces)))
413-
414420
for i, piece in enumerate(pieces):
415421
try:
416422
nxt = pieces[i + 1]
@@ -423,9 +429,9 @@ def parse_full_name(self):
423429
if not self.first:
424430
self.first_list.append(piece)
425431
continue
426-
if (i == len(pieces) - 2) and self.is_suffix(nxt):
432+
if self.are_suffixes(pieces[i+1:]):
427433
self.last_list.append(piece)
428-
self.suffix_list.insert(0,nxt)
434+
self.suffix_list = pieces[i+1:] + self.suffix_list
429435
break
430436
if not nxt:
431437
self.last_list.append(piece)
@@ -475,6 +481,20 @@ def parse_full_name(self):
475481
self.unparsable = False
476482
self.post_process()
477483

484+
485+
# def split_periods(self, pieces):
486+
# """
487+
# If there is a period that is not at the end of a piece, split it on periods.
488+
# """
489+
# tmp = []
490+
# for piece in pieces:
491+
# if piece[:-1].find('.') >= 0:
492+
# p = [_f for _f in piece.split('.') if _f]
493+
# tmp += [x+'.' for x in p]
494+
# else:
495+
# tmp += [piece]
496+
# return tmp
497+
478498
def parse_pieces(self, parts, additional_parts_count=0):
479499
"""
480500
Split parts on spaces and remove commas, join on conjunctions and
@@ -489,21 +509,11 @@ def parse_pieces(self, parts, additional_parts_count=0):
489509
:rtype: list
490510
"""
491511

492-
ps = []
512+
tmp = []
493513
for part in parts:
494-
ps += [x.strip(' ,') for x in part.split(' ')]
495-
496-
# if there is a period that is not at the end of a piece, split it on periods
497-
pieces = []
498-
for piece in ps:
499-
if piece[:-1].find('.') >= 0:
500-
p = [_f for _f in piece.split('.') if _f]
501-
pieces += [x+'.' for x in p]
502-
else:
503-
pieces += [piece]
504-
514+
tmp += [x.strip(' ,') for x in part.split(' ')]
505515

506-
return self.join_on_conjunctions(pieces, additional_parts_count)
516+
return self.join_on_conjunctions(tmp, additional_parts_count)
507517

508518
def join_on_conjunctions(self, pieces, additional_parts_count=0):
509519
"""

nameparser/util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,4 @@ def lc(value):
2222
"""Lower case and remove any periods to normalize for comparison."""
2323
if not value:
2424
return ''
25-
return value.lower().replace('.','')
25+
return value.lower().strip('.')

tests.py

Lines changed: 91 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1044,7 +1044,9 @@ def test_lowercase_middle_initial_comma_lastname_and_suffix_conflict_with_conjun
10441044
self.m(hn.last, "Smith", hn)
10451045
self.m(hn.suffix, "III, Jr", hn)
10461046

1047+
@unittest.expectedFailure
10471048
def test_two_initials_conflict_with_conjunction(self):
1049+
# Supporting this seems to screw up titles with periods in them like M.B.A.
10481050
hn = HumanName('E.T. Smith')
10491051
self.m(hn.first, "E.", hn)
10501052
self.m(hn.middle, "T.", hn)
@@ -1270,7 +1272,85 @@ def test_parenthesis_are_removed2(self):
12701272
self.m(hn.suffix, "Jr.", hn)
12711273

12721274

1275+
class SuffixesTestCase(HumanNameTestBase):
1276+
1277+
def test_suffix(self):
1278+
hn = HumanName("Joe Franklin Jr")
1279+
self.m(hn.first, "Joe", hn)
1280+
self.m(hn.last, "Franklin", hn)
1281+
self.m(hn.suffix, "Jr", hn)
1282+
1283+
def test_suffix_with_periods(self):
1284+
hn = HumanName("Joe Dentist D.D.S.")
1285+
self.m(hn.first, "Joe", hn)
1286+
self.m(hn.last, "Dentist", hn)
1287+
self.m(hn.suffix, "D.D.S.", hn)
1288+
1289+
def test_two_suffixes(self):
1290+
hn = HumanName("Kenneth Clarke QC MP")
1291+
self.m(hn.first, "Kenneth", hn)
1292+
self.m(hn.last, "Clarke", hn)
1293+
# NOTE: this adds a comma when the original format did not have one.
1294+
# not ideal, but at least it's in the right bucket
1295+
self.m(hn.suffix, "QC, MP", hn)
1296+
1297+
def test_two_suffixes_lastname_comma_format(self):
1298+
hn = HumanName("Washington Jr. MD, Franklin")
1299+
self.m(hn.first, "Franklin", hn)
1300+
self.m(hn.last, "Washington", hn)
1301+
# NOTE: this adds a comma when the original format did not have one.
1302+
self.m(hn.suffix, "Jr., MD", hn)
1303+
1304+
def test_two_suffixes_suffix_comma_format(self):
1305+
hn = HumanName("Franklin Washington, Jr. MD")
1306+
self.m(hn.first, "Franklin", hn)
1307+
self.m(hn.last, "Washington", hn)
1308+
self.m(hn.suffix, "Jr. MD", hn)
1309+
1310+
def test_suffix_containing_periods(self):
1311+
hn = HumanName("Kenneth Clarke Q.C.")
1312+
self.m(hn.first, "Kenneth", hn)
1313+
self.m(hn.last, "Clarke", hn)
1314+
self.m(hn.suffix, "Q.C.", hn)
1315+
1316+
def test_suffix_containing_periods_lastname_comma_format(self):
1317+
hn = HumanName("Clarke, Kenneth, Q.C. M.P.")
1318+
self.m(hn.first, "Kenneth", hn)
1319+
self.m(hn.last, "Clarke", hn)
1320+
self.m(hn.suffix, "Q.C. M.P.", hn)
1321+
1322+
def test_suffix_containing_periods_suffix_comma_format(self):
1323+
hn = HumanName("Kenneth Clarke Q.C., M.P.")
1324+
self.m(hn.first, "Kenneth", hn)
1325+
self.m(hn.last, "Clarke", hn)
1326+
self.m(hn.suffix, "Q.C., M.P.", hn)
1327+
1328+
def test_suffix_with_single_comma_format(self):
1329+
hn = HumanName("John Doe jr., MD")
1330+
self.m(hn.first, "John", hn)
1331+
self.m(hn.last, "Doe", hn)
1332+
self.m(hn.suffix, "jr., MD", hn)
1333+
1334+
def test_suffix_with_double_comma_format(self):
1335+
hn = HumanName("Doe, John jr., MD")
1336+
self.m(hn.first, "John", hn)
1337+
self.m(hn.last, "Doe", hn)
1338+
self.m(hn.suffix, "jr., MD", hn)
1339+
1340+
#http://en.wikipedia.org/wiki/Ma_(surname)
1341+
def test_potential_suffix_that_is_also_last_name(self):
1342+
hn = HumanName("Jack Ma")
1343+
self.m(hn.first, "Jack", hn)
1344+
self.m(hn.last, "Ma", hn)
1345+
1346+
def test_potential_suffix_that_is_also_last_name_with_suffix(self):
1347+
hn = HumanName("Jack Ma Jr")
1348+
self.m(hn.first, "Jack", hn)
1349+
self.m(hn.last, "Ma", hn)
1350+
self.m(hn.suffix, "Jr", hn)
1351+
12731352
class HumanNameTitleTestCase(HumanNameTestBase):
1353+
12741354
def test_last_name_is_also_title(self):
12751355
hn = HumanName("Amy E Maid")
12761356
self.m(hn.first, "Amy", hn)
@@ -1306,18 +1386,6 @@ def test_title_is_title(self):
13061386
hn = HumanName("Coach")
13071387
self.m(hn.title, "Coach", hn)
13081388

1309-
def test_suffix_with_single_comma_format(self):
1310-
hn = HumanName("John Doe jr., MD")
1311-
self.m(hn.first, "John", hn)
1312-
self.m(hn.last, "Doe", hn)
1313-
self.m(hn.suffix, "jr., MD", hn)
1314-
1315-
def test_suffix_with_double_comma_format(self):
1316-
hn = HumanName("Doe, John jr., MD")
1317-
self.m(hn.first, "John", hn)
1318-
self.m(hn.last, "Doe", hn)
1319-
self.m(hn.suffix, "jr., MD", hn)
1320-
13211389
# TODO: fix handling of U.S.
13221390
@unittest.expectedFailure
13231391
def test_chained_title_first_name_initial(self):
@@ -1354,17 +1422,17 @@ def test_chained_hyphenated_title_with_comma_suffix(self):
13541422
self.m(hn.middle, "G", hn)
13551423
self.m(hn.last, "Davis", hn)
13561424
self.m(hn.suffix, "III", hn)
1357-
1425+
13581426
@unittest.expectedFailure
13591427
def test_title_multiple_titles_with_conjunctions(self):
1360-
# I think it finds the index of the wrong 'the'. I get confused because it
1428+
# FIXME: I think it finds the index of the wrong 'the'. I get confused because it
13611429
# loops in reverse order.
13621430
hn = HumanName("The Right Hon. the President of the Queen's Bench Division")
13631431
self.m(hn.title, "The Right Hon. the President of the Queen's Bench Division", hn)
13641432

13651433
@unittest.expectedFailure
13661434
def test_conjunction_before_title(self):
1367-
# TODO: seems fixable
1435+
# FIXME: seems fixable
13681436
hn = HumanName('The Lord of the Universe')
13691437
self.m(hn.title, "The Lord of the Universe", hn)
13701438

@@ -1388,7 +1456,9 @@ def test_title_with_last_initial_is_suffix(self):
13881456
self.m(hn.first, "John", hn)
13891457
self.m(hn.last, "V.", hn)
13901458

1391-
def test_lc_comparison_of_title(self):
1459+
@unittest.expectedFailure
1460+
def test_two_title_parts_separated_by_commas(self):
1461+
# supporting this currently messes up supporting suffixes like M.B.A.
13921462
hn = HumanName("Lt.Gen. John A. Kenneth Doe IV")
13931463
self.m(hn.title, "Lt. Gen.", hn)
13941464
self.m(hn.first, "John", hn)
@@ -1438,10 +1508,11 @@ def test_possible_conflict_with_suffix_that_could_be_initial(self):
14381508

14391509
# 'ben' is removed from PREFIXES in v0.2.5
14401510
# we could re-enable this test if we decide to support 'ben' as a prefix
1441-
# def test_ben_as_conjunction(self):
1442-
# hn = HumanName("Ahmad ben Husain")
1443-
# self.m(hn.first,"Ahmad", hn)
1444-
# self.m(hn.last,"ben Husain", hn)
1511+
@unittest.expectedFailure
1512+
def test_ben_as_conjunction(self):
1513+
hn = HumanName("Ahmad ben Husain")
1514+
self.m(hn.first,"Ahmad", hn)
1515+
self.m(hn.last,"ben Husain", hn)
14451516

14461517
def test_ben_as_first_name(self):
14471518
hn = HumanName("Ben Johnson")

0 commit comments

Comments
 (0)