improve string format handling of nickname punctuation #41

derek73 · derek73 · commit 2d267dac9cd5 · 2016-03-14T19:15:05.000-07:00
remove empty quotes and parentheses so they can be included in the formatting string
diff --git a/README.rst b/README.rst
@@ -47,15 +47,16 @@ Quick Start Example
     'Juan de la Vega'
 
 
-3 different comma placement variations are supported for the string that you pass.
+3 different comma placement variations are supported:
 
 * Title Firstname "Nickname" Middle Middle Lastname Suffix
 * Lastname [Suffix], Title Firstname (Nickname) Middle Middle[,] Suffix [, Suffix]
 * Title Firstname M Lastname [Suffix], Suffix [Suffix] [, Suffix]
 
-The parser does not make any attempt to clean the data. It mostly just splits on white
+The parser does not make any attempt to clean the input. It mostly just splits on white
 space and puts things in buckets based on their position in the string. This also means
-the difference between 'title' and 'suffix' is positional, not semantic. ("Pre-nominal"
+the difference between 'title' and 'suffix' is positional, not semantic. "Dr" is a title
+when it comes before the name and a suffix when it comes after. ("Pre-nominal"
 and "post-nominal" would probably be better names.)
 
 ::
diff --git a/docs/customize.rst b/docs/customize.rst
@@ -5,7 +5,10 @@ Pre-processing
 Name buckets
 ++++++++++++++
 
-Each attribute has a corresponding ordered list of name pieces. 
+Each attribute has a corresponding ordered list of name pieces. If you're doing
+pre- or post-processing you may wish to manipulate these lists directly. 
+The strings returned by the attribute names just join these lists with spaces.
+
 
 * o.title_list
 * o.first_list
@@ -14,9 +17,6 @@ Each attribute has a corresponding ordered list of name pieces.
 * o.suffix_list
 * o.nickname_list
 
-If you're doing pre- or post-processing you may wish to manipulate these lists directly. 
-The strings returned by the attribute names just join these lists with spaces.
-
 ::
 
   >>> hn = HumanName("Juan Q. Xavier Velasquez y Garcia, Jr.")
@@ -45,6 +45,33 @@ directly to the attribute.
   	nickname: ''
   ]>
 
+Controlling the string representation with string formatting
+============================================================
+
+You can control which name fields are included in the `str()` representation of a `HumanName` instance by changing its `string_format` attribute. Don't want to include nicknames in your output? No problem. 
+
+::
+
+  >>> name = HumanName("Dr. Juan de la Vega (Doc Vega)")
+  >>> str(name)
+  'Dr. Juan de la Vega Doc Vega'
+  >>> name.string_format = "{title} {first} {middle} {last}, {suffix}"
+  >>> str(name)
+  'Dr. Juan de la Vega'
+
+Trailing commas, empty quotes, and empty parentheses are automatically removed.
+
+::
+
+  >>> name = HumanName('Robert Johnson')
+  >>> name.string_format = "{title} {first} {middle} {last} {suffix} ({nickname})"
+  >>> str(name)
+  'Robert Johnson'
+  >>> name = HumanName('Robert "Rob" Johnson')
+  >>> name.string_format = "{title} {first} {middle} {last} {suffix} ({nickname})"
+  >>> str(name)
+  'Robert Johnson (Rob)'
+
 
 Customizing the Parser with Your Own Configuration
 ==================================================
@@ -54,17 +81,20 @@ matching the lower case characters of a name piece with pre-defined sets
 of strings located in :py:mod:`nameparser.config`. You can adjust
 these predefined sets to help fine tune the parser for your dataset.
 
-Parser Constants:
+Editable CONSTANTS sets:
 
-* `CONSTANTS.titles` - Pieces that come before the name. Cannot include things that may be first names
-* `CONSTANTS.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David"
-* `CONSTANTS.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d."
-* `CONSTANTS.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr."
-* `CONSTANTS.conjunctions` - Connectors like "and" that join the preceeding piece to the following piece.
-* `CONSTANTS.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceeding
-* `CONSTANTS.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D"
-* `CONSTANTS.regexes` - Regular expressions used to find words, initials, nicknames, etc.
+* `titles` - Pieces that come before the name. Cannot include things that may be first names
+* `first_name_titles` - Titles that, when followed by a single name, indicate that the name is a first name, e.g. "King David"
+* `suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d."
+* `suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr."
+* `conjunctions` - Connectors like "and" that join the preceding piece to the following piece.
+* `prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceding
+* `capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D"
+* `regexes` - Regular expressions used to find words, initials, nicknames, etc.
 
+Each set of constants comes with `add()` and `remove()` methods for tuning 
+the constants for your project. These methods automatically lowercase the
+given strings and remove punctuation to normalize them for comparison.
 
 Changing the Parser Constants
 +++++++++++++++++++++++++++++++++
diff --git a/nameparser/parser.py b/nameparser/parser.py
@@ -117,7 +117,10 @@ def __next__(self):
     def __unicode__(self):
         if self.string_format:
             # string_format = "{title} {first} {middle} {last} {suffix} ({nickname})"
-            return self.collapse_whitespace(self.string_format.format(**self.as_dict())).strip(', ')
+            _s = self.string_format.format(**self.as_dict())
+            # remove trailing punctuation from missing nicknames
+            _s = _s.replace(" ()","").replace(" ''","").replace(' ""',"")
+            return self.collapse_whitespace(_s).strip(', ')
         return " ".join(self)
     
     def __str__(self):
diff --git a/tests.py b/tests.py
@@ -1433,6 +1433,13 @@ def test_suffix_with_double_comma_format(self):
         self.m(hn.last, "Doe", hn)
         self.m(hn.suffix, "jr., MD", hn)
 
+    @unittest.expectedFailure
+    def test_phd_with_erroneous_space(self):
+        hn = HumanName("John Smith, Ph. D.")
+        self.m(hn.first, "John", hn)
+        self.m(hn.last, "Smith", hn)
+        self.m(hn.suffix, "Ph. D.", hn)
+
     #http://en.wikipedia.org/wiki/Ma_(surname)
     def test_potential_suffix_that_is_also_last_name(self):
         hn = HumanName("Jack Ma")
@@ -1750,13 +1757,69 @@ def test_downcasing_mc(self):
 
 
 class HumanNameOutputFormatTests(HumanNameTestBase):
+    
     def test_formating(self):
         hn = HumanName("Rev John A. Kenneth Doe III (Kenny)")
         hn.string_format = "{title} {first} {middle} {last} {suffix} ({nickname})"
         self.assertEqual(u(hn), "Rev John A. Kenneth Doe III (Kenny)")
         hn.string_format = "{last}, {title} {first} {middle}, {suffix} ({nickname})"
         self.assertEqual(u(hn), "Doe, Rev John A. Kenneth, III (Kenny)")
 
+    def test_quote_nickname_formating(self):
+        hn = HumanName("Rev John A. Kenneth Doe III (Kenny)")
+        hn.string_format = "{title} {first} {middle} {last} {suffix} '{nickname}'"
+        self.assertEqual(u(hn), "Rev John A. Kenneth Doe III 'Kenny'")
+        hn.string_format = "{last}, {title} {first} {middle}, {suffix} '{nickname}'"
+        self.assertEqual(u(hn), "Doe, Rev John A. Kenneth, III 'Kenny'")
+
+    def test_formating_removing_keys_from_format_string(self):
+        hn = HumanName("Rev John A. Kenneth Doe III (Kenny)")
+        hn.string_format = "{title} {first} {middle} {last} {suffix} '{nickname}'"
+        self.assertEqual(u(hn), "Rev John A. Kenneth Doe III 'Kenny'")
+        hn.string_format = "{last}, {title} {first} {middle}, {suffix}"
+        self.assertEqual(u(hn), "Doe, Rev John A. Kenneth, III")
+        hn.string_format = "{last}, {title} {first} {middle}"
+        self.assertEqual(u(hn), "Doe, Rev John A. Kenneth")
+        hn.string_format = "{last}, {first} {middle}"
+        self.assertEqual(u(hn), "Doe, John A. Kenneth")
+        hn.string_format = "{last}, {first}"
+        self.assertEqual(u(hn), "Doe, John")
+        hn.string_format = "{first} {last}"
+        self.assertEqual(u(hn), "John Doe")
+
+    def test_formating_removing_pieces_from_name_buckets(self):
+        hn = HumanName("Rev John A. Kenneth Doe III (Kenny)")
+        hn.string_format = "{title} {first} {middle} {last} {suffix} '{nickname}'"
+        self.assertEqual(u(hn), "Rev John A. Kenneth Doe III 'Kenny'")
+        hn.string_format = "{title} {first} {middle} {last} {suffix}"
+        self.assertEqual(u(hn), "Rev John A. Kenneth Doe III")
+        hn.middle=''
+        self.assertEqual(u(hn), "Rev John Doe III")
+        hn.suffix=''
+        self.assertEqual(u(hn), "Rev John Doe")
+        hn.title=''
+        self.assertEqual(u(hn), "John Doe")
+
+    def test_formating_of_nicknames_with_parenthesis(self):
+        hn = HumanName("Rev John A. Kenneth Doe III (Kenny)")
+        hn.string_format = "{title} {first} {middle} {last} {suffix} ({nickname})"
+        self.assertEqual(u(hn), "Rev John A. Kenneth Doe III (Kenny)")
+        hn.nickname=''
+        self.assertEqual(u(hn), "Rev John A. Kenneth Doe III")
+
+    def test_formating_of_nicknames_with_single_quotes(self):
+        hn = HumanName("Rev John A. Kenneth Doe III (Kenny)")
+        hn.string_format = "{title} {first} {middle} {last} {suffix} '{nickname}'"
+        self.assertEqual(u(hn), "Rev John A. Kenneth Doe III 'Kenny'")
+        hn.nickname=''
+        self.assertEqual(u(hn), "Rev John A. Kenneth Doe III")
+
+    def test_formating_of_nicknames_with_double_quotes(self):
+        hn = HumanName("Rev John A. Kenneth Doe III (Kenny)")
+        hn.string_format = "{title} {first} {middle} {last} {suffix} \"{nickname}\""
+        self.assertEqual(u(hn), "Rev John A. Kenneth Doe III \"Kenny\"")
+        hn.nickname=''
+        self.assertEqual(u(hn), "Rev John A. Kenneth Doe III")
 
 TEST_NAMES = (
     "John Doe",