Fix __str__ behavior

sloria · sloria · commit 84099ad9e957 · 2013-09-25T09:06:06.000-05:00
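- Adds implements_to_string class decorator. On Python 2 it
exposes the class's original __str__ (which returns unicode) as
__unicode__ and makes __str__ return that text encoded as UTF-8;
on Python 3 it leaves the class unchanged. This fixes print
statements so the output is as expected.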
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -8,6 +8,7 @@ Changelog
 - Basic extensions framework in place. TextBlob has been refactored to make it easier to develop extensions.
 - Add ``text.classifiers.PositiveNaiveBayesClassifier``.
 - Update NLTK.
+- Fix ``__str__`` behavior. ``print blob`` should now print expected output in both Python 2 and 3.
 - *Backwards-incompatible*: All abstract base classes have been moved to the ``text.base`` module.
 - *Backwards-incompatible*: ``PerceptronTagger`` will now be maintained as an extension, ``textblob-aptagger``. Instantiating a ``text.taggers.PerceptronTagger()`` will raise a ``DeprecationWarning``.
 
diff --git a/text/blob.py b/text/blob.py
@@ -27,7 +27,7 @@
 from text.utils import lowerstrip, PUNCTUATION_REGEX
 from text.inflect import singularize as _singularize, pluralize as _pluralize
 from text.mixins import BlobComparableMixin, StringlikeMixin
-from text.compat import unicode, basestring, python_2_unicode_compatible
+from text.compat import unicode, basestring
 from text.base import (BaseNPExtractor, BaseTagger, BaseTokenizer,
                        BaseSentimentAnalyzer, BaseParser)
 from text.np_extractors import FastNPExtractor
@@ -287,7 +287,6 @@ def _initialize_models(obj, tokenizer, pos_tagger,
     obj.classifier = classifier
 
 
-@python_2_unicode_compatible
 class BaseBlob(StringlikeMixin, BlobComparableMixin):
 
     '''An abstract base class that all text.blob classes will inherit from.
diff --git a/text/compat.py b/text/compat.py
@@ -19,6 +19,15 @@ def u(s):
     imap = imap
     izip = izip
     import unicodecsv as csv
+
+    def implements_to_string(cls):
+        '''Class decorator that renames __str__ to __unicode__ and
+        replaces __str__ with a version that returns UTF-8 encoded text.
+        '''
+        cls.__unicode__ = cls.__str__
+        cls.__str__ = lambda x: x.__unicode__().encode('utf-8')
+        return cls
+
 else: # PY3
     def b(s):
         return s.encode("latin-1")
@@ -35,6 +44,8 @@ def u(s):
     izip = zip
     import csv
 
+    implements_to_string = lambda x: x
+
 
 def add_metaclass(metaclass):
     """Class decorator for creating a class with a metaclass.
@@ -48,122 +59,3 @@ def wrapper(cls):
             orig_vars.pop(slots_var)
         return metaclass(cls.__name__, cls.__bases__, orig_vars)
     return wrapper
-
-# ======= Compatibility layer for __str__ and __repr__ from NLTK ==========
-
-import unicodedata
-import functools
-
-def remove_accents(text):
-
-    if isinstance(text, bytes):
-        text = text.decode('ascii')
-
-    category = unicodedata.category  # this gives a small (~10%) speedup
-    return ''.join(
-        c for c in unicodedata.normalize('NFKD', text) if category(c) != 'Mn'
-    )
-
-# Select the best transliteration method:
-try:
-    # Older versions of Unidecode are licensed under Artistic License;
-    # assume an older version is installed.
-    from unidecode import unidecode as transliterate
-except ImportError:
-    try:
-        # text-unidecode implementation is worse than Unidecode
-        # implementation so Unidecode is preferred.
-        from text_unidecode import unidecode as transliterate
-    except ImportError:
-        # This transliteration method should be enough
-        # for many Western languages.
-        transliterate = remove_accents
-
-
-def python_2_unicode_compatible(klass):
-    """
-    This decorator defines __unicode__ method and fixes
-    __repr__ and __str__ methods under Python 2.
-
-    To support Python 2 and 3 with a single code base,
-    define __str__ and __repr__ methods returning unicode
-    text and apply this decorator to the class.
-
-    Original __repr__ and __str__ would be available
-    as unicode_repr and __unicode__ (under both Python 2
-    and Python 3).
-    """
-
-    if not issubclass(klass, object):
-        raise ValueError("This decorator doesn't work for old-style classes")
-
-    # both __unicode__ and unicode_repr are public because they
-    # may be useful in console under Python 2.x
-
-    # if __str__ or __repr__ are not overriden in a subclass,
-    # they may be already fixed by this decorator in a parent class
-    # and we shouldn't them again
-
-    if not _was_fixed(klass.__str__):
-        klass.__unicode__ = klass.__str__
-        if PY2:
-            klass.__str__ = _7bit(_transliterated(klass.__unicode__))
-
-
-    if not _was_fixed(klass.__repr__):
-        klass.unicode_repr = klass.__repr__
-        if PY2:
-            klass.__repr__ = _7bit(klass.unicode_repr)
-
-    return klass
-
-
-def unicode_repr(obj):
-    """
-    For classes that was fixed with @python_2_unicode_compatible
-    ``unicode_repr`` returns ``obj.unicode_repr()``; for unicode strings
-    the result is returned without "u" letter (to make output the
-    same under Python 2.x and Python 3.x); for other variables
-    it is the same as ``repr``.
-    """
-    if not PY2:
-        return repr(obj)
-
-    # Python 2.x
-    if hasattr(obj, 'unicode_repr'):
-        return obj.unicode_repr()
-
-    if isinstance(obj, unicode):
-        return repr(obj)[1:]  # strip "u" letter from output
-
-    return repr(obj)
-
-
-def _transliterated(method):
-    def wrapper(self):
-        return transliterate(method(self))
-
-    functools.update_wrapper(wrapper, method, ["__name__", "__doc__"])
-    if hasattr(method, "_nltk_compat_7bit"):
-        wrapper._nltk_compat_7bit = method._nltk_compat_7bit
-
-    wrapper._nltk_compat_transliterated = True
-    return wrapper
-
-
-def _7bit(method):
-    def wrapper(self):
-        return method(self).encode('ascii', 'backslashreplace')
-
-    functools.update_wrapper(wrapper, method, ["__name__", "__doc__"])
-
-    if hasattr(method, "_nltk_compat_transliterated"):
-        wrapper._nltk_compat_transliterated = method._nltk_compat_transliterated
-
-    wrapper._nltk_compat_7bit = True
-    return wrapper
-
-
-def _was_fixed(method):
-    return (getattr(method, "_nltk_compat_7bit", False) or
-            getattr(method, "_nltk_compat_transliterated", False))
diff --git a/text/mixins.py b/text/mixins.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 import sys
-from text.compat import basestring, u
+from text.compat import basestring, implements_to_string
 
 
 class ComparableMixin(object):
@@ -46,6 +46,7 @@ def _compare(self, other, method):
         return super(BlobComparableMixin, self)._compare(other, method)
 
 
+@implements_to_string
 class StringlikeMixin(object):
 
     '''Make blob objects behave like Python strings.
@@ -61,18 +62,14 @@ def __repr__(self):
         return "{cls}({text})".format(cls=class_name,
                                         text=repr(self._strkey()))
 
-    def __len__(self):
-        '''Returns the length of the raw text.'''
-        return len(self._strkey())
-
     def __str__(self):
         '''Returns a string representation used in print statements
         or str(my_blob).'''
         return self._strkey()
 
-    def __unicode__(self):
-        '''Returns the unicode representation of the blob.'''
-        return u(self._strkey())
+    def __len__(self):
+        '''Returns the length of the raw text.'''
+        return len(self._strkey())
 
     def __iter__(self):
         '''Makes the object iterable as if it were a string,