1- import re
21import operator
3-
4- import textblob
5- import nltk
2+ import sys
63
74from . import exceptions
85from . import detectors
9- from .filth import Filth , MergedFilth
6+ from .filth import Filth
107
118
129class Scrubber (object ):
@@ -50,8 +47,10 @@ def clean(self, text, **kwargs):
5047 through to the ``Filth.replace_with`` method to fine-tune how the
5148 ``Filth`` is cleaned.
5249 """
53- if not isinstance (text , unicode ):
54- raise exceptions .UnicodeRequired
50+ if sys .version_info < (3 , 0 ):
51+ # Only in Python 2. In 3 every string is a Python 2 unicode
52+ if not isinstance (text , unicode ):
53+ raise exceptions .UnicodeRequired
5554
5655 clean_chunks = []
5756 filth = Filth ()
@@ -73,14 +72,15 @@ def iter_filth(self, text):
7372 # over all detectors simultaneously. just trying to get something
7473 # working right now and we can worry about efficiency later
7574 all_filths = []
76- for detector in self ._detectors .itervalues ():
75+ for detector in self ._detectors .values ():
7776 for filth in detector .iter_filth (text ):
7877 if not isinstance (filth , Filth ):
7978 raise TypeError ('iter_filth must always yield Filth' )
8079 all_filths .append (filth )
8180
82- # Sort per start position and substrings
83- all_filths = sorted (all_filths , cmp = _sort_filths )
81+ # Sort by start position. If two filths start in the same place then
82+ # return the longer one first
83+ all_filths .sort (key = lambda f : (f .beg , - f .end ))
8484
8585 # this is where the Scrubber does its hard work and merges any
8686 # overlapping filths.
@@ -94,22 +94,3 @@ def iter_filth(self, text):
9494 else :
9595 filth = filth .merge (next_filth )
9696 yield filth
97-
98-
def _sort_filths(a_filth, b_filth):
    """Compare two filths by start position, longest span first.

    This is an old-style ``cmp`` comparator: it can be passed as
    ``sorted(..., cmp=_sort_filths)`` on Python 2 or wrapped with
    ``functools.cmp_to_key`` on Python 3.

    :param a_filth: object exposing comparable ``beg`` and ``end`` attributes.
    :param b_filth: object exposing comparable ``beg`` and ``end`` attributes.
    :return: ``-1`` if ``a_filth`` sorts first, ``1`` if ``b_filth`` sorts
        first, ``0`` if both cover exactly the same span.
    """
    # whichever filth starts earlier sorts first
    if a_filth.beg != b_filth.beg:
        return -1 if a_filth.beg < b_filth.beg else 1

    # both filths start in the same position: the longer one, which
    # includes the other, sorts first
    if a_filth.end != b_filth.end:
        return -1 if a_filth.end > b_filth.end else 1

    # identical spans must compare equal; answering 1 here would claim that
    # each filth is greater than the other, which is not a valid ordering
    return 0
0 commit comments