Merge pull request #114 from LuminosoInsight/more-py3.7-fixes

slibs63 · web-flow · commit 6c59404e4ce7 · 2018-09-24T11:38:55.000-04:00
Fixes for Python 3.7.0
diff --git a/docs/index.rst b/docs/index.rst
@@ -301,8 +301,8 @@ Variants of UTF-8
 
 *ftfy.chardata* and *ftfy.build_data*: trivia about characters
 --------------------------------------------------------------
-These files load information about the character properties in Unicode 9.0.
-Yes, even if your version of Python doesn't support Unicode 9.0. This ensures
+These files load information about the character properties in Unicode 11.0.
+Yes, even if your version of Python doesn't support Unicode 11.0. This ensures
 that ftfy's behavior is consistent across versions.
 
 .. automodule:: ftfy.chardata
diff --git a/ftfy/badness.py b/ftfy/badness.py
@@ -128,7 +128,7 @@ def _make_weirdness_regex():
     '[ÂÃĂ][\x80-\x9f€ƒ‚„†‡ˆ‰‹Œ“•˜œŸ¡¢£¤¥¦§¨ª«¬¯°±²³µ¶·¸¹º¼½¾¿ˇ˘˝]|'
     # Characters we have to be a little more cautious about if they're at
     # the end of a word, but totally okay to fix in the middle
-    '[ÂÃĂ][›»‘”©™]\w|'
+    r'[ÂÃĂ][›»‘”©™]\w|'
     # Similar mojibake of low-numbered characters in MacRoman. Leaving out
     # most mathy characters because of false positives, but cautiously catching
     # "√±" (mojibake for "ñ") and "√∂" (mojibake for "ö") in the middle of a
@@ -141,7 +141,7 @@ def _make_weirdness_regex():
     # Also left out eye-like letters, including accented o's, for when ¬ is
     # the nose of a kaomoji.
     '[¬√][ÄÅÇÉÑÖÜáàâäãåçéèêëíìîïñúùûü†¢£§¶ß®©™≠ÆØ¥ªæø≤≥]|'
-    '\w√[±∂]\w|'
+    r'\w√[±∂]\w|'
     # ISO-8859-1, ISO-8859-2, or Windows-1252 mojibake of characters U+10000
     # to U+1FFFF. (The Windows-1250 and Windows-1251 versions might be too
     # plausible.)
diff --git a/ftfy/build_data.py b/ftfy/build_data.py
@@ -5,7 +5,7 @@
 classes we care about change, or if a new version of Python supports a new
 Unicode standard and we want it to affect our string decoding.
 
-The file that we generate is based on Unicode 9.0, as supported by Python 3.6.
+The file that we generate is based on Unicode 11.0, as supported by Python 3.7.
 You can certainly use it in earlier versions. This simply makes sure that we
 get consistent results from running ftfy on different versions of Python.
 
@@ -39,16 +39,16 @@ def make_char_data_file(do_it_anyway=False):
     Build the compressed data file 'char_classes.dat' and write it to the
     current directory.
 
-    If you run this, run it in Python 3.6 or later. It will run in earlier
-    versions, but you won't get the Unicode 9 standard, leading to inconsistent
-    behavior.
+    If you run this, run it in Python 3.7.0 or later. It will run in earlier
+    versions, but you won't get the Unicode 11 standard, leading to inconsistent
+    behavior. Pre-releases of Python 3.7 won't work (Unicode 11 wasn't out yet).
 
     To protect against this, running this in the wrong version of Python will
     raise an error unless you pass `do_it_anyway=True`.
     """
-    if sys.hexversion < 0x03060000 and not do_it_anyway:
+    if sys.hexversion < 0x030700f0 and not do_it_anyway:
         raise RuntimeError(
-            "This function should be run in Python 3.6 or later."
+            "This function should be run in Python 3.7.0 or later."
         )
 
     cclasses = [None] * 0x110000
diff --git a/ftfy/char_classes.dat b/ftfy/char_classes.dat
diff --git a/tests/test_futuristic_codepoints.py b/tests/test_futuristic_codepoints.py
@@ -40,3 +40,22 @@ def test_unicode_10():
     # all versions for consistency.
     thalim = "\U00011A1A\U00011A2C\U00011A01\U00011A38"
     assert sequence_weirdness(thalim) == 0
+
+
+def test_unicode_11():
+    # Unicode 11 added the mtavruli letters of the Georgian script. These
+    # letters are analogous to capital letters in that they can be used to
+    # emphasize text or write a headline.
+    #
+    # Python will convert to that form when running .upper() on Georgian text,
+    # starting in version 3.7.0. We want to recognize the result as reasonable
+    # text on all versions.
+    #
+    # This text is the mtavruli form of "ქართული ენა", meaning "Georgian
+    # language".
+
+    georgian_mtavruli_text = 'ᲥᲐᲠᲗᲣᲚᲘ ᲔᲜᲐ'
+    assert sequence_weirdness(georgian_mtavruli_text) == 0
+
+    mojibake = georgian_mtavruli_text.encode('utf-8').decode('sloppy-windows-1252')
+    assert fix_encoding(mojibake) == georgian_mtavruli_text