Skip to content

Commit 3cf2c86

Browse files
author
Lance Nathan
authored
Merge pull request #110 from LuminosoInsight/latin1-inconsistent-mojibake
Make the partial mojibake fixer actually work on Latin-1
2 parents 5108b79 + af14c72 commit 3cf2c86

File tree

7 files changed

+72
-18
lines changed

7 files changed

+72
-18
lines changed

CHANGELOG.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,21 @@
1+
## Version 5.5 (September 6, 2018)
2+
3+
- Recent versions have emphasized making a reasonable attempt to fix short,
4+
common mojibake sequences, such as `Ã»`. In this version, we've expanded the
5+
heuristics to recognize these sequences in MacRoman as well as Windows-125x
6+
encodings.
7+
8+
- A related rule for fixing isolated Windows-1252/UTF-8 mixups, even when they
9+
were inconsistent with the rest of the string, claimed to work on Latin-1/UTF-8
10+
mixups as well, but in practice it didn't. We've made the rule more robust.
11+
12+
- Fixed a failure when testing the CLI on Windows.
13+
14+
- Removed the `pytest-runner` invocation from setup.py, as it created complex
15+
dependencies that would stop setup.py from working in some environments.
16+
The `pytest` command still works fine. `pytest-runner` is just too clever.
17+
18+
119
## Version 5.4.1 (June 14, 2018)
220

321
- Fixed a bug in the `setup.py` metadata.
@@ -19,6 +37,8 @@
1937

2038
- Provides better metadata for the new PyPI.
2139

40+
- Switched from nosetests to pytest.
41+
2242

2343
## Version 5.3 (January 25, 2018)
2444

ftfy/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from ftfy import fixes
1111
from ftfy.formatting import display_ljust
1212

13-
__version__ = '5.4.1'
13+
__version__ = '5.5.0'
1414

1515

1616
# See the docstring for ftfy.bad_codecs to see what we're doing here.

ftfy/chardata.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,12 @@ def _build_utf8_punct_regex():
6060
These are recognizable by the distinctive 'â€' ('\xe2\x80') sequence they
6161
all begin with when decoded as Windows-1252.
6262
"""
63-
# We're making a regex that has all the literal bytes from 0x80 to 0xbf in
64-
# a range. "Couldn't this have just said [\x80-\xbf]?", you might ask.
65-
# However, when we decode the regex as Windows-1252, the resulting
66-
# characters won't even be remotely contiguous.
67-
obvious_utf8 = ('â€['
68-
+ bytes(range(0x80, 0xc0)).decode('sloppy-windows-1252')
63+
# We need to recognize the Latin-1 decodings of bytes 0x80 to 0xbf, which
64+
# are a contiguous range, as well as the different Windows-1252 decodings
65+
# of 0x80 to 0x9f, which are not contiguous at all. (Latin-1 and
66+
# Windows-1252 agree on bytes 0xa0 and up.)
67+
obvious_utf8 = ('â[€\x80][\x80-\xbf'
68+
+ bytes(range(0x80, 0xa0)).decode('sloppy-windows-1252')
6969
+ ']')
7070
return re.compile(obvious_utf8)
7171
PARTIAL_UTF8_PUNCT_RE = _build_utf8_punct_regex()
@@ -120,6 +120,10 @@ def _build_utf8_punct_regex():
120120
SINGLE_QUOTE_RE = re.compile('[\u02bc\u2018-\u201b]')
121121
DOUBLE_QUOTE_RE = re.compile('[\u201c-\u201f]')
122122

123+
# This regex matches C1 control characters, which occupy some of the positions
124+
# in the Latin-1 character map that Windows assigns to other characters instead.
125+
C1_CONTROL_RE = re.compile(r'[\x80-\x9f]')
126+
123127

124128
def possible_encoding(text, encoding):
125129
"""

ftfy/fixes.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
from ftfy.chardata import (possible_encoding, CHARMAP_ENCODINGS,
1010
CONTROL_CHARS, LIGATURES, WIDTH_MAP,
1111
PARTIAL_UTF8_PUNCT_RE, ALTERED_UTF8_RE,
12-
LOSSY_UTF8_RE, SINGLE_QUOTE_RE, DOUBLE_QUOTE_RE)
12+
LOSSY_UTF8_RE, SINGLE_QUOTE_RE, DOUBLE_QUOTE_RE,
13+
C1_CONTROL_RE)
1314
from ftfy.badness import text_cost
1415
from html import entities
1516

@@ -633,7 +634,7 @@ def replace_lossy_sequences(byts):
633634
not be used, and this function will not be run, so your weird control
634635
character will be left alone but wacky fixes like this won't be possible.
635636
636-
This is used as a step within `fix_encoding`.
637+
This is used as a transcoder within `fix_encoding`.
637638
"""
638639
return LOSSY_UTF8_RE.sub('\ufffd'.encode('utf-8'), byts)
639640

@@ -644,16 +645,23 @@ def fix_partial_utf8_punct_in_1252(text):
644645
UTF-8 and decoded in Latin-1 or Windows-1252, even when this fix can't be
645646
consistently applied.
646647
647-
For this function, we assume the text has been decoded in Windows-1252.
648-
If it was decoded in Latin-1, we'll call this right after it goes through
649-
the Latin-1-to-Windows-1252 fixer.
648+
One form of inconsistency we need to deal with is that some characters might
649+
be from the Latin-1 C1 control character set, while others are from the
650+
set of characters that take their place in Windows-1252. So we first replace
651+
those characters, then apply a fix that only works on Windows-1252 characters.
650652
651-
This is used as a step within `fix_encoding`.
653+
This is used as a transcoder within `fix_encoding`.
652654
"""
653-
def replacement(match):
655+
def latin1_to_w1252(match):
656+
"The function to apply when this regex matches."
657+
return match.group(0).encode('latin-1').decode('sloppy-windows-1252')
658+
659+
def w1252_to_utf8(match):
654660
"The function to apply when this regex matches."
655661
return match.group(0).encode('sloppy-windows-1252').decode('utf-8')
656-
return PARTIAL_UTF8_PUNCT_RE.sub(replacement, text)
662+
663+
text = C1_CONTROL_RE.sub(latin1_to_w1252, text)
664+
return PARTIAL_UTF8_PUNCT_RE.sub(w1252_to_utf8, text)
657665

658666

659667
TRANSCODERS = {

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
setup(
2828
name="ftfy",
29-
version='5.4.1',
29+
version='5.5.0',
3030
maintainer='Luminoso Technologies, Inc.',
3131
maintainer_email='[email protected]',
3232
license="MIT",

tests/test_cases.json

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,20 @@
151151
"fixed": "Arsenal v Wolfsburg: pre-season friendly – live!",
152152
"expect": "pass"
153153
},
154+
{
155+
"label": "Inconsistent UTF-8 / Latin-1 mojibake",
156+
"original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099\u0085",
157+
"fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…",
158+
"fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…",
159+
"expect": "pass"
160+
},
161+
{
162+
"label": "Inconsistent UTF-8 / Latin-1 mojibake with an ellipsis from the Windows-1252 character set",
163+
"original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099…",
164+
"fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…",
165+
"fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…",
166+
"expect": "pass"
167+
},
154168
{
155169
"label": "Handle Afrikaans 'n character",
156170
"original": "ʼn Chloroplas is ʼn organel wat in fotosinterende plante voorkom.",
@@ -333,12 +347,19 @@
333347
"expect": "fail"
334348
},
335349
{
336-
"label": "Windows-1252 / MacRoman mixup in Spanish",
350+
"label": "Latin-1 / MacRoman mixup in Spanish",
337351
"comment": "Requires something like encoding detection",
338352
"original": "Deja dos heridos hundimiento de barco tur\u0092stico en Acapulco.",
339353
"fixed": "Deja dos heridos hundimiento de barco turístico en Acapulco.",
340354
"expect": "fail"
341355
},
356+
{
357+
"label": "Latin-1 / MacRoman mixup in Spanish, 2 characters",
358+
"comment": "Requires something like encoding detection",
359+
"original": "Habitantes de Coatl\u0087n conf\u0092an en proyecto de edil electo independiente",
360+
"fixed": "Habitantes de Coatlán confían en proyecto de edil electo independiente",
361+
"expect": "fail"
362+
},
342363
{
343364
"label": "UTF-8 / Windows-1251 mixup in tweet spam",
344365
"original": "Blog Traffic Tip 2 вЂ“ Broadcast Email Your Blog",

tests/test_cli.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ def test_same_file():
5353
with pytest.raises(subprocess.CalledProcessError) as exception:
5454
get_command_output(['ftfy', TEST_FILENAME, '-o', TEST_FILENAME])
5555
error = exception.value.output.decode('utf-8')
56-
assert error.startswith("ftfy error:\nCan't read and write the same file.")
56+
assert error.startswith("ftfy error:")
57+
assert "Can't read and write the same file" in error
5758

5859

5960
def test_stdin():

0 commit comments

Comments
 (0)