Skip to content

Commit 9ba183f

Browse files
committed
Fix unicode decode errors.
1 parent e9d9e35 commit 9ba183f

File tree

3 files changed

+102
-8
lines changed

3 files changed

+102
-8
lines changed

cylc/uiserver/review.py

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import jinja2
3030
from jinja2 import select_autoescape
3131
import json
32+
import markupsafe
3233
import mimetypes
3334
import os
3435
import pwd
@@ -517,7 +518,7 @@ def suites(
517518
continue
518519
with contextlib.suppress(OSError):
519520
data["entries"].append({
520-
"name": str(item),
521+
"name": item.encode().decode('UTF-8', errors='replace'),
521522
"info": {},
522523
"last_activity_time": (
523524
self.get_last_activity_time(user, item))})
@@ -548,8 +549,9 @@ def suites(
548549
rosie_info = {}
549550
with contextlib.suppress(IOError):
550551
for line in open( # noqa: SIM115
551-
rosie_suite_info, 'r'
552+
rosie_suite_info, 'rb'
552553
).readlines():
554+
line = line.decode('utf-8', errors='replace')
553555
if not line.strip().startswith('#') and '=' in line:
554556
rosie_key, rosie_val = line.strip().split("=", 1)
555557
if rosie_key in ("project", "title"):
@@ -576,8 +578,8 @@ def get_file(self, user, suite, path, path_in_tar=None, mode=None):
576578
except KeyError:
577579
raise cherrypy.HTTPError(404) from None
578580
f_size = tar_info.size
579-
handle = tar_f.extractfile(path_in_tar)
580-
if handle.read(2) == "#!":
581+
handle = tar_f.extractfile(path_in_tar, 'rb')
582+
if self._is_shebang(handle):
581583
mime = self.MIME_TEXT_PLAIN
582584
else:
583585
mime = mimetypes.guess_type(
@@ -601,10 +603,10 @@ def get_file(self, user, suite, path, path_in_tar=None, mode=None):
601603
return cherrypy.lib.static.serve_file(temp_f.name, mime)
602604
finally:
603605
temp_f.close()
604-
text = handle.read()
606+
text = self._get_file_text(handle)
605607
else:
606608
f_size = os.stat(f_name).st_size
607-
if open(f_name).read(2) == "#!": # noqa: SIM115
609+
if self._is_shebang(f_name):
608610
mime = self.MIME_TEXT_PLAIN
609611
else:
610612
mime = mimetypes.guess_type(quote(f_name))[0]
@@ -618,7 +620,7 @@ def get_file(self, user, suite, path, path_in_tar=None, mode=None):
618620
):
619621
cherrypy.response.headers["Content-Type"] = mime
620622
return cherrypy.lib.static.serve_file(f_name, mime)
621-
text = open(f_name).read() # noqa: SIM115
623+
text = self._get_file_text(f_name)
622624
try:
623625
text = str(text)
624626
if mode in [None, "text"]:
@@ -627,7 +629,7 @@ def get_file(self, user, suite, path, path_in_tar=None, mode=None):
627629
# escape future modifications to this string. In order to
628630
# allow log file syntax highlighting (DEBUG, INFO, etc) we
629631
# must cast this back to a str to remove this functionality.
630-
text = str(jinja2.escape(text))
632+
text = str(markupsafe.escape(text))
631633
lines = text.splitlines()
632634
except ValueError:
633635
if path_in_tar:
@@ -1097,3 +1099,28 @@ def _get_user_suite_dir(self, user, suite, *paths):
10971099
def _sort_summary_entries(suite1):
10981100
"""Sort suites by last_activity_time."""
10991101
return suite1.get("last_activity_time") or suite1["name"]
1102+
1103+
@staticmethod
1104+
def _has_shebang(file):
1105+
"""File (or handle) has shebang"""
1106+
# It's a filepath
1107+
if isinstance(file, str):
1108+
with open(file, 'rb') as handle:
1109+
return handle.read(2) == b'!#'
1110+
1111+
# It's a file handle: Make sure that you close it later!
1112+
first_chars = file.read(2)
1113+
return first_chars in [b'!#', '!#']
1114+
1115+
@staticmethod
1116+
def _get_file_text(file):
1117+
"""File (or handle) has shebang"""
1118+
if isinstance(file, str):
1119+
with open(file, 'rb') as handle:
1120+
return handle.read().decode('utf-8', errors='replace')
1121+
1122+
# It's a file handle: Make sure that you close it later!
1123+
filecontent = file.read()
1124+
if isinstance(filecontent, bytes):
1125+
return filecontent.decode('utf-8', errors='replace')
1126+
return filecontent

tests/unit/test_review.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE.
2+
# Copyright (C) NIWA & British Crown (Met Office) & Contributors.
3+
#
4+
# This program is free software: you can redistribute it and/or modify
5+
# it under the terms of the GNU General Public License as published by
6+
# the Free Software Foundation, either version 3 of the License, or
7+
# (at your option) any later version.
8+
#
9+
# This program is distributed in the hope that it will be useful,
10+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
# GNU General Public License for more details.
13+
#
14+
# You should have received a copy of the GNU General Public License
15+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
16+
17+
from io import BytesIO, StringIO
18+
import pytest
19+
20+
from cylc.uiserver.review import CylcReviewService
21+
22+
23+
@pytest.fixture(scope='module')
24+
def get_garbage_file(tmp_path_factory: pytest.TempdirFactory) -> str:
25+
"""Create a file containing a byte which cannot be interpreted as UTF-8
26+
27+
Check that attempting to read this file will lead to an error without
28+
extra handling.
29+
"""
30+
fpath = tmp_path_factory.mktemp("temp") / 'garbage'
31+
fpath.write_bytes(b'\xe9')
32+
33+
# Assert that the string is garbage if it's interpreted as utf-8:
34+
with pytest.raises(UnicodeDecodeError):
35+
fpath.read_text()
36+
37+
return str(fpath)
38+
39+
40+
def test_has_shebang_handle_contains_garbage():
41+
"""Test is_shebang method for file handle input."""
42+
handle = BytesIO(b'\xe9')
43+
assert CylcReviewService._has_shebang(handle) is False
44+
45+
46+
def test_has_shebang_handle_is_true():
47+
"""It recognizes shebangs in text and binary files."""
48+
handle = StringIO('!#HelloWorld')
49+
assert CylcReviewService._has_shebang(handle) is True
50+
handle = BytesIO(b'!#HelloWorld')
51+
assert CylcReviewService._has_shebang(handle) is True
52+
53+
54+
def test_has_shebang_file_bin_bad(get_garbage_file):
55+
"""Test is_shebang method for file path input.
56+
57+
Additionally, test a case where there is a non-utf-8 char at the
58+
start of the file.
59+
"""
60+
assert CylcReviewService._has_shebang(get_garbage_file) is False
61+
62+
63+
def test_get_file_text_nasty(get_garbage_file):
64+
"""_get_file_text replaces non UTF-8 chars with question marks."""
65+
res = CylcReviewService()._get_file_text(get_garbage_file)
66+
assert res == '�'

tox.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ exclude=
1818
.eggs,
1919
cylc/uiserver/jupyter_config.py,
2020
*.svg,
21+
./src
2122
paths =
2223
./cylc/uiserver
2324
./tests/integration

0 commit comments

Comments
 (0)