Skip to content

Commit 048ba70

Browse files
committed
Fix unicode decode errors.
1 parent e9d9e35 commit 048ba70

File tree

4 files changed

+170
-8
lines changed

4 files changed

+170
-8
lines changed

cylc/uiserver/review.py

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import jinja2
3030
from jinja2 import select_autoescape
3131
import json
32+
import markupsafe
3233
import mimetypes
3334
import os
3435
import pwd
@@ -41,6 +42,8 @@
4142
import traceback
4243
from urllib.parse import quote
4344

45+
from typing import TYPE_CHECKING, Union
46+
4447
from cylc.flow import __version__ as CYLC_VERSION
4548
from cylc.flow.hostuserutil import get_host
4649
from cylc.flow.task_state import TASK_STATUSES_ORDERED
@@ -49,6 +52,10 @@
4952
from cylc.uiserver.ws import get_util_home
5053

5154

55+
if TYPE_CHECKING:
56+
from io import TextIOWrapper
57+
58+
5259
# Cylc 7 Task states.
5360
CYLC7_TASK_STATUSES_ORDERED = [
5461
'runahead',
@@ -517,7 +524,7 @@ def suites(
517524
continue
518525
with contextlib.suppress(OSError):
519526
data["entries"].append({
520-
"name": str(item),
527+
"name": item.encode().decode('UTF-8', errors='replace'),
521528
"info": {},
522529
"last_activity_time": (
523530
self.get_last_activity_time(user, item))})
@@ -548,8 +555,9 @@ def suites(
548555
rosie_info = {}
549556
with contextlib.suppress(IOError):
550557
for line in open( # noqa: SIM115
551-
rosie_suite_info, 'r'
558+
rosie_suite_info, 'rb'
552559
).readlines():
560+
line = line.decode('utf-8', errors='replace')
553561
if not line.strip().startswith('#') and '=' in line:
554562
rosie_key, rosie_val = line.strip().split("=", 1)
555563
if rosie_key in ("project", "title"):
@@ -576,8 +584,8 @@ def get_file(self, user, suite, path, path_in_tar=None, mode=None):
576584
except KeyError:
577585
raise cherrypy.HTTPError(404) from None
578586
f_size = tar_info.size
579-
handle = tar_f.extractfile(path_in_tar)
580-
if handle.read(2) == "#!":
587+
handle = tar_f.extractfile(path_in_tar, 'rb')
588+
if self._is_shebang(handle):
581589
mime = self.MIME_TEXT_PLAIN
582590
else:
583591
mime = mimetypes.guess_type(
@@ -601,10 +609,10 @@ def get_file(self, user, suite, path, path_in_tar=None, mode=None):
601609
return cherrypy.lib.static.serve_file(temp_f.name, mime)
602610
finally:
603611
temp_f.close()
604-
text = handle.read()
612+
text = self._get_file_text(handle)
605613
else:
606614
f_size = os.stat(f_name).st_size
607-
if open(f_name).read(2) == "#!": # noqa: SIM115
615+
if self._is_shebang(f_name):
608616
mime = self.MIME_TEXT_PLAIN
609617
else:
610618
mime = mimetypes.guess_type(quote(f_name))[0]
@@ -618,7 +626,7 @@ def get_file(self, user, suite, path, path_in_tar=None, mode=None):
618626
):
619627
cherrypy.response.headers["Content-Type"] = mime
620628
return cherrypy.lib.static.serve_file(f_name, mime)
621-
text = open(f_name).read() # noqa: SIM115
629+
text = self._get_file_text(f_name)
622630
try:
623631
text = str(text)
624632
if mode in [None, "text"]:
@@ -627,7 +635,7 @@ def get_file(self, user, suite, path, path_in_tar=None, mode=None):
627635
# escape future modifications to this string. In order to
628636
# allow log file syntax highlighting (DEBUG, INFO, etc) we
629637
# must cast this back to a str to remove this functionality.
630-
text = str(jinja2.escape(text))
638+
text = str(markupsafe.escape(text))
631639
lines = text.splitlines()
632640
except ValueError:
633641
if path_in_tar:
@@ -1097,3 +1105,28 @@ def _get_user_suite_dir(self, user, suite, *paths):
10971105
def _sort_summary_entries(suite1):
10981106
"""Sort suites by last_activity_time."""
10991107
return suite1.get("last_activity_time") or suite1["name"]
1108+
1109+
@staticmethod
1110+
def _has_shebang(file: Union[str, TextIOWrapper]) -> bool:
1111+
"""File (or handle) has shebang"""
1112+
# It's a filepath
1113+
if isinstance(file, str):
1114+
with open(file, 'rb') as handle:
1115+
return handle.read(2) == b'!#'
1116+
1117+
# It's a file handle: Make sure that you close it later!
1118+
first_chars = file.read(2)
1119+
return first_chars in [b'!#', '!#']
1120+
1121+
@staticmethod
1122+
def _get_file_text(file: Union[str, TextIOWrapper]) -> str:
1123+
"""File (or handle) has shebang"""
1124+
if isinstance(file, str):
1125+
with open(file, 'rb') as handle:
1126+
return handle.read().decode('utf-8', errors='replace')
1127+
1128+
# It's a file handle: Make sure that you close it later!
1129+
filecontent = file.read()
1130+
if isinstance(filecontent, bytes):
1131+
return filecontent.decode('utf-8', errors='replace')
1132+
return filecontent

tests/unit/test_review.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE.
2+
# Copyright (C) NIWA & British Crown (Met Office) & Contributors.
3+
#
4+
# This program is free software: you can redistribute it and/or modify
5+
# it under the terms of the GNU General Public License as published by
6+
# the Free Software Foundation, either version 3 of the License, or
7+
# (at your option) any later version.
8+
#
9+
# This program is distributed in the hope that it will be useful,
10+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
# GNU General Public License for more details.
13+
#
14+
# You should have received a copy of the GNU General Public License
15+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
16+
17+
from io import BytesIO, StringIO
18+
import pytest
19+
20+
from cylc.uiserver.review import CylcReviewService
21+
22+
23+
@pytest.fixture(scope='module')
24+
def get_garbage_file(tmp_path_factory: pytest.TempdirFactory) -> str:
25+
"""Create a file containing a byte which cannot be interpreted as UTF-8
26+
27+
Check that attempting to read this file will lead to an error without
28+
extra handling.
29+
"""
30+
fpath = tmp_path_factory.mktemp("temp") / 'garbage'
31+
fpath.write_bytes(b'\xe9')
32+
33+
# Assert that the string is garbage if it's interpreted as utf-8:
34+
with pytest.raises(UnicodeDecodeError):
35+
fpath.read_text()
36+
37+
return str(fpath)
38+
39+
40+
def test_has_shebang_handle_contains_garbage():
41+
"""Test is_shebang method for file handle input."""
42+
handle = BytesIO(b'\xe9')
43+
assert CylcReviewService._has_shebang(handle) is False
44+
45+
46+
def test_has_shebang_handle_is_true():
47+
"""It recognizes shebangs in text and binary files."""
48+
handle = StringIO('!#HelloWorld')
49+
assert CylcReviewService._has_shebang(handle) is True
50+
handle = BytesIO(b'!#HelloWorld')
51+
assert CylcReviewService._has_shebang(handle) is True
52+
53+
54+
def test_has_shebang_file_bin_bad(get_garbage_file):
55+
"""Test is_shebang method for file path input.
56+
57+
Additionally, test a case where there is a non-utf-8 char at the
58+
start of the file.
59+
"""
60+
assert CylcReviewService._has_shebang(get_garbage_file) is False
61+
62+
63+
def test_get_file_text_nasty(get_garbage_file):
64+
"""_get_file_text replaces non UTF-8 chars with question marks."""
65+
res = CylcReviewService()._get_file_text(get_garbage_file)
66+
assert res == '�'

tests/unit/test_ws.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE.
2+
# Copyright (C) NIWA & British Crown (Met Office) & Contributors.
3+
#
4+
# This program is free software: you can redistribute it and/or modify
5+
# it under the terms of the GNU General Public License as published by
6+
# the Free Software Foundation, either version 3 of the License, or
7+
# (at your option) any later version.
8+
#
9+
# This program is distributed in the hope that it will be useful,
10+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
# GNU General Public License for more details.
13+
#
14+
# You should have received a copy of the GNU General Public License
15+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
16+
17+
"""Unit testing for web service functionality
18+
"""
19+
from contextlib import contextmanager
20+
import pytest
21+
from types import SimpleNamespace
22+
23+
from cylc.uiserver.ws import get_review_service_config
24+
25+
26+
def _mock_bind(input_):
27+
"""Mock the socket.bind interface with simulating having a range
28+
of bad ports.
29+
"""
30+
_, port = input_
31+
if port in list(range(8000, 8002)):
32+
raise Exception(f'Port is {port}')
33+
return True
34+
35+
36+
@contextmanager
37+
def _mock_socket(*args):
38+
"""Mock socket.socket context object
39+
"""
40+
yield SimpleNamespace(bind=_mock_bind)
41+
42+
43+
@pytest.mark.parametrize(
44+
'inputs, port',
45+
(
46+
(((8000, 8003), None), 8002),
47+
(((8000, 8003), 'gerbil/wheel'), 8002),
48+
)
49+
)
50+
def test_get_review_service_config(monkeypatch, inputs, port):
51+
monkeypatch.setattr('socket.socket', _mock_socket)
52+
result = get_review_service_config(*inputs)
53+
assert result['command'][3] == f'--port={port}'
54+
assert result['command'][4] == f'--service-root={inputs[1]}'
55+
assert result['url'] == f"http://127.0.0.1:{port}/"
56+
57+
58+
def test_get_review_service_config_no_port(monkeypatch):
59+
inputs = ((8000, 8001), None)
60+
monkeypatch.setattr('socket.socket', _mock_socket)
61+
with pytest.raises(Exception, match='bleugh'):
62+
get_review_service_config(*inputs)

tox.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ exclude=
1818
.eggs,
1919
cylc/uiserver/jupyter_config.py,
2020
*.svg,
21+
./src
2122
paths =
2223
./cylc/uiserver
2324
./tests/integration

0 commit comments

Comments
 (0)