Skip to content

Commit 41c72f2

Browse files
authored
Start enforcing local storage to always use the UTF-8 encoding (#87)
This fixes a bug reported on Discord: a user on Windows could not load datasets from the `storage` directory when some items contained special characters, because the items were saved with the `utf-8` encoding but read back with the default Windows encoding, `cp1252`. This change forces `utf-8` to be used everywhere.
1 parent a782693 commit 41c72f2

File tree

12 files changed

+22
-17
lines changed

12 files changed

+22
-17
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@ Changelog
88

99
- option to add event handlers which accept no arguments
1010

11+
### Fixed
12+
13+
- started enforcing local storage to always use the UTF-8 encoding
14+
1115
[1.0.0](../../releases/tag/v1.0.0) - 2022-03-13
1216
-----------------------------------------------
1317

docs/res/format_docs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
subs.append((fr'`({custom_type})\.([A-Z_]+)`', lambda match: f'[{match.group(0)}](#{match.group(1).lower()}-{match.group(2).lower()})'))
3737

3838
# Load the api_reference.md generated by Sphinx
39-
with open('api_reference.md', 'r+') as api_reference:
39+
with open('api_reference.md', 'r+', encoding='utf-8') as api_reference:
4040
api_reference_content = api_reference.read()
4141

4242
# Do the above defined replacements

scripts/check_version_in_changelog.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
if not CHANGELOG_PATH.is_file():
1414
raise RuntimeError('Unable to find CHANGELOG.md file')
1515

16-
with open(CHANGELOG_PATH) as changelog_file:
16+
with open(CHANGELOG_PATH, encoding='utf-8') as changelog_file:
1717
for line in changelog_file:
1818
# The heading for the changelog entry for the given version can start with either the version number, or the version number in a link
1919
if re.match(fr'\[?{current_package_version}([\] ]|$)', line):

scripts/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
# Load the current version number from src/package_name/_version.py
99
# It is on a line in the format __version__ = 1.2.3
1010
def get_current_package_version() -> str:
11-
with open(VERSION_FILE_PATH, 'r') as version_file:
11+
with open(VERSION_FILE_PATH, 'r', encoding='utf-8') as version_file:
1212
for line in version_file:
1313
if line.startswith('__version__'):
1414
delim = '"' if '"' in line else "'"
@@ -21,7 +21,7 @@ def get_current_package_version() -> str:
2121
# Write the given version number to src/package_name/_version.py
2222
# It replaces the version number on the line with the format __version__ = 1.2.3
2323
def set_current_package_version(version: str) -> None:
24-
with open(VERSION_FILE_PATH, 'r+') as version_file:
24+
with open(VERSION_FILE_PATH, 'r+', encoding='utf-8') as version_file:
2525
updated_version_file_lines = []
2626
version_string_found = False
2727
for line in version_file:

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@
7676
'flake8-comprehensions ~= 3.10.1',
7777
'flake8-datetimez ~= 20.10.0',
7878
'flake8-docstrings ~= 1.7.0',
79+
'flake8-encodings ~= 0.5.0',
7980
'flake8-isort ~= 6.0.0',
8081
'flake8-noqa ~= 1.3.0',
8182
'flake8-pytest-style ~= 1.7.2',

src/apify/_memory_storage/resource_clients/base_resource_client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def _find_or_create_client_by_id_or_name(
101101
metadata_path = os.path.join(entry.path, '__metadata__.json')
102102
if not os.access(metadata_path, os.F_OK):
103103
continue
104-
with open(metadata_path) as metadata_file:
104+
with open(metadata_path, encoding='utf-8') as metadata_file:
105105
metadata = json.load(metadata_file)
106106
if id and id == metadata.get('id'):
107107
storage_path = entry.path

src/apify/_memory_storage/resource_clients/dataset.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,7 @@ def _create_from_directory(
412412
has_seen_metadata_file = True
413413

414414
# We have found the dataset's metadata file, build out information based on it
415-
with open(os.path.join(storage_directory, entry.name)) as f:
415+
with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f:
416416
metadata = json.load(f)
417417
id = metadata['id']
418418
name = metadata['name']
@@ -423,7 +423,7 @@ def _create_from_directory(
423423

424424
continue
425425

426-
with open(os.path.join(storage_directory, entry.name)) as f:
426+
with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f:
427427
entry_content = json.load(f)
428428
entry_name = entry.name.split('.')[0]
429429

src/apify/_memory_storage/resource_clients/key_value_store.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -383,7 +383,7 @@ def _create_from_directory(
383383
if entry.is_file():
384384
if entry.name == '__metadata__.json':
385385
# We have found the store metadata file, build out information based on it
386-
with open(os.path.join(storage_directory, entry.name), encoding='utf8') as f:
386+
with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f:
387387
metadata = json.load(f)
388388
id = metadata['id']
389389
name = metadata['name']
@@ -395,7 +395,7 @@ def _create_from_directory(
395395

396396
if '.__metadata__.' in entry.name:
397397
# This is an entry's metadata file, we can use it to create/extend the record
398-
with open(os.path.join(storage_directory, entry.name), encoding='utf8') as f:
398+
with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f:
399399
metadata = json.load(f)
400400

401401
new_record = {
@@ -429,7 +429,7 @@ def _create_from_directory(
429429
elif 'application/json' in content_type:
430430
try:
431431
# Try parsing the JSON ahead of time (not ideal but solves invalid files being loaded into stores)
432-
json.loads(file_content)
432+
json.loads(file_content.decode('utf-8'))
433433
except json.JSONDecodeError:
434434
# We need to override and then restore the warnings filter so that the warning gets printed out,
435435
# Otherwise it would be silently swallowed

src/apify/_memory_storage/resource_clients/request_queue.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -423,7 +423,7 @@ def _create_from_directory(
423423
if entry.is_file():
424424
if entry.name == '__metadata__.json':
425425
# We have found the queue's metadata file, build out information based on it
426-
with open(os.path.join(storage_directory, entry.name)) as f:
426+
with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f:
427427
metadata = json.load(f)
428428
id = metadata['id']
429429
name = metadata['name']
@@ -435,7 +435,7 @@ def _create_from_directory(
435435

436436
continue
437437

438-
with open(os.path.join(storage_directory, entry.name)) as f:
438+
with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f:
439439
request = json.load(f)
440440
if request.get('orderNo'):
441441
request['orderNo'] = Decimal(request.get('orderNo'))

src/apify/_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ def _is_file_or_bytes(value: Any) -> bool:
323323

324324
def _maybe_parse_body(body: bytes, content_type: str) -> Any:
325325
if _is_content_type_json(content_type):
326-
return json.loads(body) # Returns any
326+
return json.loads(body.decode('utf-8')) # Returns any
327327
elif _is_content_type_xml(content_type) or _is_content_type_text(content_type):
328328
return body.decode('utf-8')
329329
return body

0 commit comments

Comments
 (0)