Skip to content

Commit 470e2ef

Browse files
committed
read_obo: default utf-8 encoding
closes #27
1 parent afaa34a commit 470e2ef

File tree

2 files changed

+18
-7
lines changed

2 files changed

+18
-7
lines changed

obonet/io.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import annotations
2+
13
import importlib
24
import io
35
import logging
@@ -6,12 +8,13 @@
68
from urllib.request import urlopen
79

810

9-
def open_read_file(path):
11+
def open_read_file(path, encoding: str | None = None):
1012
"""
1113
Return a file object from the path. Automatically detects and supports
1214
URLs and compression. If path is pathlike, it's converted to a string.
1315
If path is not a string nor pathlike, it's passed through without
14-
modification.
16+
modification. Use encoding to set the text character set encoding.
17+
Use `encoding=None` to use the platform-dependent default locale encoding.
1518
"""
1619
# Convert pathlike objects to string paths
1720
if hasattr(path, "__fspath__"):
@@ -29,16 +32,17 @@ def open_read_file(path):
2932
with urlopen(path) as response:
3033
content = response.read()
3134
if opener == io.open:
32-
encoding = response.headers.get_content_charset(failobj="utf-8")
35+
if not encoding:
36+
encoding = response.headers.get_content_charset(failobj="utf-8")
3337
logging.info(f"Will decode content from {path} using {encoding} charset.")
3438
text = content.decode(encoding)
3539
return io.StringIO(text)
3640
else:
3741
compressed_bytes = io.BytesIO(content)
38-
return opener(compressed_bytes, "rt")
42+
return opener(compressed_bytes, "rt", encoding=encoding)
3943

4044
# Read from file
41-
return opener(path, "rt")
45+
return opener(path, "rt", encoding=encoding)
4246

4347

4448
compression_to_module = {

obonet/read.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import annotations
2+
13
import itertools
24
import logging
35
import re
@@ -9,7 +11,9 @@
911
logger = logging.getLogger(__name__)
1012

1113

12-
def read_obo(path_or_file, ignore_obsolete=True):
14+
def read_obo(
15+
path_or_file, ignore_obsolete: bool = True, encoding: str | None = "utf-8"
16+
):
1317
"""
1418
Return a networkx.MultiDiGraph of the ontology serialized by the
1519
specified path or file.
@@ -25,8 +29,11 @@ def read_obo(path_or_file, ignore_obsolete=True):
2529
ignore_obsolete : boolean
2630
When true (default), terms that are marked 'is_obsolete' will
2731
not be added to the graph.
32+
encoding : str or None
33+
The character set encoding to use for path_or_file when path_or_file
34+
is a path/URL. Set to None for platform-dependent locale default.
2835
"""
29-
obo_file = open_read_file(path_or_file)
36+
obo_file = open_read_file(path_or_file, encoding=encoding)
3037
typedefs, terms, instances, header = get_sections(obo_file)
3138
obo_file.close()
3239

0 commit comments

Comments
 (0)