Skip to content

Commit 8bc0151

Browse files
authored
Update DefaultStacIO to fix parsing non-ascii in urls (#1566)
* Update `DefaultStacIO` to fix parsing non-ascii in urls * Update tests * Update changelog
1 parent 795befa commit 8bc0151

File tree

7 files changed

+126
-11
lines changed

7 files changed

+126
-11
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
- More permissive collection extent deserialization ([#1559](https://github.com/stac-utils/pystac/pull/1559))
1414
- Type of `proj:code` setter ([#1560](https://github.com/stac-utils/pystac/pull/1560))
15+
- Use `urllib3` to fix parsing of non-ASCII characters in URLs ([#1566](https://github.com/stac-utils/pystac/pull/1566))
1516

1617
## [v1.13.0] - 2025-04-15
1718

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ python -m pip install 'pystac[orjson]'
3535
```
3636

3737
If you would like to use a custom `RetryStacIO` class for automatically retrying
38-
network requests when reading with PySTAC, you'll need
38+
network requests when reading with PySTAC, or if you have non-ASCII characters in
39+
your URLs, you'll need
3940
[`urllib3`](https://urllib3.readthedocs.io/en/stable/):
4041

4142
```shell

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ dev = [
5454
"types-orjson>=3.6.2",
5555
"types-python-dateutil>=2.9.0.20241003",
5656
"types-urllib3>=1.26.25.14",
57+
"urllib3>=2.3.0",
5758
"virtualenv>=20.26.6",
5859
]
5960
docs = [

pystac/stac_io.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -286,8 +286,9 @@ def read_text_from_href(self, href: str) -> str:
286286
"""Reads file as a UTF-8 string.
287287
288288
If ``href`` has a "scheme" (e.g. if it starts with "https://") then this will
289-
use :func:`urllib.request.urlopen` to open the file and read the contents;
290-
otherwise, :func:`open` will be used to open a local file.
289+
use :func:`urllib.request.urlopen` (or :func:`urllib3.request` if available)
290+
to open the file and read the contents; otherwise, :func:`open` will be used
291+
to open a local file.
291292
292293
Args:
293294
@@ -297,9 +298,19 @@ def read_text_from_href(self, href: str) -> str:
297298
if _is_url(href):
298299
try:
299300
logger.debug(f"GET {href} Headers: {self.headers}")
300-
req = Request(href, headers=self.headers)
301-
with urlopen(req) as f:
302-
href_contents = f.read().decode("utf-8")
301+
if HAS_URLLIB3:
302+
with urllib3.request(
303+
"GET",
304+
href,
305+
headers=self.headers,
306+
preload_content=False, # type: ignore
307+
) as f:
308+
href_contents = f.read().decode("utf-8")
309+
else:
310+
req = Request(href, headers=self.headers)
311+
with urlopen(req) as f:
312+
href_contents = f.read().decode("utf-8")
313+
303314
except HTTPError as e:
304315
raise Exception(f"Could not read uri {href}") from e
305316
else:
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
interactions:
2+
- request:
3+
body: null
4+
headers: {}
5+
method: GET
6+
uri: https://capella-open-data.s3.us-west-2.amazonaws.com/stac/capella-open-data-by-capital/capella-open-data-mal%C3%A9/collection.json
7+
response:
8+
body:
9+
string: "{\n \"type\": \"Collection\",\n \"id\": \"capella-open-data-mal\\u00e9\",\n
10+
\ \"stac_version\": \"1.0.0\",\n \"description\": \"Capella Open Data Mal\\u00e9\",\n
11+
\ \"links\": [\n {\n \"rel\": \"root\",\n \"href\": \"../../catalog.json\",\n
12+
\ \"type\": \"application/json\",\n \"title\": \"Capella Open Data\"\n
13+
\ },\n {\n \"rel\": \"license\",\n \"href\": \"https://creativecommons.org/licenses/by/4.0/\",\n
14+
\ \"title\": \"CC BY 4.0\"\n },\n {\n \"rel\": \"item\",\n
15+
\ \"href\": \"../../capella-open-data-by-datetime/capella-open-data-2024/capella-open-data-2024-11/capella-open-data-2024-11-30/CAPELLA_C09_SP_GEO_HH_20241130164247_20241130164315/CAPELLA_C09_SP_GEO_HH_20241130164247_20241130164315.json\",\n
16+
\ \"type\": \"application/json\"\n },\n {\n \"rel\": \"item\",\n
17+
\ \"href\": \"../../capella-open-data-by-datetime/capella-open-data-2024/capella-open-data-2024-11/capella-open-data-2024-11-30/CAPELLA_C09_SP_GEC_HH_20241130164247_20241130164315/CAPELLA_C09_SP_GEC_HH_20241130164247_20241130164315.json\",\n
18+
\ \"type\": \"application/json\"\n },\n {\n \"rel\": \"item\",\n
19+
\ \"href\": \"../../capella-open-data-by-datetime/capella-open-data-2024/capella-open-data-2024-11/capella-open-data-2024-11-30/CAPELLA_C09_SP_SICD_HH_20241130164247_20241130164315/CAPELLA_C09_SP_SICD_HH_20241130164247_20241130164315.json\",\n
20+
\ \"type\": \"application/json\"\n },\n {\n \"rel\": \"item\",\n
21+
\ \"href\": \"../../capella-open-data-by-datetime/capella-open-data-2024/capella-open-data-2024-11/capella-open-data-2024-11-30/CAPELLA_C09_SP_SLC_HH_20241130164247_20241130164315/CAPELLA_C09_SP_SLC_HH_20241130164247_20241130164315.json\",\n
22+
\ \"type\": \"application/json\"\n },\n {\n \"rel\": \"parent\",\n
23+
\ \"href\": \"../catalog.json\",\n \"type\": \"application/json\",\n
24+
\ \"title\": \"By Capital\"\n }\n ],\n \"stac_extensions\": [\n \"https://stac-extensions.github.io/sat/v1.0.0/schema.json\",\n
25+
\ \"https://stac-extensions.github.io/view/v1.0.0/schema.json\",\n \"https://stac-extensions.github.io/processing/v1.1.0/schema.json\",\n
26+
\ \"https://stac-extensions.github.io/projection/v1.1.0/schema.json\"\n
27+
\ ],\n \"item_assets\": {\n \"HH\": {\n \"title\": \"SAR file\",\n
28+
\ \"type\": \"image/tiff; application=geotiff\",\n \"roles\": [\n
29+
\ \"data\"\n ],\n \"sar:polarizations\": [\n \"HH\"\n
30+
\ ]\n },\n \"VV\": {\n \"title\": \"SAR file\",\n \"type\":
31+
\"image/tiff; application=geotiff\",\n \"roles\": [\n \"data\"\n
32+
\ ],\n \"sar:polarizations\": [\n \"VV\"\n ]\n },\n
33+
\ \"thumbnail\": {\n \"title\": \"Thumbnail\",\n \"type\": \"image/png\",\n
34+
\ \"roles\": [\n \"thumbnail\"\n ]\n },\n \"preview\":
35+
{\n \"title\": \"Preview image\",\n \"type\": \"image/tiff; application=geotiff;
36+
profile=cloud-optimized\",\n \"roles\": [\n \"overview\"\n ]\n
37+
\ },\n \"metadata\": {\n \"title\": \"Extended metadata\",\n \"type\":
38+
\"application/json\",\n \"roles\": [\n \"metadata\"\n ]\n
39+
\ }\n },\n \"title\": \"Mal\\u00e9\",\n \"extent\": {\n \"spatial\":
40+
{\n \"bbox\": [\n [\n -180,\n -90,\n 180,\n
41+
\ 90\n ]\n ]\n },\n \"temporal\": {\n \"interval\":
42+
[\n [\n \"2020-03-30T00:00:00Z\",\n null\n ]\n
43+
\ ]\n }\n },\n \"license\": \"proprietary\",\n \"keywords\": [\n
44+
\ \"sar\"\n ],\n \"providers\": [\n {\n \"name\": \"Capella Space\",\n
45+
\ \"roles\": [\n \"licensor\",\n \"producer\",\n \"processor\"\n
46+
\ ],\n \"url\": \"https://www.capellaspace.com\"\n },\n {\n
47+
\ \"name\": \"AWS\",\n \"roles\": [\n \"host\"\n ],\n
48+
\ \"url\": \"http://www.amazonaws.com/\"\n }\n ],\n \"summaries\":
49+
{\n \"constellation\": [\n \"capella\"\n ],\n \"instruments\":
50+
[\n \"capella-radar-2\",\n \"capella-radar-3\",\n \"capella-radar-4\",\n
51+
\ \"capella-radar-5\",\n \"capella-radar-6\",\n \"capella-radar-7\",\n
52+
\ \"capella-radar-8\",\n \"capella-radar-9\",\n \"capella-radar-10\",\n
53+
\ \"capella-radar-11\",\n \"capella-radar-13\",\n \"capella-radar-14\",\n
54+
\ \"capella-radar-15\"\n ],\n \"sar:frequency_band\": [\n \"X\"\n
55+
\ ],\n \"sar:product_type\": [\n \"SLC\",\n \"GEO\",\n \"GEC\",\n
56+
\ \"SICD\",\n \"SIDD\",\n \"CPHD\"\n ],\n \"sar:instrument_mode\":
57+
[\n \"stripmap\",\n \"spotlight\",\n \"sliding_spotlight\"\n
58+
\ ],\n \"sar:observation_direction\": [\n \"left\",\n \"right\"\n
59+
\ ],\n \"sar:polarizations\": [\n \"HH\",\n \"VV\"\n ]\n
60+
\ }\n}"
61+
headers:
62+
Accept-Ranges:
63+
- bytes
64+
Content-Length:
65+
- '4384'
66+
Content-Type:
67+
- application/json
68+
Date:
69+
- Wed, 16 Jul 2025 14:27:08 GMT
70+
ETag:
71+
- '"3a7d18f018a5dc5e29af78ba91fb3a67"'
72+
Last-Modified:
73+
- Wed, 16 Jul 2025 07:07:12 GMT
74+
Server:
75+
- AmazonS3
76+
x-amz-id-2:
77+
- mS8cX+L1/uoMVhaC4ZTcrNLZb47iqTYwEXT52pFgbN9gFq3JRX17cTr8i0bTFOAaWWZ1DwnpogFTTAOAyjGj3Q==
78+
x-amz-request-id:
79+
- PWBXR28EN2M7DX2W
80+
x-amz-server-side-encryption:
81+
- AES256
82+
status:
83+
code: 200
84+
message: OK
85+
version: 1

tests/test_stac_io.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import pytest
88

99
import pystac
10+
import pystac.errors
1011
from pystac.stac_io import DefaultStacIO, DuplicateKeyReportingMixin, StacIO
1112
from tests.utils import TestCases
1213

@@ -116,20 +117,20 @@ class ReportingStacIO(DefaultStacIO, DuplicateKeyReportingMixin):
116117
assert str(excinfo.value), f'Found duplicate object name "key" in {src_href}'
117118

118119

119-
@unittest.mock.patch("pystac.stac_io.urlopen")
120-
def test_headers_stac_io(urlopen_mock: unittest.mock.MagicMock) -> None:
120+
@unittest.mock.patch("pystac.stac_io.urllib3.request")
121+
def test_headers_stac_io(request_mock: unittest.mock.MagicMock) -> None:
121122
stac_io = DefaultStacIO(headers={"Authorization": "api-key fake-api-key-value"})
122123

123124
catalog = pystac.Catalog("an-id", "a description").to_dict()
124125
# required until https://github.com/stac-utils/pystac/pull/896 is merged
125126
catalog["links"] = []
126-
urlopen_mock.return_value.__enter__.return_value.read.return_value = json.dumps(
127+
request_mock.return_value.__enter__.return_value.read.return_value = json.dumps(
127128
catalog
128129
).encode("utf-8")
129130
pystac.Catalog.from_file("https://example.com/catalog.json", stac_io=stac_io)
130131

131-
request_obj = urlopen_mock.call_args[0][0]
132-
assert request_obj.headers == stac_io.headers
132+
headers = request_mock.call_args[1]["headers"]
133+
assert headers == stac_io.headers
133134

134135

135136
@pytest.mark.vcr()
@@ -163,3 +164,16 @@ def test_save_http_href_errors(tmp_path: Path) -> None:
163164
catalog.set_self_href("http://pystac.test/catalog.json")
164165
with pytest.raises(NotImplementedError):
165166
catalog.save_object()
167+
168+
169+
@pytest.mark.vcr()
170+
def test_urls_with_non_ascii_characters() -> None:
171+
from pystac.stac_io import HAS_URLLIB3
172+
173+
url = "https://capella-open-data.s3.us-west-2.amazonaws.com/stac/capella-open-data-by-capital/capella-open-data-malé/collection.json"
174+
175+
if HAS_URLLIB3:
176+
pystac.Collection.from_file(url)
177+
else:
178+
with pytest.raises(pystac.STACError):
179+
pystac.Collection.from_file(url)

uv.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)