Skip to content

Commit 391b9bc

Browse files
committed
Add CKAN content provider
1 parent a20dd1c commit 391b9bc

File tree

5 files changed

+164
-1
lines changed

5 files changed

+164
-1
lines changed

docs/source/usage.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ where ``<source-repository>`` is:
3939

4040
* a URL of a Git repository (``https://github.com/binder-examples/requirements``),
4141
* a Zenodo DOI (``10.5281/zenodo.1211089``),
42-
* a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``), or
42+
* a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``),
43+
* a URL of a CKAN_ dataset (``https://demo.ckan.org/dataset/sample-dataset-1``), or
4344
* a path to a local directory (``a/local/directory``)
4445

4546
of the source repository you want to build.
@@ -136,3 +137,4 @@ Command line API
136137

137138
.. _Pytudes: https://github.com/norvig/pytudes
138139
.. _SWHID: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
140+
.. _CKAN: https://ckan.org

repo2docker/app.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ def _default_log_level(self):
152152
contentproviders.Dataverse,
153153
contentproviders.Hydroshare,
154154
contentproviders.Swhid,
155+
contentproviders.CKAN,
155156
contentproviders.Mercurial,
156157
contentproviders.Git,
157158
],

repo2docker/contentproviders/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from .base import Local
2+
from .ckan import CKAN
23
from .dataverse import Dataverse
34
from .figshare import Figshare
45
from .git import Git

repo2docker/contentproviders/ckan.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import re
2+
from datetime import datetime, timedelta, timezone
3+
from os import path
4+
from urllib.parse import urlparse
5+
6+
from requests import Session
7+
8+
from .. import __version__
9+
from .base import ContentProvider
10+
11+
12+
class CKAN(ContentProvider):
    """Provide contents of a remote CKAN dataset.

    A dataset is identified by a URL whose path ends in ``/dataset/<id>``.
    The matching CKAN action API is assumed to live at ``/api/3/action/``
    under whatever path prefix precedes ``/dataset/``.
    """

    # Trailing ``/dataset/<id>`` segment of a dataset URL. The hyphen is
    # escaped once: this is a raw string, so a doubled backslash would also
    # admit a literal backslash into the character class.
    url_regex = r"/dataset/[a-z0-9_\-]*$"

    # Accepted layouts of ``metadata_modified``. The second covers timestamps
    # serialised without fractional seconds.
    _date_formats = ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S")

    def __init__(self):
        super().__init__()
        # One shared session so every request carries the repo2docker
        # user agent.
        self.session = Session()
        self.session.headers.update(
            {
                "user-agent": f"repo2docker {__version__}",
            }
        )

    def _fetch_version(self, api_url):
        """Fetch dataset modified date and convert to epoch.

        Borrowed from the Hydroshare provider.

        Reads ``self.dataset_id``, so it must be set before calling.

        Parameters:
            api_url: base URL of the CKAN action API, ending in ``/``.

        Returns:
            The ``metadata_modified`` timestamp, interpreted as UTC, as a
            string of whole epoch seconds.

        Raises:
            requests.HTTPError: if ``package_show`` answers with an error
                status.
            ValueError: if the timestamp matches none of the known layouts.
        """
        package_show_url = f"{api_url}package_show?id={self.dataset_id}"
        resp = self.urlopen(package_show_url)
        # Fail with the HTTP error rather than a confusing KeyError below.
        resp.raise_for_status()
        date = resp.json()["result"]["metadata_modified"]

        for date_format in self._date_formats:
            try:
                parsed_date = datetime.strptime(date, date_format)
            except ValueError:
                continue
            break
        else:
            raise ValueError(f"Unrecognised CKAN metadata_modified value: {date!r}")

        epoch = parsed_date.replace(tzinfo=timezone.utc).timestamp()
        # truncate the timestamp
        return str(int(epoch))

    def _request(self, url, **kwargs):
        """Issue a GET through the shared session and return the response."""
        return self.session.get(url, **kwargs)

    urlopen = _request

    def detect(self, source, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a CKAN dataset.

        Parameters:
            source: candidate URL of a CKAN dataset.
            ref: unused by this provider.
            extra_args: unused by this provider.

        Returns:
            A spec dict with ``dataset_id``, ``api_url`` and ``version`` when
            ``source`` looks like a dataset URL and the host's ``status_show``
            endpoint answers 200; ``None`` otherwise.
        """
        parsed_url = urlparse(source)
        if not parsed_url.netloc:
            return None

        # Only URLs ending in ``/dataset/<id>`` can be CKAN datasets. Bail
        # out before any network traffic so unrelated hosts are never probed.
        if re.search(self.url_regex, parsed_url.path) is None:
            return None

        dataset_id = parsed_url.path.rsplit("/", maxsplit=1)[1]
        if not dataset_id:
            # ``.../dataset/`` with nothing after it names no dataset.
            return None

        # Drop query and fragment: they belong to the dataset page and would
        # otherwise end up in the middle of every API URL built from this.
        api_url = parsed_url._replace(
            path=re.sub(self.url_regex, "/api/3/action/", parsed_url.path),
            query="",
            fragment="",
        ).geturl()

        resp = self.urlopen(f"{api_url}status_show")
        if resp.status_code != 200:
            return None

        # _fetch_version reads self.dataset_id, so assign it first.
        self.dataset_id = dataset_id
        self.version = self._fetch_version(api_url)
        return {
            "dataset_id": self.dataset_id,
            "api_url": api_url,
            "version": self.version,
        }

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch a CKAN dataset.

        Generator: downloads every resource listed by ``package_show`` into
        ``output_dir`` and yields progress messages along the way.

        Parameters:
            spec: dict with ``dataset_id`` and ``api_url`` (as returned by
                ``detect``).
            output_dir: existing directory the resource files are written to.
            yield_output: accepted for interface compatibility; not used.

        Raises:
            requests.HTTPError: if the dataset lookup or a resource download
                answers with an error status.
        """
        dataset_id = spec["dataset_id"]

        yield f"Fetching CKAN dataset {dataset_id}.\n"
        package_show_url = f"{spec['api_url']}package_show?id={dataset_id}"
        resp = self.urlopen(
            package_show_url,
            headers={"accept": "application/json"},
        )
        # Surface a failed lookup as an HTTP error, not a later KeyError.
        resp.raise_for_status()
        dataset = resp.json()

        yield "Fetching CKAN resources.\n"

        for resource in dataset["result"]["resources"]:
            file_url = resource["url"]
            # Name the file after the last URL segment; fall back to the
            # resource id when the URL ends in a slash.
            fname = file_url.rsplit("/", maxsplit=1)[-1]
            if fname == "":
                fname = resource["id"]

            yield f"Requesting {file_url}\n"
            file_resp = self._request(file_url, stream=True)
            try:
                file_resp.raise_for_status()

                dst_fname = path.join(output_dir, fname)
                with open(dst_fname, "wb") as dst:
                    yield f"Fetching {fname}\n"
                    for chunk in file_resp.iter_content(chunk_size=None):
                        dst.write(chunk)
            finally:
                # Streamed responses hold their connection until closed.
                file_resp.close()

    @property
    def content_id(self):
        """A unique ID to represent the version of the content."""
        return f"{self.dataset_id}.v{self.version}"
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import os
2+
from contextlib import contextmanager
3+
from tempfile import NamedTemporaryFile, TemporaryDirectory
4+
5+
import pytest
6+
7+
from repo2docker.contentproviders import CKAN
8+
9+
# Module-level provider instance. NOTE(review): the tests visible in this
# module build their own CKAN() objects and do not reference this one.
test_ckan = CKAN()
# Parametrisation data for the detect() test: each entry pairs a list of
# source URLs with the spec dict detect() is expected to return for them.
test_hosts = [
    (
        [
            "http://demo.ckan.org/dataset/sample-dataset-1",
        ],
        {
            "dataset_id": "sample-dataset-1",
            "api_url": "http://demo.ckan.org/api/3/action/",
            # Dataset version as a string of epoch seconds.
            "version": "1707387710",
        },
    )
]
22+
23+
24+
@pytest.mark.parametrize("test_input, expected", test_hosts)
def test_detect_ckan(test_input, expected, requests_mock):
    """detect() returns the expected spec for CKAN URLs and None otherwise.

    All HTTP traffic is mocked so the test depends neither on network access
    nor on the current modification time of a remote dataset.
    """
    import re
    from datetime import datetime, timezone

    # Catch-all registered first: requests_mock tries the most recently
    # registered matcher first, so the specific endpoints below win and
    # every other URL looks like a host without a CKAN API.
    requests_mock.get(re.compile(r".*"), status_code=404)

    # Derive the mocked timestamp from the expected version so the two
    # cannot drift apart.
    modified = datetime.fromtimestamp(int(expected["version"]), tz=timezone.utc)
    requests_mock.get(f"{expected['api_url']}status_show", status_code=200)
    requests_mock.get(
        f"{expected['api_url']}package_show?id={expected['dataset_id']}",
        json={
            "result": {
                "metadata_modified": modified.strftime("%Y-%m-%dT%H:%M:%S.%f")
            }
        },
    )

    assert CKAN().detect(test_input[0]) == expected

    # Don't trigger the CKAN content provider
    assert CKAN().detect("/some/path/here") is None
    assert CKAN().detect("https://example.com/path/here") is None
    assert CKAN().detect("https://data.gov.tw/dataset/6564") is None
32+
33+
34+
@contextmanager
def ckan_file():
    """Yield the path of a temporary file holding ``b"some content"``.

    The file exists only inside the ``with`` block; it is deleted when the
    context exits.
    """
    with NamedTemporaryFile() as file:
        file.write(b"some content")
        # Push the bytes out of the write buffer so code that opens the
        # path separately actually sees them.
        file.flush()
        yield file.name
39+
40+
41+
def test_ckan_fetch(requests_mock):
    """fetch() downloads each listed resource into the output directory."""
    with ckan_file() as ckan_path:
        resource_url = f"file://{ckan_path}"
        mock_response = {"result": {"resources": [{"url": resource_url}]}}
        requests_mock.get(
            "http://demo.ckan.org/api/3/action/package_show?id=1234", json=mock_response
        )
        # Read through a context manager so the handle is closed promptly.
        with open(ckan_path, "rb") as src:
            served = src.read()
        requests_mock.get(resource_url, content=served)

        with TemporaryDirectory() as d:
            ckan = CKAN()
            spec = {
                "dataset_id": "1234",
                "api_url": "http://demo.ckan.org/api/3/action/",
            }
            # fetch() is a generator; draining it performs the download.
            output = list(ckan.fetch(spec, d))

            fname = os.path.basename(ckan_path)
            assert {fname} == set(os.listdir(d))
            assert f"Fetching {fname}\n" in output
            # The downloaded bytes must be exactly what the mock served.
            with open(os.path.join(d, fname), "rb") as fetched:
                assert fetched.read() == served

0 commit comments

Comments
 (0)