Skip to content

Commit 9f15678

Browse files
authored
Merge pull request #1336 from depositar/ckan
[MRG] Add CKAN content provider
2 parents 09f3d53 + a390013 commit 9f15678

File tree

5 files changed

+216
-1
lines changed

5 files changed

+216
-1
lines changed

docs/source/usage.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ where ``<source-repository>`` is:
3939

4040
* a URL of a Git repository (``https://github.com/binder-examples/requirements``),
4141
* a Zenodo DOI (``10.5281/zenodo.1211089``),
42-
* a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``), or
42+
* a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``),
43+
* a URL of a CKAN_ dataset (``https://demo.ckan.org/dataset/sample-dataset-1``), or
4344
* a path to a local directory (``a/local/directory``)
4445

4546
of the source repository you want to build.
@@ -136,3 +137,4 @@ Command line API
136137

137138
.. _Pytudes: https://github.com/norvig/pytudes
138139
.. _SWHID: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
140+
.. _CKAN: https://ckan.org

repo2docker/app.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ def _default_log_level(self):
152152
contentproviders.Dataverse,
153153
contentproviders.Hydroshare,
154154
contentproviders.Swhid,
155+
contentproviders.CKAN,
155156
contentproviders.Mercurial,
156157
contentproviders.Git,
157158
],

repo2docker/contentproviders/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from .base import Local
2+
from .ckan import CKAN
23
from .dataverse import Dataverse
34
from .figshare import Figshare
45
from .git import Git

repo2docker/contentproviders/ckan.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
from datetime import datetime, timedelta, timezone
2+
from os import path
3+
from urllib.parse import parse_qs, urlencode, urlparse
4+
5+
from requests import Session
6+
7+
from .. import __version__
8+
from .base import ContentProvider
9+
10+
11+
class CKAN(ContentProvider):
    """Provide contents of a remote CKAN dataset.

    ``detect`` recognises URLs of the form ``<prefix>/dataset/<id>`` (with an
    optional activity ID) on hosts that answer the CKAN ``status_show`` API
    call, and ``fetch`` downloads every resource listed for that dataset.
    """

    def __init__(self):
        super().__init__()
        # One shared session so every request carries the same user-agent.
        self.session = Session()
        self.session.headers.update(
            {
                "user-agent": f"repo2docker {__version__}",
            }
        )

    def _fetch_version(self, api_url):
        """Fetch dataset modified date and convert to epoch.

        Borrowed from the Hydroshare provider.

        Parameters:
            api_url: base URL of the CKAN action API, ending in a slash.

        Returns:
            The ``metadata_modified`` timestamp of ``self.dataset_id`` as a
            string of whole seconds since the epoch.

        Raises:
            ValueError: if the timestamp matches neither supported layout.
        """
        package_show_url = f"{api_url}package_show?id={self.dataset_id}"
        resp = self.urlopen(package_show_url).json()
        date = resp["result"]["metadata_modified"]
        try:
            parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
        except ValueError:
            # Accept timestamps that carry no fractional-seconds part.
            parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
        # The parsed value is naive; interpret it as UTC.
        epoch = parsed_date.replace(tzinfo=timezone(timedelta(0))).timestamp()
        # truncate the timestamp
        return str(int(epoch))

    def _request(self, url, **kwargs):
        """Issue a GET request for ``url`` through the shared session."""
        return self.session.get(url, **kwargs)

    urlopen = _request

    def detect(self, source, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a CKAN dataset.

        Parameters:
            source: candidate source string, expected to be a URL.
            ref: not used by this provider.
            extra_args: not used by this provider.

        Returns:
            A spec dict with ``dataset_id``, ``activity_id``, ``api_url`` and
            ``version`` keys, or ``None`` when ``source`` is not a dataset URL
            or the host does not answer ``status_show`` with HTTP 200.
        """
        parsed_url = urlparse(source)
        if not parsed_url.netloc:
            return None

        # CKAN may be under a URL prefix, and we should accommodate that.
        # Split on the first "/dataset/" only, so that a path containing the
        # marker more than once does not raise while unpacking.
        url_prefix, separator, dataset_url = parsed_url.path.partition("/dataset/")
        if not separator:
            # Not actually a dataset
            return None

        dataset_url_parts = dataset_url.split("/")
        if not dataset_url_parts[0]:
            # ".../dataset/" with nothing after it names no dataset
            return None
        self.dataset_id = dataset_url_parts[0]

        api_url = parsed_url._replace(
            path=f"{url_prefix}/api/3/action/", query=""
        ).geturl()

        status_show_url = f"{api_url}status_show"
        resp = self.urlopen(status_show_url)
        if resp.status_code != 200:
            return None

        # Activity ID may be present either as a query parameter, activity_id
        # or as part of the URL, under `/history/<activity-id>`. If `/history/`
        # is present, that takes precedence over `activity_id`
        activity_id = None
        # Skip the first part: it is the dataset ID, which may itself be
        # the literal string "history".
        extra_parts = dataset_url_parts[1:]
        if "history" in extra_parts:
            after_history = extra_parts[extra_parts.index("history") + 1 :]
            # A trailing "/history" with no ID after it is ignored.
            if after_history and after_history[0]:
                activity_id = after_history[0]
        if activity_id is None:
            query_ids = parse_qs(parsed_url.query).get("activity_id")
            if query_ids:
                activity_id = query_ids[0]

        self.version = self._fetch_version(api_url)
        return {
            "dataset_id": self.dataset_id,
            "activity_id": activity_id,
            "api_url": api_url,
            "version": self.version,
        }

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch a CKAN dataset.

        Parameters:
            spec: dict as returned by ``detect``; ``dataset_id``,
                ``activity_id`` and ``api_url`` are read.
            output_dir: directory the resource files are written into.
            yield_output: not used by this provider.

        Yields:
            Progress messages, one per step.

        Raises:
            requests.HTTPError: if downloading a resource file fails.
        """
        dataset_id = spec["dataset_id"]
        activity_id = spec["activity_id"]

        yield f"Fetching CKAN dataset {dataset_id}.\n"

        # handle the activities
        if activity_id:
            fetch_url = f"{spec['api_url']}activity_data_show?" + urlencode(
                {"id": activity_id, "object_type": "package"}
            )
        else:
            fetch_url = f"{spec['api_url']}package_show?" + urlencode(
                {"id": dataset_id}
            )

        resp = self.urlopen(
            fetch_url,
            headers={"accept": "application/json"},
        )

        dataset = resp.json()

        yield "Fetching CKAN resources.\n"

        resources = dataset["result"]["resources"]

        for resource in resources:
            file_url = resource.get("url")
            if not file_url:
                # Nothing to download for a resource without a URL
                continue
            fname = file_url.rsplit("/", maxsplit=1)[-1]
            if fname == "":
                # URL ends in a slash; fall back to the resource ID
                fname = resource["id"]

            yield f"Requesting {file_url}\n"
            # Use the response as a context manager so the streamed
            # connection is released even if the download fails midway.
            with self._request(file_url, stream=True) as file_resp:
                file_resp.raise_for_status()

                dst_fname = path.join(output_dir, fname)
                with open(dst_fname, "wb") as dst:
                    yield f"Fetching {fname}\n"
                    for chunk in file_resp.iter_content(chunk_size=None):
                        dst.write(chunk)

    @property
    def content_id(self):
        """A unique ID to represent the version of the content."""
        return f"{self.dataset_id}.v{self.version}"
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import os
2+
from contextlib import contextmanager
3+
from tempfile import NamedTemporaryFile, TemporaryDirectory
4+
5+
from repo2docker.contentproviders import CKAN
6+
7+
8+
def test_detect_ckan(requests_mock):
    """Check the spec returned by detect() for each supported URL shape."""
    requests_mock.get("http://demo.ckan.org/api/3/action/status_show", status_code=200)
    requests_mock.get(
        "http://demo.ckan.org/api/3/action/package_show?id=1234",
        json={"result": {"metadata_modified": "2024-02-27T14:15:54.573058"}},
    )

    plain_spec = {
        "dataset_id": "1234",
        "activity_id": None,
        "api_url": "http://demo.ckan.org/api/3/action/",
        "version": "1709043354",
    }
    # Same dataset, but pinned to a specific activity
    activity_spec = {**plain_spec, "activity_id": "5678"}

    cases = (
        ("http://demo.ckan.org/dataset/1234", plain_spec),
        ("http://demo.ckan.org/dataset/1234?activity_id=5678", activity_spec),
        ("http://demo.ckan.org/dataset/1234/history/5678", activity_spec),
    )
    for url, wanted in cases:
        # A fresh provider per URL, as detect() stores state on the instance
        assert CKAN().detect(url) == wanted
34+
35+
36+
def test_detect_not_ckan():
    """Sources that are not CKAN datasets must not trigger the provider."""
    rejected_sources = (
        "/some/path/here",
        "https://example.com/path/here",
        # NOTE(review): no requests_mock fixture is used here, so this case
        # appears to need real network access -- confirm.
        "https://data.gov.tw/dataset/6564",
    )
    for candidate in rejected_sources:
        assert CKAN().detect(candidate) is None
41+
42+
43+
@contextmanager
def ckan_file():
    """Yield the path of a temporary file holding ``b"some content"``.

    The file is removed when the context exits.
    """
    with NamedTemporaryFile() as file:
        file.write(b"some content")
        # Flush the buffered write so that code opening the file by name
        # sees the bytes rather than an empty file.
        file.flush()
        yield file.name
48+
49+
50+
def test_ckan_fetch(requests_mock):
    """Fetch a mocked dataset with and without an activity ID.

    In both cases the single resource must end up in the output directory
    under its original file name and with the served content.
    """
    with ckan_file() as ckan_path:
        # Read the payload once and close the handle straight away instead
        # of leaving an open file object behind.
        with open(ckan_path, "rb") as src:
            content = src.read()

        mock_response = {"result": {"resources": [{"url": f"file://{ckan_path}"}]}}
        requests_mock.get(
            "http://demo.ckan.org/api/3/action/package_show?id=1234", json=mock_response
        )
        requests_mock.get(
            "http://demo.ckan.org/api/3/action/activity_data_show?id=5678",
            json=mock_response,
        )
        requests_mock.get(f"file://{ckan_path}", content=content)

        ckan = CKAN()
        spec = {"dataset_id": "1234", "api_url": "http://demo.ckan.org/api/3/action/"}

        # fetch() names the file after the last URL segment
        fname = ckan_path.rsplit("/", maxsplit=1)[1]

        for activity_id in (None, "5678"):
            spec["activity_id"] = activity_id
            with TemporaryDirectory() as d:
                # fetch() is a generator; it must be drained to do the work
                list(ckan.fetch(spec, d))
                assert {fname} == set(os.listdir(d))
                with open(os.path.join(d, fname), "rb") as fetched:
                    assert fetched.read() == content

0 commit comments

Comments
 (0)