Skip to content

Commit e54c24c

Browse files
committed
Add support for the SWHID content provider
This content provider allows retrieving the content from a Software Heritage (SWH) persistent identifier (SWHID). Typical usage: repo2docker swh:1:rev:94dca98c006b80309704c717b5d83dff3c1fa3a0 It uses the SWH public vault API to retrieve the content of the given directory. Most of the time, this will not need an authentication token to bypass the rate-limiting of the SWH API. Without authentication, one should be allowed to retrieve one directory content per minute. If this is not enough, then the user must use authenticated calls to the SWH API. For this, a new `swh_token` config item has been added to the Repo2Docker application class. To use authentication: repo2docker --config cfg.json swh:1:rev:94dca98c006b80309704c717b5d83dff3c1fa3a0 with the swh_token config option being defined in the cfg.json config file.
1 parent 983607f commit e54c24c

File tree

5 files changed

+289
-0
lines changed

5 files changed

+289
-0
lines changed

repo2docker/app.py

100644100755
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ def _default_log_level(self):
148148
contentproviders.Figshare,
149149
contentproviders.Dataverse,
150150
contentproviders.Hydroshare,
151+
contentproviders.Swhid,
151152
contentproviders.Mercurial,
152153
contentproviders.Git,
153154
],
@@ -269,6 +270,18 @@ def _user_name_default(self):
269270
allow_none=True,
270271
)
271272

273+
swh_token = Unicode(
274+
None,
275+
help="""
276+
Token to use authenticated SWH API access.
277+
278+
If unset, default to unauthenticated (limited) usage of the Software
279+
Heritage API.
280+
""",
281+
config=True,
282+
allow_none=True,
283+
)
284+
272285
cleanup_checkout = Bool(
273286
False,
274287
help="""
@@ -395,6 +408,10 @@ def fetch(self, url, ref, checkout_path):
395408
"No matching content provider found for " "{url}.".format(url=url)
396409
)
397410

411+
swh_token = self.config.get("swh_token", self.swh_token)
412+
if swh_token and isinstance(picked_content_provider, contentproviders.Swhid):
413+
picked_content_provider.set_auth_token(swh_token)
414+
398415
for log_line in picked_content_provider.fetch(
399416
spec, checkout_path, yield_output=self.json_logs
400417
):

repo2docker/contentproviders/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@
55
from .dataverse import Dataverse
66
from .hydroshare import Hydroshare
77
from .mercurial import Mercurial
8+
from .swhid import Swhid
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
import io
2+
import os
3+
import shutil
4+
import tarfile
5+
import time
6+
import re
7+
8+
from os import path
9+
10+
import requests
11+
12+
from .base import ContentProvider
13+
from ..utils import copytree
14+
from .. import __version__
15+
16+
17+
def parse_swhid(swhid):
    """Parse the core part of a SWHID string.

    Anything after the first ``;`` (the qualifiers) is ignored.

    Returns a dict with the keys ``version``, ``type`` and ``hash`` (all
    strings) when the identifier core is well formed, ``None`` otherwise.
    """
    swhid_regexp = r"^swh:(?P<version>\d+):(?P<type>ori|cnt|rev|dir|snp|rel):(?P<hash>[0-9a-f]{40})$"
    # only parse/check the <identifier_core> of the swhid
    # see https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
    # fullmatch (rather than match) so that a trailing newline is rejected:
    # with match(), "$" also matches just before a final "\n".
    m = re.fullmatch(swhid_regexp, swhid.split(";")[0])
    if m:
        return m.groupdict()
    return None
24+
25+
26+
class Swhid(ContentProvider):
    """Provide contents of a repository identified by a SWHID.

    Only version 1 identifiers of type ``dir`` or ``rev`` are accepted. The
    directory content is requested from the vault endpoint of the Software
    Heritage API as a tar archive and unpacked into the output directory.
    """

    # seconds slept after a failed connection attempt and between two polls
    # of the vault status
    retry_delay = 5

    def __init__(self):
        self.swhid = None
        self.base_url = "https://archive.softwareheritage.org/api/1"
        self.session = requests.Session()
        self.session.headers.update(
            {
                "user-agent": "repo2docker {}".format(__version__),
            }
        )

    def set_auth_token(self, token):
        """Send `token` as a bearer token with every request of the session."""
        header = {"Authorization": "Bearer {}".format(token)}
        self.session.headers.update(header)

    def _request(self, url, method="GET"):
        """Send a request to `url`, trying up to three times.

        A trailing slash is appended to `url` when missing. The first
        response with an OK status is returned; if no attempt gets one, the
        last response received is returned and the caller must check its
        status.

        Raises:
            requests.ConnectionError: if no attempt produced a response.
        """
        if not url.endswith("/"):
            url = url + "/"

        attempts = 3
        resp = None
        last_error = None
        for attempt in range(attempts):
            try:
                resp = self.session.request(method, url)
            except requests.ConnectionError as exc:
                last_error = exc
                # no point in sleeping once the last attempt has failed
                if attempt < attempts - 1:
                    time.sleep(self.retry_delay)
                continue
            if resp.ok:
                break

        if resp is None:
            # every attempt failed to connect: there is no response to return
            raise last_error
        return resp

    @property
    def content_id(self):
        """The SWHID record ID used for content retrieval"""
        return self.swhid

    def detect(self, swhid, ref=None, extra_args=None):
        """Return a fetch spec if `swhid` is a supported SWHID, else ``None``.

        Supported means: the identifier core parses, its type is ``dir`` or
        ``rev`` and its version is ``1``. `ref` and `extra_args` are not used.
        """
        swhid_dict = parse_swhid(swhid)

        if (
            swhid_dict
            and swhid_dict["type"] in ("dir", "rev")
            and swhid_dict["version"] == "1"
        ):
            return {"swhid": swhid, "swhid_obj": swhid_dict}
        return None

    @staticmethod
    def _check_archive_members(archive, output_dir):
        """Raise RuntimeError if a member of `archive` would land outside `output_dir`."""
        root = path.realpath(output_dir)
        for member in archive.getmembers():
            target = path.realpath(path.join(root, member.name))
            if path.commonpath([root, target]) != root:
                raise RuntimeError(
                    "Refusing to extract {!r} outside of {}".format(member.name, root)
                )

    def fetch_directory(self, dir_hash, output_dir):
        """Download the directory `dir_hash` and unpack it into `output_dir`.

        This is a generator yielding log lines. The archive is requested with
        a POST on the vault URL, then the same URL is polled with GET every
        `retry_delay` seconds until the reported status is ``done`` or
        ``failed``.

        Raises:
            requests.HTTPError: if a request ends with an error status.
            RuntimeError: if the vault reports a failure, or if the archive
                contains a member that would be written outside `output_dir`.
        """
        url = "{}/vault/directory/{}/".format(self.base_url, dir_hash)
        yield "Fetching directory {} from {}\n".format(dir_hash, url)
        resp = self._request(url, "POST")
        resp.raise_for_status()
        receipt = resp.json()
        while receipt["status"] not in ("failed", "done"):
            time.sleep(self.retry_delay)
            resp = self._request(url)
            resp.raise_for_status()
            receipt = resp.json()
        if receipt["status"] == "failed":
            yield "Error preparing the directory for download"
            raise RuntimeError(
                "Preparation of directory {} failed: {}".format(dir_hash, receipt)
            )
        resp = self._request(receipt["fetch_url"])
        resp.raise_for_status()
        with tarfile.open(fileobj=io.BytesIO(resp.content)) as archive:
            # the archive comes from the network: check member paths first
            self._check_archive_members(archive, output_dir)
            archive.extractall(path=output_dir)
        # the output_dir should have only one subdir named after the dir_hash
        # move its content one level up
        copytree(path.join(output_dir, dir_hash), output_dir)
        shutil.rmtree(path.join(output_dir, dir_hash))
        yield "Fetched files: {}\n".format(os.listdir(output_dir))

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch the content described by `spec` into `output_dir`.

        `spec` is the dict returned by `detect`. For a ``rev`` identifier the
        revision is first resolved to its directory; `content_id` is then the
        SWHID of that directory. This is a generator yielding log lines.
        `yield_output` is accepted but not used here.

        Raises:
            requests.HTTPError: if the revision lookup ends with an error status.
        """
        swhid = spec["swhid"]
        swhid_obj = spec["swhid_obj"]

        if swhid_obj["type"] == "rev":
            # need to get the directory for this revision
            sha1git = swhid_obj["hash"]
            url = "{}/revision/{}/".format(self.base_url, sha1git)
            yield "Fetching revision {} from {}\n".format(sha1git, url)
            resp = self._request(url)
            # explicit check instead of an assert: asserts vanish under -O,
            # and the error must not echo the session headers (auth token)
            resp.raise_for_status()
            directory = resp.json()["directory"]
            self.swhid = "swh:1:dir:{}".format(directory)
            yield from self.fetch_directory(directory, output_dir)
        elif swhid_obj["type"] == "dir":
            self.swhid = swhid
            yield from self.fetch_directory(swhid_obj["hash"], output_dir)

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ def get_identifier(json):
5656
"ruamel.yaml>=0.15",
5757
"toml",
5858
"semver",
59+
"requests",
5960
],
6061
python_requires=">=3.6",
6162
author="Project Jupyter Contributors",
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
import json
2+
import os
3+
import io
4+
import tarfile
5+
import shutil
6+
import re
7+
import urllib
8+
import pytest
9+
import tempfile
10+
import logging
11+
import requests_mock
12+
13+
from os import makedirs
14+
from os.path import join
15+
from unittest.mock import patch, MagicMock, mock_open
16+
from zipfile import ZipFile
17+
18+
from repo2docker.contentproviders.swhid import Swhid, parse_swhid
19+
from repo2docker.contentproviders.base import ContentProviderException
20+
21+
22+
# this is a slightly stripped down copy of swh.model.cli.swhid_of_dir().
23+
# We do not use the latter to avoid having to depend on swh.model[cli]
24+
def swhid_of_dir(path):
    """Compute the SWHID of the directory found at `path`.

    NOTE(review): `Directory`, `DIRECTORY` and `swhid` are not imported in the
    visible part of this module, so calling this helper presumably raises
    NameError - confirm whether it is still needed.
    """
    dir_data = Directory.from_disk(path=path).get_data()
    return swhid(DIRECTORY, dir_data)
27+
28+
29+
def test_content_id():
    """A provider that has not fetched anything has no content id."""
    provider = Swhid()
    assert provider.content_id is None
32+
33+
34+
# identifiers detect() must accept: version 1, type dir or rev, 40 hex digits
swhids_ok = ["swh:1:" + kind + ":" + "0" * 40 for kind in ("dir", "rev")]

# identifiers detect() must reject
swhids_invalid = [
    "swh:1:dir:" + "0" * 39,  # hash too short
    "swh:2:dir:" + "0" * 40,  # version other than 1
    "swh:1:rev:" + "0" * 41,  # hash too long
] + [
    # well formed, but of a type the provider does not handle
    "swh:1:" + kind + ":" + "0" * 40
    for kind in ("cnt", "ori", "rel", "snp")
]

# (input, expected detect() result) pairs: accepted identifiers first
detect_values = [
    (accepted, {"swhid": accepted, "swhid_obj": parse_swhid(accepted)})
    for accepted in swhids_ok
]
detect_values += [(rejected, None) for rejected in swhids_invalid]
51+
52+
53+
@pytest.mark.parametrize("swhid, expected", detect_values)
def test_detect(swhid, expected):
    """detect() returns the expected spec (or None) for each identifier."""
    detected = Swhid().detect(swhid)
    assert detected == expected
57+
58+
59+
def fake_urlopen(req):
    """Print `req`, then hand back its ``headers`` attribute."""
    print(req)
    headers = req.headers
    return headers
62+
63+
64+
def test_unresolving_swhid():
    """Only checks that a provider can be instantiated; nothing is asserted."""
    Swhid()

    # TODO: the checks below are disabled
    # swhid = "0" * 40
    # assert provider.swhid2url(swhid) is swhid
69+
70+
71+
# all-zero, 40 character hash used as the revision id in the mocked API
NULLID = "".join("0" for _ in range(40))
72+
73+
74+
@pytest.fixture
def gen_tarfile(tmpdir):
    """Build an in-memory tar archive containing a single small file.

    The file is written below `tmpdir`, added to the archive under a top-level
    directory named after `dirhash`, then removed from disk again.

    Returns a ``(dirhash, archive_bytes)`` tuple.
    """
    rootdir = join(tmpdir, "tmp")
    makedirs(rootdir)
    with open(join(rootdir, "file1.txt"), "wb") as fobj:
        fobj.write(b"Some content\n")

    # this directory hash can be computed using the swh.model package, but we do
    # not want to depend on the latter to limit dependencies and because it
    # does not support python 3.6;
    dirhash = "89a3bd29a2c5ae0b1465febbe5df09730a8576fe"
    buf = io.BytesIO()
    # context manager so the archive is finalised and closed even if add() fails
    with tarfile.open(name=dirhash, fileobj=buf, mode="w") as tarf:
        tarf.add(rootdir, arcname=dirhash)
    shutil.rmtree(rootdir)
    return dirhash, buf.getvalue()
91+
92+
93+
def mocked_provider(tmpdir, dirhash, tarfile_buf):
    """Return a Swhid provider whose session is served by a requests_mock adapter.

    The adapter answers the revision lookup for NULLID, the vault request for
    `dirhash` (POST: "new", then GET: "pending" followed by "done") and the
    raw download, which returns `tarfile_buf`. `tmpdir` is not used.
    """
    revision_url = "mock://api/1/revision/{}/".format(NULLID)
    vault_url = "mock://api/1/vault/directory/{}/".format(dirhash)
    raw_url = "mock://api/1/vault/directory/{}/raw/".format(dirhash)

    def vault_receipt(status):
        # JSON body the vault endpoint returns for the given status
        return {"fetch_url": raw_url, "status": status}

    adapter = requests_mock.Adapter()
    provider = Swhid()
    provider.base_url = "mock://api/1"
    # keep the polling loop fast
    provider.retry_delay = 0.1
    provider.session.mount("mock://", adapter)

    adapter.register_uri(
        "GET",
        revision_url,
        json={
            "author": {"fullname": "John Doe <[email protected]>"},
            "directory": dirhash,
        },
    )
    adapter.register_uri("POST", vault_url, json=vault_receipt("new"))
    # successive GETs walk through this list of responses
    adapter.register_uri(
        "GET",
        vault_url,
        [
            {"json": vault_receipt("pending")},
            {"json": vault_receipt("done")},
        ],
    )
    adapter.register_uri("GET", raw_url, content=tarfile_buf)
    return provider
140+
141+
142+
def test_fetch_revision(tmpdir, gen_tarfile):
    """Fetching a revision SWHID ends up with the SWHID of its directory."""
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    spec = provider.detect("swh:1:rev:" + NULLID)
    # fetch() is a generator: it has to be consumed for the download to happen
    for log in provider.fetch(spec, tmpdir):
        print(log)
    expected_id = "swh:1:dir:" + dir_id
    assert provider.content_id == expected_id
149+
150+
151+
def test_fetch_directory(tmpdir, gen_tarfile):
    """Fetching a directory SWHID keeps that SWHID as the content id."""
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    swhid = "swh:1:dir:" + dir_id
    spec = provider.detect(swhid)
    # fetch() is a generator: it has to be consumed for the download to happen
    for log in provider.fetch(spec, tmpdir):
        print(log)
    assert provider.content_id == swhid

0 commit comments

Comments
 (0)