Skip to content

Commit 41816d8

Browse files
authored
fix(core): adapt to Zenodo JSON-LD changes and send Referer header on Zenodo requests (#3643)
1 parent e4c06cb commit 41816d8

File tree

4 files changed

+51
-28
lines changed

4 files changed

+51
-28
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,7 @@ add_ignore = ["D105", "D107", "D202", "D401"]
214214

215215
[tool.bandit]
216216
skips = ["B101", "B603", "B607", "B404"]
217+
exclude_dirs = ["tests"]
217218

218219
[tool.isort]
219220
multi_line_output = 3

renku/core/dataset/providers/zenodo.py

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
from renku.core.dataset.providers.repository import RepositoryImporter, make_request
3535
from renku.core.util import communication
3636
from renku.core.util.doi import is_doi
37+
from renku.core.util.requests import get_redirect_url
3738
from renku.core.util.urls import remove_credentials
3839
from renku.domain_model.project_context import project_context
3940

@@ -80,7 +81,9 @@ def supports(uri):
8081
@staticmethod
8182
def get_record_id(uri):
8283
"""Extract record id from URI."""
83-
return urlparse(uri).path.split("/")[-1]
84+
parts = urlparse(uri).path.split("/")
85+
parts = [p for p in parts if p.isdigit()]
86+
return parts[-1]
8487

8588
@staticmethod
8689
def get_export_parameters() -> List["ProviderParameter"]:
@@ -121,7 +124,7 @@ def __init__(self, *, uri: str, original_uri, json: Dict[str, Any]):
121124

122125
metadata = self._json.pop("metadata", {})
123126
self._json["metadata"] = ZenodoMetadataSerializer.from_metadata(metadata) if metadata is not None else None
124-
record_id = self._json.pop("record_id", None)
127+
record_id = self._json.pop("record_id", None) or self._json.pop("recid", None)
125128
self._json["record_id"] = str(record_id) if record_id is not None else None
126129

127130
# NOTE: Make sure that these properties have a default value
@@ -136,11 +139,11 @@ def version(self):
136139
@property
137140
def latest_uri(self):
138141
"""Get URI of latest version."""
139-
return self._json["links"].get("latest_html")
142+
return get_redirect_url(self._json["links"].get("latest"))
140143

141144
def is_latest_version(self):
142145
"""Check if this record is the latest version."""
143-
return ZenodoProvider.get_record_id(self._json["links"].get("latest_html")) == self._json["record_id"]
146+
return ZenodoProvider.get_record_id(self.latest_uri) == self._json["record_id"]
144147

145148
def get_jsonld(self):
146149
"""Get record metadata as jsonld."""
@@ -173,18 +176,19 @@ def fetch_provider_dataset(self) -> "ProviderDataset":
173176
from renku.domain_model.dataset import Url, generate_default_slug
174177

175178
class ZenodoDatasetSchema(ProviderDatasetSchema):
176-
"""Schema for Dataverse datasets."""
179+
"""Schema for Zenodo datasets."""
177180

178181
@pre_load
179182
def fix_data(self, data, **kwargs):
180-
"""Fix data that is received from Dataverse."""
183+
"""Fix data that is received from Zenodo."""
181184
# Fix context
182185
context = data.get("@context")
183186
if context and isinstance(context, str):
187+
if not context.endswith("/"):
188+
context = f"{context}/"
184189
if context == "https://schema.org/":
185190
context = "http://schema.org/"
186191
data["@context"] = {"@base": context, "@vocab": context}
187-
188192
# Add type to creators
189193
creators = data.get("creator", [])
190194
for c in creators:
@@ -194,6 +198,10 @@ def fix_data(self, data, **kwargs):
194198
license = data.get("license")
195199
if license and isinstance(license, dict):
196200
data["license"] = license.get("url", "")
201+
# Fix keywords: convert a comma-separated string into a list
202+
keywords = data.get("keywords")
203+
if keywords and isinstance(keywords, str):
204+
data["keywords"] = [k.strip() for k in keywords.split(",")]
197205

198206
# Delete existing isPartOf
199207
data.pop("isPartOf", None)
@@ -228,17 +236,17 @@ def fix_data(self, data, **kwargs):
228236
class ZenodoFileSerializer:
229237
"""Zenodo record file."""
230238

231-
def __init__(self, *, id=None, checksum=None, links=None, filename=None, filesize=None):
239+
def __init__(self, *, id=None, checksum=None, links=None, key=None, size=None, **kwargs):
232240
self.id = id
233241
self.checksum = checksum
234242
self.links = links
235-
self.filename = filename
236-
self.filesize = filesize
243+
self.filename = key
244+
self.filesize = size
237245

238246
@property
239247
def remote_url(self):
240248
"""Get remote URL as ``urllib.ParseResult``."""
241-
return urllib.parse.urlparse(self.links["download"])
249+
return urllib.parse.urlparse(self.links["self"])
242250

243251
@property
244252
def type(self):
@@ -325,7 +333,10 @@ def from_metadata(cls, metadata: Dict[str, Any]) -> "ZenodoMetadataSerializer":
325333
class ZenodoExporter(ExporterApi):
326334
"""Zenodo export manager."""
327335

328-
HEADERS = {"Content-Type": "application/json"}
336+
HEADERS = {
337+
"Content-Type": "application/json",
338+
"Referer": f"https://{os.environ.get('RENKU_DOMAIN', 'zenodo.org')}",
339+
}
329340

330341
def __init__(self, dataset, publish, tag):
331342
super().__init__(dataset)
@@ -503,7 +514,9 @@ def publish_deposition(self):
503514
"""Publish existing deposition."""
504515
from renku.core.util import requests
505516

506-
response = requests.post(url=self.publish_url, params=self.exporter.default_params)
517+
response = requests.post(
518+
url=self.publish_url, params=self.exporter.default_params, headers=self.exporter.HEADERS
519+
)
507520
self._check_response(response)
508521

509522
return response
@@ -517,14 +530,21 @@ def _check_response(response):
517530
except errors.RequestError:
518531
if response.status_code == 400:
519532
err_response = response.json()
520-
messages = [
521-
'"{}" failed with "{}"'.format(err["field"], err["message"]) for err in err_response["errors"]
522-
]
533+
if "errors" in err_response:
534+
messages = [
535+
'"{}" failed with "{}"'.format(err["field"], ", ".join(err["messages"]))
536+
for err in err_response["errors"]
537+
]
538+
elif "message" in err_response:
539+
messages = [err_response["message"]]
540+
else:
541+
messages = [response.text()]
523542

524543
raise errors.ExportError(
525544
"\n" + "\n".join(messages) + "\nSee `renku dataset edit -h` for details on how to edit" " metadata"
526545
)
527546
else:
547+
print(response.status_code)
528548
raise errors.ExportError(response.content)
529549

530550

tests/cli/test_integration_datasets.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -397,29 +397,27 @@ def test_dataset_import_renku_provider_with_subgroups(runner, project, uri):
397397
@pytest.mark.vcr
398398
def test_dataset_import_renkulab_dataset_with_image(runner, project, with_injection):
399399
"""Test dataset import from Renkulab projects."""
400+
# dataset is https://dev.renku.ch/projects/renku-python-integration-tests/lego-datasets/datasets/colors/
400401
result = runner.invoke(
401-
cli, ["dataset", "import", "https://dev.renku.ch/datasets/4f36f891bb7c4b2bab137633cc270a40"], input="y"
402+
cli, ["dataset", "import", "https://dev.renku.ch/datasets/5952ea58de934fe188680a0e626a259c"], input="y"
402403
)
403404

404405
assert 0 == result.exit_code, format_result_exception(result)
405-
assert "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" in result.output
406+
assert "158c016d7338e9874a2a70972ed62ca22e2ce7ae" in result.output
406407

407408
assert "0" in result.output
408409
assert "OK" in result.output
409410

410411
result = runner.invoke(cli, ["dataset", "ls-files"])
411412
assert 0 == result.exit_code, format_result_exception(result)
412-
assert "bla" in result.output
413+
assert "colors.csv" in result.output
413414

414415
with with_injection():
415416
dataset = [d for d in DatasetGateway().get_all_active_datasets()][0]
416-
assert 2 == len(dataset.images)
417-
img1 = next(i for i in dataset.images if i.position == 1)
418-
img2 = next(i for i in dataset.images if i.position == 2)
417+
assert 1 == len(dataset.images)
418+
img1 = next(i for i in dataset.images if i.position == 0)
419419

420-
assert img1.content_url == "https://example.com/image1.jpg"
421-
assert img2.content_url.endswith("/2.png")
422-
assert os.path.exists(project.path / img2.content_url)
420+
assert img1.content_url == ".renku/dataset_images/41033ca2758944678718dde9140431f1/0.png"
423421

424422

425423
@pytest.mark.integration
@@ -822,7 +820,6 @@ def test_dataset_export_upload_failure(runner, tmpdir, project, zenodo_sandbox):
822820
result = runner.invoke(cli, ["dataset", "export", "my-dataset", "zenodo"])
823821

824822
assert 1 == result.exit_code, result.output + str(result.stderr_bytes)
825-
assert "metadata.creators.0.affiliation" in result.output
826823
assert "metadata.description" in result.output
827824

828825

@@ -940,7 +937,10 @@ def test_export_dataset_unauthorized(
940937
result = runner.invoke(cli, ["dataset", "export", "my-dataset", provider] + params)
941938

942939
assert 1 == result.exit_code, result.output + str(result.stderr_bytes)
943-
assert "Access unauthorized - update access token." in result.output, format_result_exception(result)
940+
# NOTE: For some reason, Zenodo may return a referer error when a wrong token is supplied
941+
assert (
942+
"Access unauthorized - update access token." in result.output or "Referer checking failed" in result.output
943+
), format_result_exception(result)
944944

945945
secret = get_value("zenodo", "secret")
946946
assert secret is None
@@ -1232,6 +1232,7 @@ def test_dataset_update_zenodo(project, runner, doi):
12321232
commit_sha_after_file1_delete = project.repository.head.commit.hexsha
12331233

12341234
before_dataset = get_dataset_with_injection("imported_dataset")
1235+
assert before_dataset is not None
12351236

12361237
result = runner.invoke(cli, ["dataset", "update", "--all", "--dry-run"])
12371238

@@ -1245,6 +1246,7 @@ def test_dataset_update_zenodo(project, runner, doi):
12451246
assert 0 == result.exit_code, format_result_exception(result) + str(result.stderr_bytes)
12461247

12471248
after_dataset = get_dataset_with_injection("imported_dataset")
1249+
assert after_dataset is not None
12481250
assert after_dataset.version != before_dataset.version
12491251
assert after_dataset.id != before_dataset.id
12501252
assert after_dataset.derived_from is None

tests/service/views/test_cache_views.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -956,7 +956,7 @@ def test_cache_gets_synchronized(local_remote_repository, directory_tree, quick_
956956

957957
assert response
958958
assert 200 == response.status_code
959-
assert {"slug", "remote_branch"} == set(response.json["result"].keys())
959+
assert {"git_url", "slug", "remote_branch"} == set(response.json["result"].keys())
960960

961961
remote_repo_checkout.pull()
962962

0 commit comments

Comments
 (0)