Skip to content

Commit 43ff7bb

Browse files
authored
Merge pull request #1253 from IQSS/1242-dataverse-original-file-format
[MRG] download original file formats from Dataverse #1242
2 parents 0d84b9e + 48f4cc6 commit 43ff7bb

File tree

2 files changed

+13
-4
lines changed

2 files changed

+13
-4
lines changed

repo2docker/contentproviders/dataverse.py

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -102,11 +102,18 @@ def fetch(self, spec, output_dir, yield_output=False):
102102

103103
for fobj in deep_get(record, "latestVersion.files"):
104104
file_url = (
105-
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}'
105+
# without format=original you get the preservation format (plain text, tab separated)
106+
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
106107
)
107-
filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])
108+
filename = fobj["label"]
109+
original_filename = fobj["dataFile"].get("originalFileName", None)
110+
if original_filename:
111+
# replace preservation format filename (foo.tab) with original filename (foo.dta)
112+
filename = original_filename
108113

109-
file_ref = {"download": file_url, "filename": filename}
114+
filename_with_path = os.path.join(fobj.get("directoryLabel", ""), filename)
115+
116+
file_ref = {"download": file_url, "filename": filename_with_path}
110117
fetch_map = {key: key for key in file_ref.keys()}
111118

112119
yield from self.fetch_file(file_ref, fetch_map, output_dir)

tests/unit/contentproviders/test_dataverse.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
from io import BytesIO
55
from tempfile import TemporaryDirectory
66
from unittest.mock import patch
7+
from urllib.parse import urlsplit
78
from urllib.request import Request, urlopen
89

910
import pytest
@@ -131,7 +132,8 @@ def test_dataverse_fetch(dv_files, requests_mock):
131132
spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
132133

133134
def mock_filecontent(req, context):
134-
file_no = int(req.url.split("/")[-1]) - 1
135+
parts = urlsplit(req.url)
136+
file_no = int(parts.path.split("/")[-1]) - 1
135137
return open(dv_files[file_no], "rb").read()
136138

137139
requests_mock.get(

0 commit comments

Comments
 (0)