Skip to content

Commit 938d91c

Browse files
committed
download original file formats from Dataverse #1242
Dataverse creates plain-text, preservation-friendly copies of certain file formats (some of which are proprietary, such as Stata or SPSS). That preservation copy, a .tab (tab-separated) file, is what gets downloaded unless you supply `format=original`; this pull request adds that parameter to the download URL. The original filename (e.g. foo.dta, a Stata file) comes from `originalFileName`, which is only populated when the preservation copy (e.g. foo.tab) has been successfully created. Additional variables were introduced to distinguish between `filename`, `original_filename`, and `filename_with_path`. If `original_filename` is available, it is the right one to use. To keep the tests passing, the test mock now strips the query string from the request URL so that just the file id can be cast as an int.
1 parent 21fa80f commit 938d91c

File tree

2 files changed

+13
-4
lines changed

2 files changed

+13
-4
lines changed

repo2docker/contentproviders/dataverse.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,18 @@ def fetch(self, spec, output_dir, yield_output=False):
102102

103103
for fobj in deep_get(record, "latestVersion.files"):
104104
file_url = (
105-
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}'
105+
# without format=original you get the preservation format (plain text, tab separated)
106+
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
106107
)
107-
filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])
108+
filename = fobj["label"]
109+
original_filename = fobj["dataFile"].get("originalFileName", None)
110+
if original_filename:
111+
# replace preservation format filename (foo.tab) with original filename (foo.dta)
112+
filename = original_filename
108113

109-
file_ref = {"download": file_url, "filename": filename}
114+
filename_with_path = os.path.join(fobj.get("directoryLabel", ""), filename)
115+
116+
file_ref = {"download": file_url, "filename": filename_with_path}
110117
fetch_map = {key: key for key in file_ref.keys()}
111118

112119
yield from self.fetch_file(file_ref, fetch_map, output_dir)

tests/unit/contentproviders/test_dataverse.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from tempfile import TemporaryDirectory
66
from unittest.mock import patch
77
from urllib.request import Request, urlopen
8+
from urllib.parse import urlsplit
89

910
import pytest
1011

@@ -131,7 +132,8 @@ def test_dataverse_fetch(dv_files, requests_mock):
131132
spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
132133

133134
def mock_filecontent(req, context):
134-
file_no = int(req.url.split("/")[-1]) - 1
135+
parts = urlsplit(req.url)
136+
file_no = int(parts.path.split("/")[-1]) - 1
135137
return open(dv_files[file_no], "rb").read()
136138

137139
requests_mock.get(

0 commit comments

Comments
 (0)