Commit 1eccb26

Add dataset types to Dataset and Dataverse (#52)
* add ability to set the dataset type
* ignore linter type issue in test
* remove `datasetType` for version compat
* remove print
1 parent 48a8379 commit 1eccb26

9 files changed (+251, -13 lines)

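Taken together, the changes below add an optional dataset type to `Dataset`, validate it against the types a Dataverse installation reports, and expose those types on `Dataverse`. A minimal sketch of the intended workflow, mirroring the integration test added in this commit (the server URL and token are placeholders, the import path is assumed from the module layout below, and a Dataverse installation of version 6.4 or newer is required):

    from easyDataverse.dataverse import Dataverse  # import path assumed

    dataverse = Dataverse(
        server_url="https://demo.dataverse.org",  # placeholder
        api_token="MY-API-TOKEN",                 # placeholder
    )

    dataset = dataverse.create_dataset()
    dataset.dataset_type = "dataset"  # checked against the types the server reports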

easyDataverse/dataset.py

Lines changed: 56 additions & 2 deletions
@@ -6,11 +6,12 @@
 import nob
 import xmltodict
 import yaml
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, ValidationInfo, field_validator

 from dvuploader import File, add_directory

 from easyDataverse.base import DataverseBase
+from easyDataverse.datasettype import DatasetType
 from easyDataverse.license import CustomLicense, License
 from easyDataverse.uploader import update_dataset, upload_to_dataverse
 from easyDataverse.utils import YAMLDumper
@@ -54,9 +55,51 @@ class Dataset(BaseModel):
         description="The files of the dataset.",
     )

+    dataset_type: Optional[str] = Field(
+        default=None,
+        description="The type of the dataset.",
+    )
+
     API_TOKEN: Optional[str] = Field(None)
     DATAVERSE_URL: Optional[str] = Field(None)

+    # ! Validators
+    @field_validator("dataset_type", mode="after")
+    def _validate_dataset_type(
+        cls,
+        dataset_type: Optional[str],
+        info: ValidationInfo,
+    ) -> Optional[str]:
+        """Validates the dataset type against available types in the Dataverse installation.
+
+        This validator ensures that the provided dataset type is valid and available
+        in the target Dataverse installation. It fetches the available dataset types
+        from the Dataverse instance and validates the provided type against them.
+
+        Note:
+            If dataset_type is None, validation is skipped and None is returned.
+            The DATAVERSE_URL must be set in the model for validation to work.
+        """
+        if dataset_type is None:
+            return dataset_type
+        elif info.data["DATAVERSE_URL"] is None:
+            raise ValueError(
+                "No Dataverse URL has been provided. Please provide a Dataverse URL to validate the dataset type.",
+                "This error should not happen and is likely a bug in the code.",
+                "Please report this issue https://github.com/gdcc/easyDataverse/issues",
+            )
+
+        available_types = DatasetType.from_instance(info.data["DATAVERSE_URL"])  # type: ignore
+        available_names = [type.name for type in available_types]
+
+        if dataset_type not in available_names:
+            raise ValueError(
+                f"Dataset type '{dataset_type}' is not available in the Dataverse installation. "
+                f"Please use 'list_dataset_types' to see which dataset types are available."
+            )
+
+        return dataset_type
+
     # ! Adders
     def add_metadatablock(self, metadatablock: DataverseBase) -> None:
         """Adds a metadatablock object to the dataset if it is of 'DataverseBase' type and has a metadatablock name"""
@@ -190,13 +233,24 @@ def dataverse_dict(self) -> dict:
         else:
             terms = {}

+        dataset_type = self._get_dataset_type()
+
         return {
+            "datasetType": dataset_type,
             "datasetVersion": {
                 "metadataBlocks": blocks,
                 **terms,
-            }
+            },
         }

+    def _get_dataset_type(self) -> str:
+        """Returns the dataset type of the dataset."""
+
+        if self.dataset_type is None:
+            return "dataset"
+
+        return self.dataset_type
+
     def dataverse_json(self, indent: int = 2) -> str:
         """Returns a JSON representation of the dataverse dataset."""
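For reference, a sketch of the payload shape `dataverse_dict()` now produces; block contents are elided, and the new top-level "datasetType" key falls back to "dataset" via `_get_dataset_type` when no type was set:

    {
        "datasetType": "dataset",
        "datasetVersion": {
            "metadataBlocks": {...},
        },
    }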

easyDataverse/datasettype.py

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
+from typing import List
+from urllib.parse import urljoin
+from pydantic import BaseModel, Field
+import httpx
+from pyDataverse.api import NativeApi
+
+
+class DatasetType(BaseModel):
+    """
+    Represents a dataset type in Dataverse.
+
+    A dataset type defines the structure and metadata requirements for datasets
+    in a Dataverse instance, including which metadata blocks are linked to it.
+    """
+
+    id: int = Field(..., description="The ID of the dataset type")
+    name: str = Field(..., description="The name of the dataset type")
+    linkedMetadataBlocks: list[str] = Field(
+        default_factory=list,
+        description="The metadata blocks linked to the dataset type",
+    )
+
+    @classmethod
+    def from_instance(cls, base_url: str) -> List["DatasetType"]:
+        """
+        Retrieve all dataset types from a Dataverse instance.
+
+        Args:
+            base_url: The base URL of the Dataverse instance
+
+        Returns:
+            A list of DatasetType objects representing all dataset types
+            available in the Dataverse instance
+
+        Raises:
+            httpx.HTTPStatusError: If the API request fails
+            ValueError: If the Dataverse instance is not at least version 6.4
+        """
+        native_api = NativeApi(base_url=base_url)
+
+        if cls._get_version(native_api) < (6, 4):
+            raise ValueError(
+                "Dataset types are only supported in Dataverse 6.4 and above"
+            )
+
+        url = urljoin(native_api.base_url, "api/datasets/datasetTypes")
+        response = httpx.get(url)
+
+        if not response.is_success:
+            # If there are no dataset types, the response is a 200 with an empty list
+            return []
+
+        return [cls.model_validate(item) for item in response.json()["data"]]
+
+    @staticmethod
+    def _get_version(native_api: NativeApi) -> tuple[int, int]:
+        """
+        Get the version of the Dataverse instance.
+        """
+        response = native_api.get_info_version()
+        response.raise_for_status()
+        version = response.json()["data"]["version"]
+        major, minor = version.split(".", 1)
+        return int(major), int(minor)
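A short usage sketch for the new helper; the URL is a placeholder, and the result depends on what the target installation has configured (a fresh instance typically reports only the built-in "dataset" type). Note that `_get_version` parses a version string such as "6.4" into the tuple (6, 4) before the comparison:

    from easyDataverse.datasettype import DatasetType

    types = DatasetType.from_instance("https://demo.dataverse.org")  # placeholder URL
    for t in types:
        print(t.id, t.name, t.linkedMetadataBlocks)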

easyDataverse/dataverse.py

Lines changed: 36 additions & 1 deletion
@@ -1,11 +1,13 @@
 import asyncio
 from copy import deepcopy
+from functools import cached_property
 import json
 from uuid import UUID
 from typing import Callable, Dict, List, Optional, Tuple, IO
 from urllib import parse

 import httpx
+from easyDataverse.datasettype import DatasetType
 from easyDataverse.license import CustomLicense, License
 from rich.panel import Panel
 from rich.progress import Progress, SpinnerColumn, TextColumn
@@ -112,6 +114,25 @@ def default_license(self) -> License:
         """The default license of the Dataverse installation."""
         return next(filter(lambda x: x.is_default, self.licenses.values()))

+    @computed_field(
+        description="The dataset types available in the Dataverse installation."
+    )
+    @cached_property
+    def dataset_types(self) -> Dict[str, DatasetType]:
+        """The dataset types available in the Dataverse installation."""
+        if self.native_api is None:
+            raise ValueError(
+                "Native API is not available. Please connect to a Dataverse installation first."
+            )
+
+        try:
+            return {
+                dataset_type.name: dataset_type
+                for dataset_type in DatasetType.from_instance(self.native_api.base_url)
+            }
+        except ValueError:
+            return {}
+
     def _connect(self) -> None:
         """Connects to a Dataverse installation and adds all metadtablocks as classes.
@@ -299,6 +320,17 @@ def list_licenses(self):

         print("\n")

+    def list_dataset_types(self):
+        """Lists the dataset types available in the Dataverse installation."""
+        rich.print("[bold]Dataset Types[/bold]")
+        for dataset_type in self.dataset_types.values():
+            if dataset_type.name == "dataset":
+                print(f"- {dataset_type.name} (default)")
+            else:
+                print(f"- {dataset_type.name}")
+
+        print("\n")
+
     # ! Dataset Handlers

     def create_dataset(self) -> Dataset:
@@ -308,7 +340,9 @@ def create_dataset(self) -> Dataset:
         Returns:
             Dataset: The newly created dataset.
         """
-        return self._dataset_gen()
+        dataset = self._dataset_gen()
+        dataset._dataverse = self
+        return dataset

     @classmethod
     def load_from_url(
@@ -409,6 +443,7 @@ def load_dataset(
             dataset.license = custom_license

         dataset.p_id = latest_version.datasetPersistentId  # type: ignore
+        dataset.dataset_type = remote_ds.data.get("datasetType", None)  # type: ignore
         blocks = latest_version.metadataBlocks  # type: ignore
         files = latest_version.files  # type: ignore
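Once connected, the cached property and the listing helper can be exercised as follows. A sketch, assuming `dataverse` is a connected client; the mapping comes back empty on servers older than 6.4 because the ValueError raised by `DatasetType.from_instance` is swallowed:

    # Mapping of type name -> DatasetType, computed once per client instance.
    print(list(dataverse.dataset_types))

    # Console listing that marks "dataset" as the default type.
    dataverse.list_dataset_types()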

easyDataverse/uploader.py

Lines changed: 5 additions & 5 deletions
@@ -34,7 +34,7 @@ def upload_to_dataverse(
         str: The resulting DOI of the dataset, if successful.
     """

-    api, _ = _initialize_pydataverse(DATAVERSE_URL, API_TOKEN)
+    api, _ = _initialize_pydataverse(DATAVERSE_URL, API_TOKEN)  # type: ignore
     ds = Dataset()
     ds.from_json(json_data)
@@ -50,21 +50,21 @@
     if p_id:
         create_params["pid"] = p_id

-    response = api.create_dataset(**create_params)
+    response = api.create_dataset(**create_params)  # type: ignore
     response.raise_for_status()

     # Get response data
     p_id = response.json()["data"]["persistentId"]

     _uploadFiles(
         files=files,
-        p_id=p_id,
-        api=api,
+        p_id=p_id,  # type: ignore
+        api=api,  # type: ignore
         n_parallel=n_parallel,
     )  # type: ignore

     console = Console()
-    url = urljoin(DATAVERSE_URL, f"dataset.xhtml?persistentId={p_id}")
+    url = urljoin(DATAVERSE_URL, f"dataset.xhtml?persistentId={p_id}")  # type: ignore
     panel = Panel(
         f"🎉 {url}",
         title="Dataset URL",

tests/integration/test_dataset_creation.py

Lines changed: 55 additions & 0 deletions
@@ -1,4 +1,5 @@
 import os
+from pydantic import ValidationError
 import pytest
 from easyDataverse.dataset import Dataset
@@ -95,6 +96,59 @@ def test_creation_and_upload(
             "File should be in the sub-directory"
         )

+    @pytest.mark.integration
+    def test_creation_and_upload_with_dataset_type(
+        self,
+        credentials,
+    ):
+        # Arrange
+        base_url, api_token = credentials
+        dataverse = Dataverse(
+            server_url=base_url,
+            api_token=api_token,
+        )
+
+        # Act
+        dataset = dataverse.create_dataset()
+
+        dataset.dataset_type = "dataset"
+        dataset.citation.title = "My dataset"
+        dataset.citation.subject = ["Other"]
+        dataset.citation.add_author(name="John Doe")
+        dataset.citation.add_ds_description(
+            value="This is a description of the dataset",
+            date="2024",
+        )
+        dataset.citation.add_dataset_contact(
+            name="John Doe",
+            email="john@doe.com",
+        )
+
+        pid = dataset.upload(dataverse_name="root")
+
+        # Re-fetch the dataset
+        dataset = dataverse.load_dataset(pid)
+
+        assert dataset.dataset_type == "dataset"
+
+    @pytest.mark.integration
+    def test_creation_invalid_dataset_type(
+        self,
+        credentials,
+    ):
+        # Arrange
+        base_url, api_token = credentials
+        dataverse = Dataverse(
+            server_url=base_url,
+            api_token=api_token,
+        )
+
+        # Act
+        dataset = dataverse.create_dataset()
+
+        with pytest.raises(ValidationError):
+            dataset.dataset_type = "invalid"
+
     @pytest.mark.integration
     def test_creation_other_license(
         self,
@@ -227,6 +281,7 @@ def test_tab_ingest_disabled(
     @staticmethod
     def sort_citation(dataset: Dataset):
         dv_dict = dataset.dataverse_dict()
+        del dv_dict["datasetType"]
         citation = dv_dict["datasetVersion"]["metadataBlocks"]["citation"]
         citation_fields = citation["fields"]
         dv_dict["datasetVersion"]["metadataBlocks"]["citation"]["fields"] = sorted(

tests/integration/test_dataset_download.py

Lines changed: 3 additions & 0 deletions
@@ -179,6 +179,9 @@ def test_dataset_download_with_file_and_filter_pattern(

     @staticmethod
     def sort_citation(dataset: Dict):
+        if "datasetType" in dataset:
+            del dataset["datasetType"]
+
         citation = dataset["datasetVersion"]["metadataBlocks"]["citation"]
         citation_fields = citation["fields"]
         dataset["datasetVersion"]["metadataBlocks"]["citation"]["fields"] = sorted(

tests/integration/test_dataset_update.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ def test_dataset_update(

     # Fetch the dataset and update the title
     dataset = dataverse.load_dataset(pid)
-    dataset.citation.title = "Title has changed"
+    dataset.citation.title = "Title has changed"  # type: ignore
     dataset.update()

     # Re-fetch the dataset
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+import pytest
+from easyDataverse.datasettype import DatasetType
+
+
+class TestDatasetType:
+    """Integration tests for DatasetType functionality."""
+
+    @pytest.mark.integration
+    def test_dataset_type_from_instance(self, credentials):
+        """
+        Test retrieving dataset types from a Dataverse instance.
+
+        This test verifies that we can successfully fetch dataset types
+        from a Dataverse installation and that the returned data matches
+        the expected structure.
+
+        Args:
+            credentials: Fixture providing base_url and api_token for testing
+        """
+        base_url, _ = credentials
+        dataset_types = DatasetType.from_instance(base_url)
+
+        assert len(dataset_types) > 0
+        expected_dataset_types = [
+            DatasetType(id=1, name="dataset", linkedMetadataBlocks=[]),
+        ]
+        assert dataset_types == expected_dataset_types
