Skip to content

Commit cafbbb4

Browse files
committed
feat: add return_type csv
1 parent d83fe84 commit cafbbb4

File tree

11 files changed

+203
-27
lines changed

11 files changed

+203
-27
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ repos:
9494
additional_dependencies:
9595
- orjson # Ref: https://github.com/python/mypy/blob/v1.13.0/CHANGELOG.md#improved-performance
9696
- httpx>=0.27
97+
- pandas-stubs>=2.2
9798
- pytest>=8.2
9899
- respx>=0.21
99100
- typer>=0.12

README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,11 @@ Repository(re3data_org_identifier='r3d100010468', repository_name=RepositoryName
4545
- dataclass (default): Returns a Python dataclass object, allowing convenient access to the elements of the re3data
4646
schema
4747
- response: Returns a Python object representing the API response
48-
- original XML: Returns the raw XML response from the API
49-
- JSON: Returns a JSON representation of the API response
48+
- original XML (str): Returns the raw XML response from the API
49+
- JSON (str): Returns a JSON representation of the API response
5050
- dictionary: Returns a dictionary representation of the API response
51+
- CSV (str): Returns a CSV representation of the API response
52+
- dataframe: Returns a pandas.DataFrame representation of the API response
5153

5254
## Requirements
5355

@@ -61,6 +63,8 @@ Repository(re3data_org_identifier='r3d100010468', repository_name=RepositoryName
6163
schemas, simplifies processing of API responses.
6264
- **Optional CLI**: [typer](https://github.com/tiangolo/typer), a popular library for building command-line interfaces,
6365
powers the user-friendly interface.
66+
- **Optional DataFrame/CSV**: [pandas](https://github.com/pandas-dev/pandas), a powerful and flexible data analysis
67+
library, enables generation of DataFrames and CSV files from parsed XML responses.
6468

6569
## Installation
6670

docs/src/install.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,14 @@ Install with `python -m pip install "python-re3data[cli]"`.
129129
| ------------------------------------------ | ------- | ------------------------------------------------------------------------------------------- |
130130
| [typer](https://github.com/tiangolo/typer) | >= 0.12 | A popular library for building command-line interfaces, powers the user-friendly interface. |
131131

132+
#### CSV
133+
134+
Install with `python -m pip install "python-re3data[csv]"`.
135+
136+
| Package | Version | Description |
137+
| ---------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------ |
138+
| [pandas](https://github.com/pandas-dev/pandas) | >= 2.0 | A powerful and flexible data analysis library, enables generation of DataFrames and CSV files from parsed XML responses. |
139+
132140
<!---
133141
This installation guide is adapted from these sources:
134142
- "pandas" Installation, https://pandas.pydata.org/docs/getting_started/install.html (BSD-3-Clause license)

pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,12 @@ dependencies = [
4949
optional-dependencies.cli = [
5050
"typer>=0.12",
5151
]
52+
optional-dependencies.csv = [
53+
"pandas>=2",
54+
]
5255
optional-dependencies.dev = [
5356
"pre-commit-uv~=4.1",
54-
"python-re3data[cli]",
57+
"python-re3data[cli,csv]",
5558
]
5659
optional-dependencies.docs = [
5760
"mike~=2.1",

src/re3data/_client/_async.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,9 @@
2222
from re3data._response import Response, _build_response
2323

2424
if TYPE_CHECKING:
25-
from re3data._resources import Repository, RepositorySummary
25+
from pandas import DataFrame
2626

27+
from re3data._resources import Repository, RepositorySummary
2728
logger = logging.getLogger(__name__)
2829

2930

@@ -59,7 +60,7 @@ async def list(
5960
query: str | None = None,
6061
return_type: ReturnType = ReturnType.DATACLASS,
6162
count: bool = False,
62-
) -> list[RepositorySummary] | Response | dict[str, Any] | str | int:
63+
) -> list[RepositorySummary] | Response | dict[str, Any] | DataFrame | str | int:
6364
"""List the metadata of all repositories in the re3data API.
6465
6566
Args:
@@ -83,7 +84,7 @@ async def list(
8384

8485
async def get(
8586
self, repository_id: str, return_type: ReturnType = ReturnType.DATACLASS
86-
) -> Repository | Response | dict[str, Any] | str:
87+
) -> Repository | Response | dict[str, Any] | DataFrame | str:
8788
"""Get the metadata of a specific repository.
8889
8990
Args:

src/re3data/_client/_sync.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@
2424
from re3data._response import Response, _build_response
2525

2626
if TYPE_CHECKING:
27-
from re3data._resources import Repository, RepositorySummary
27+
from pandas import DataFrame
2828

29+
from re3data._resources import Repository, RepositorySummary
2930
logger = logging.getLogger(__name__)
3031

3132

@@ -61,7 +62,7 @@ def list(
6162
query: str | None = None,
6263
return_type: ReturnType = ReturnType.DATACLASS,
6364
count: bool = False,
64-
) -> list[RepositorySummary] | Response | dict[str, Any] | str | int:
65+
) -> list[RepositorySummary] | Response | dict[str, Any] | DataFrame | str | int:
6566
"""List the metadata of all repositories in the re3data API.
6667
6768
Args:
@@ -85,7 +86,7 @@ def list(
8586

8687
def get(
8788
self, repository_id: str, return_type: ReturnType = ReturnType.DATACLASS
88-
) -> Repository | Response | dict[str, Any] | str:
89+
) -> Repository | Response | dict[str, Any] | DataFrame | str:
8990
"""Get the metadata of a specific repository.
9091
9192
Args:

src/re3data/_client/base.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,11 @@
1313

1414
from re3data import __version__
1515
from re3data._response import Response, _count_repositories, _parse_repositories_response, _parse_repository_response
16-
from re3data._serializer import _to_dict, _to_json
16+
from re3data._serializer import _to_csv, _to_dataframe, _to_dict, _to_json
1717

1818
if TYPE_CHECKING:
19+
from pandas import DataFrame
20+
1921
from re3data._resources import Repository, RepositorySummary
2022

2123
BASE_URL: str = "https://www.re3data.org/api/beta/"
@@ -37,7 +39,9 @@ class ResourceType(str, Enum):
3739

3840

3941
class ReturnType(str, Enum):
42+
CSV = "csv"
4043
DATACLASS = "dataclass"
44+
DATAFRAME = "dataframe"
4145
DICT = "dict"
4246
JSON = "json"
4347
RESPONSE = "response"
@@ -80,19 +84,19 @@ def _build_query_params(query: str | None = None) -> dict[str, str]:
8084
@overload
8185
def _dispatch_return_type(
8286
response: Response, resource_type: Literal[ResourceType.REPOSITORY], return_type: ReturnType, count: bool = False
83-
) -> Repository | Response | dict[str, Any] | str: ...
87+
) -> Repository | Response | dict[str, Any] | DataFrame | str: ...
8488
@overload
8589
def _dispatch_return_type(
8690
response: Response,
8791
resource_type: Literal[ResourceType.REPOSITORY_LIST],
8892
return_type: ReturnType,
8993
count: bool = False,
90-
) -> list[RepositorySummary] | Response | dict[str, Any] | str | int: ...
94+
) -> list[RepositorySummary] | Response | dict[str, Any] | DataFrame | str | int: ...
9195

9296

93-
def _dispatch_return_type(
97+
def _dispatch_return_type( # noqa: PLR0911
9498
response: Response, resource_type: ResourceType, return_type: ReturnType, count: bool = False
95-
) -> Repository | list[RepositorySummary] | Response | dict[str, Any] | str | int:
99+
) -> Repository | list[RepositorySummary] | Response | dict[str, Any] | DataFrame | str | int:
96100
"""Dispatch the response to the correct return type based on the provided return type and resource type.
97101
98102
Args:
@@ -105,14 +109,15 @@ def _dispatch_return_type(
105109
Depending on the return_type and resource_type, this can be a Repository object, a list of RepositorySummary
106110
objects, an HTTP response, the original XML, a JSON or CSV string, a dictionary, a DataFrame or a repository count.
107111
"""
112+
# return the count of repositories, the response or the original xml before parsing the response
108113
if resource_type == ResourceType.REPOSITORY_LIST and count:
109114
return _count_repositories(response.text)
110-
111115
if return_type == ReturnType.RESPONSE:
112116
return response
113117
if return_type == ReturnType.XML:
114118
return response.text
115119

120+
# all subsequent return types rely on parsing the response first
116121
parsed: Repository | list[RepositorySummary]
117122
if resource_type == ResourceType.REPOSITORY_LIST:
118123
parsed = _parse_repositories_response(response)
@@ -121,9 +126,16 @@ def _dispatch_return_type(
121126
if return_type == ReturnType.DATACLASS:
122127
return parsed
123128

129+
# JSON and dictionary
124130
if return_type == ReturnType.JSON:
125131
return _to_json(parsed)
126-
return _to_dict(parsed)
132+
if return_type == ReturnType.DICT:
133+
return _to_dict(parsed)
134+
135+
# tabular representations: DataFrame and CSV
136+
if return_type == ReturnType.DATAFRAME:
137+
return _to_dataframe(parsed)
138+
return _to_csv(parsed)
127139

128140

129141
class BaseClient:

src/re3data/_serializer.py

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,41 @@
22
#
33
# SPDX-License-Identifier: MIT
44

5-
"""The _serializer module offers functions for converting parsed data into dictionaries or JSON strings.
5+
"""The _serializer module offers functions for converting parsed data into various return types.
66
7-
This module provides functions to serialize various types of data into dictionaries or JSON strings.
7+
This module provides functions to serialize various types of data, e.g. into dictionaries or JSON strings.
88
The serialized data can be used for further processing or storage.
99
1010
Functions:
1111
_to_dict: Serialize parsed data into a dictionary.
1212
_to_json: Serialize parsed data into a JSON string.
13+
_to_dataframe: Serialize parsed data into a DataFrame.
14+
_to_csv: Serialize parsed data into a CSV string.
1315
"""
1416

15-
from typing import Any
17+
from __future__ import annotations
18+
19+
import logging
20+
import sys
21+
from typing import TYPE_CHECKING, Any
22+
23+
try:
24+
from pandas import json_normalize
25+
26+
PANDAS_INSTALLED = True
27+
except ImportError:
28+
PANDAS_INSTALLED = False
1629

1730
from xsdata.formats.dataclass.context import XmlContext
1831
from xsdata.formats.dataclass.serializers import DictEncoder, JsonSerializer
1932
from xsdata.formats.dataclass.serializers.config import SerializerConfig
2033

21-
from re3data._resources import Repository, RepositorySummary
34+
if TYPE_CHECKING:
35+
from pandas import DataFrame
2236

37+
from re3data._resources import Repository, RepositorySummary
38+
39+
logger = logging.getLogger(__name__)
2340
CONFIG = SerializerConfig(indent=" ")
2441
CONTEXT = XmlContext()
2542

@@ -51,3 +68,35 @@ def _to_json(parsed: Repository | list[RepositorySummary]) -> str:
5168
A JSON representation of the input data.
5269
"""
5370
return JSON_SERIALIZER.render(parsed)
71+
72+
73+
def _to_dataframe(parsed: Repository | list[RepositorySummary]) -> DataFrame:
74+
"""Serialize parsed data into a DataFrame.
75+
76+
Args:
77+
parsed: The input data to be serialized. It can be either a single `Repository` object or a list of
78+
`RepositorySummary` objects.
79+
80+
Returns:
81+
A DataFrame representation of the input data.
82+
"""
83+
if PANDAS_INSTALLED:
84+
return json_normalize(_to_dict(parsed))
85+
logger.error("`pandas` is missing. Please run 'pip install python-re3data[csv]'.")
86+
sys.exit(1)
87+
88+
89+
def _to_csv(parsed: Repository | list[RepositorySummary]) -> str:
90+
"""Serialize parsed data into a CSV string.
91+
92+
Args:
93+
parsed: The input data to be serialized. It can be either a single `Repository` object or a list of
94+
`RepositorySummary` objects.
95+
96+
Returns:
97+
A CSV string representation of the input data.
98+
"""
99+
if PANDAS_INSTALLED:
100+
return _to_dataframe(parsed).to_csv(index=False)
101+
logger.error("`pandas` is missing. Please run 'pip install python-re3data[csv]'.")
102+
sys.exit(1)

tests/integration/test_async_client.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import httpx
1010
import pytest
11+
from pandas import DataFrame
1112

1213
from re3data import RepositoryNotFoundError, Response, ReturnType
1314
from re3data._resources import Repository, RepositoryName, RepositorySummary
@@ -64,6 +65,21 @@ async def test_client_list_repositories_dict(async_client: AsyncClient, mock_rep
6465
assert repository["id"] == "r3d100010371"
6566

6667

68+
async def test_client_list_repositories_csv(async_client: AsyncClient, mock_repository_list_route: Route) -> None:
69+
response = await async_client.repositories.list(return_type=ReturnType.CSV)
70+
assert isinstance(response, str)
71+
assert response.startswith("id,doi,name,")
72+
assert "r3d100010371" in response
73+
assert "https://doi.org/10.17616/R3P594" in response
74+
75+
76+
async def test_client_list_repositories_dataframe(async_client: AsyncClient, mock_repository_list_route: Route) -> None:
77+
response = await async_client.repositories.list(return_type=ReturnType.DATAFRAME)
78+
assert isinstance(response, DataFrame)
79+
assert response.shape == (3, 5)
80+
assert response["id"].loc[0] == "r3d100010371"
81+
82+
6783
async def test_client_list_repositories_response(async_client: AsyncClient, mock_repository_list_route: Route) -> None:
6884
response = await async_client.repositories.list(return_type=ReturnType.RESPONSE)
6985
assert isinstance(response, Response)
@@ -139,6 +155,24 @@ async def test_client_get_single_repository_dict(
139155
assert response["re3data.orgIdentifier"] == zenodo_id
140156

141157

158+
async def test_client_get_single_repository_csv(
159+
async_client: AsyncClient, mock_repository_get_route: Route, zenodo_id: str
160+
) -> None:
161+
response = await async_client.repositories.get(zenodo_id, return_type=ReturnType.CSV)
162+
assert isinstance(response, str)
163+
assert response.startswith("re3data.orgIdentifier,additionalName,repositoryURL,")
164+
assert "r3d100010468" in response
165+
166+
167+
async def test_client_get_single_repository_dataframe(
168+
async_client: AsyncClient, mock_repository_get_route: Route, zenodo_id: str
169+
) -> None:
170+
response = await async_client.repositories.get(zenodo_id, return_type=ReturnType.DATAFRAME)
171+
assert isinstance(response, DataFrame)
172+
assert response.shape == (1, 43)
173+
assert response["re3data.orgIdentifier"].loc[0] == "r3d100010468"
174+
175+
142176
async def test_client_get_single_repository_response(
143177
async_client: AsyncClient, mock_repository_get_route: Route, zenodo_id: str
144178
) -> None:

0 commit comments

Comments
 (0)