Commit d68b599

Merge branch 'develop' into 11752-croissant-restricted
2 parents be9607b + bf08caf

File tree: 17 files changed, +464 −47 lines

.github/workflows/spi_release.yml

Lines changed: 2 additions & 2 deletions

@@ -42,7 +42,7 @@ jobs:
       with:
         java-version: '17'
         distribution: 'adopt'
-        server-id: ossrh
+        server-id: central
         server-username: MAVEN_USERNAME
         server-password: MAVEN_PASSWORD
     - uses: actions/cache@v4
@@ -80,7 +80,7 @@ jobs:
       with:
         java-version: '17'
         distribution: 'adopt'
-        server-id: ossrh
+        server-id: central
         server-username: MAVEN_USERNAME
         server-password: MAVEN_PASSWORD
         gpg-private-key: ${{ secrets.DATAVERSEBOT_GPG_KEY }}

conf/mdc/counter_weekly.sh

Lines changed: 92 additions & 0 deletions

@@ -0,0 +1,92 @@
#!/bin/sh
# counter_weekly.sh

# This script iterates through all published Datasets in all Dataverses and calls
# the Make Data Count API to update their citations from DataCite.
# Note: requires curl and jq for parsing the JSON responses from curl.

# A recursive function to process each Dataverse
processDV () {
  echo "Processing Dataverse ID#: $1"

  # Call the Dataverse API to get the contents of the Dataverse
  # (without credentials, this will only list published datasets and dataverses)
  DVCONTENTS=$(curl -s http://localhost:8080/api/dataverses/$1/contents)

  # Iterate over all datasets, pulling the value of their DOIs (as part of the persistentUrl) from the JSON returned
  for subds in $(echo "${DVCONTENTS}" | jq -r '.data[] | select(.type == "dataset") | .persistentUrl'); do

    # The authority/identifier are preceded by a protocol/host, i.e. https://doi.org/
    DOI=`expr "$subds" : '.*:\/\/doi\.org\/\(.*\)'`

    # Call the Dataverse API for this dataset and capture both the response and HTTP status code
    HTTP_RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "http://localhost:8080/api/admin/makeDataCount/:persistentId/updateCitationsForDataset?persistentId=doi:$DOI")

    # Extract the HTTP status code from the last line
    HTTP_STATUS=$(echo "$HTTP_RESPONSE" | tail -n1)
    # Extract the response body (everything except the last line)
    RESPONSE_BODY=$(echo "$HTTP_RESPONSE" | sed '$d')

    # Check the HTTP status code and report accordingly
    case $HTTP_STATUS in
      200)
        # Successfully queued; extract the status from the nested data object
        STATUS=$(echo "$RESPONSE_BODY" | jq -r '.data.status')

        # Extract the message from the nested data object, if present
        if echo "$RESPONSE_BODY" | jq -e '.data.message' > /dev/null 2>&1 && [ "$(echo "$RESPONSE_BODY" | jq -r '.data.message')" != "null" ]; then
          MESSAGE=$(echo "$RESPONSE_BODY" | jq -r '.data.message')
          echo "[SUCCESS] doi:$DOI - $STATUS: $MESSAGE"
        else
          # If the message is missing or null, just show the status
          echo "[SUCCESS] doi:$DOI - $STATUS: Citation update queued"
        fi
        ;;
      400)
        # Bad request
        if echo "$RESPONSE_BODY" | jq -e '.message' > /dev/null 2>&1; then
          ERROR=$(echo "$RESPONSE_BODY" | jq -r '.message')
          echo "[ERROR 400] doi:$DOI - Bad request: $ERROR"
        else
          echo "[ERROR 400] doi:$DOI - Bad request"
        fi
        ;;
      404)
        # Not found
        if echo "$RESPONSE_BODY" | jq -e '.message' > /dev/null 2>&1; then
          ERROR=$(echo "$RESPONSE_BODY" | jq -r '.message')
          echo "[ERROR 404] doi:$DOI - Not found: $ERROR"
        else
          echo "[ERROR 404] doi:$DOI - Not found"
        fi
        ;;
      503)
        # Service unavailable (queue full)
        if echo "$RESPONSE_BODY" | jq -e '.message' > /dev/null 2>&1; then
          ERROR=$(echo "$RESPONSE_BODY" | jq -r '.message')
          echo "[ERROR 503] doi:$DOI - Service unavailable: $ERROR"
        elif echo "$RESPONSE_BODY" | jq -e '.data.message' > /dev/null 2>&1; then
          ERROR=$(echo "$RESPONSE_BODY" | jq -r '.data.message')
          echo "[ERROR 503] doi:$DOI - Service unavailable: $ERROR"
        else
          echo "[ERROR 503] doi:$DOI - Service unavailable: Queue is full"
        fi
        ;;
      *)
        # Other error
        echo "[ERROR $HTTP_STATUS] doi:$DOI - Unexpected error"
        echo "Response: $RESPONSE_BODY"
        ;;
    esac

  done

  # Now iterate over any child Dataverses and recursively process them
  for subdv in $(echo "${DVCONTENTS}" | jq -r '.data[] | select(.type == "dataverse") | .id'); do
    echo $subdv
    processDV $subdv
  done
}

# Call the function on the root dataverse to start processing
processDV 1
Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
The ExportDataProvider framework in the dataverse-spi package has been extended, adding extra options for developers of metadata exporter plugins.
See the [documentation](https://guides.dataverse.org/en/latest/developers/metadataexport.html#building-an-exporter) in the Metadata Export guide for details.
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
The /api/admin/makeDataCount/{id}/updateCitationsForDataset endpoint, which allows citations for a dataset to be retrieved from DataCite, is often called periodically for all datasets. However, allowing calls for many datasets to be processed in parallel can cause performance problems in Dataverse and/or cause calls to DataCite to fail due to rate limiting. The existing implementation was also inefficient with respect to memory use on datasets with many (>~1K) files. This release configures Dataverse to queue calls to this API and process them serially, adds optional throttling to avoid hitting DataCite rate limits, and improves memory use.

New optional MPConfig setting:

dataverse.api.mdc.min-delay-ms - the number of milliseconds to wait between calls to DataCite. A value of ~100 should conservatively address DataCite's current limit of 3000 calls per 5 minutes. A value of 250 may be required for their test service.

Backward compatibility: this API call is now asynchronous and will return an OK response when the call is queued, or a 503 if the queue is full.
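
As an illustration of the new contract, here is a minimal Java 11+ HttpClient sketch (not part of the release) that calls the endpoint and handles the queued (200) and queue-full (503) responses; the host, port, and DOI are placeholder assumptions:

    import java.net.URI;
    import java.net.http.HttpClient;
    import java.net.http.HttpRequest;
    import java.net.http.HttpResponse;

    public class UpdateCitationsExample {
        public static void main(String[] args) throws Exception {
            String doi = "10.5072/FK2/EXAMPLE"; // hypothetical DOI of a published dataset
            HttpClient client = HttpClient.newHttpClient();
            HttpRequest request = HttpRequest.newBuilder()
                    .uri(URI.create("http://localhost:8080/api/admin/makeDataCount/"
                            + ":persistentId/updateCitationsForDataset?persistentId=doi:" + doi))
                    .POST(HttpRequest.BodyPublishers.noBody())
                    .build();
            HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
            if (response.statusCode() == 200) {
                // The citation update was queued; the count is no longer returned.
                System.out.println("Queued: " + response.body());
            } else if (response.statusCode() == 503) {
                // The queue (default size 1000) is full; retry later.
                System.out.println("Queue is full, retry later: " + response.body());
            } else {
                System.out.println("Unexpected " + response.statusCode() + ": " + response.body());
            }
        }
    }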

doc/sphinx-guides/source/admin/make-data-count.rst

Lines changed: 2 additions & 0 deletions

@@ -166,6 +166,8 @@ The example :download:`counter_weekly.sh <../_static/util/counter_weekly.sh>` wi
 Citations will be retrieved for each published dataset and recorded in your Dataverse installation's database.
+
+Note that the :ref:`dataverse.api.mdc.min-delay-ms` setting can be used to avoid rate-limit errors from DataCite.

 For how to get the citations out of your Dataverse installation, see "Retrieving Citations for a Dataset" under :ref:`Dataset Metrics <dataset-metrics-api>` in the :doc:`/api/native-api` section of the API Guide.

 Please note that while the Dataverse Software has a metadata field for "Related Dataset", this information is not currently sent as a citation to Crossref.

doc/sphinx-guides/source/api/changelog.rst

Lines changed: 4 additions & 0 deletions

@@ -7,6 +7,10 @@ This API changelog is experimental and we would love feedback on its usefulness.
    :local:
    :depth: 1

+v6.9
+----
+- The POST /api/admin/makeDataCount/{id}/updateCitationsForDataset processing is now asynchronous and the response no longer includes the number of citations. The response can be OK if the request is queued, or 503 if the queue is full (the default queue size is 1000).
+
 v6.8
 ----
doc/sphinx-guides/source/developers/making-library-releases.rst

Lines changed: 26 additions & 0 deletions

@@ -36,6 +36,32 @@ Releasing a Snapshot Version to Maven Central

 That is to say, to make a snapshot release, you only need to get one or more commits into the default branch.

+It's possible, of course, to make snapshot releases outside of GitHub Actions, from environments such as your laptop. Generally, you'll want to look at the GitHub Action and try to do the equivalent. You'll need a file set up locally at ``~/.m2/settings.xml`` with the following (contact a core developer for the redacted bits):
+
+.. code-block:: xml
+
+    <settings>
+      <servers>
+        <server>
+          <id>central</id>
+          <username>REDACTED</username>
+          <password>REDACTED</password>
+        </server>
+      </servers>
+    </settings>
+
+Then, study the GitHub Action and perform similar commands from your local environment. For example, as of this writing, for the dataverse-spi project, you can run the following commands, substituting the suffix you need:
+
+``mvn -f modules/dataverse-spi -Dproject.version.suffix="2.1.0-PR11767-SNAPSHOT" verify``
+
+``mvn -f modules/dataverse-spi -Dproject.version.suffix="2.1.0-PR11767-SNAPSHOT" deploy``
+
+This will upload the snapshot here, for example: https://central.sonatype.com/repository/maven-snapshots/io/gdcc/dataverse-spi/2.1.02.1.0-PR11767-SNAPSHOT/dataverse-spi-2.1.02.1.0-PR11767-20250827.182026-1.jar
+
+Before OSSRH was retired, you could browse through snapshot jars you published at https://s01.oss.sonatype.org/content/repositories/snapshots/io/gdcc/dataverse-spi/2.0.0-PR9685-SNAPSHOT/, for example. Now, even though you may see the URL of the jar as shown above during the "deploy" step, if you try to browse the various snapshot jars at https://central.sonatype.com/repository/maven-snapshots/io/gdcc/dataverse-spi/2.1.02.1.0-PR11767-SNAPSHOT/ you'll see "This maven2 hosted repository is not directly browseable at this URL. Please use the browse or HTML index views to inspect the contents of this repository." Sadly, the "browse" and "HTML index" links don't work, as noted in a `question <https://community.sonatype.com/t/this-maven2-group-repository-is-not-directly-browseable-at-this-url/8991>`_ on the Sonatype Community forum. One way to confirm that the jar was uploaded properly is to use Maven to copy the jar to your local directory and then compare checksums:
+
+``mvn dependency:copy -DrepoUrl=https://central.sonatype.com/repository/maven-snapshots/ -Dartifact=io.gdcc:dataverse-spi:2.1.02.1.0-PR11767-SNAPSHOT -DoutputDirectory=.``

 Releasing a Release (Non-Snapshot) Version to Maven Central
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

doc/sphinx-guides/source/installation/config.rst

Lines changed: 16 additions & 0 deletions

@@ -3731,6 +3731,22 @@ Example:

 Can also be set via any `supported MicroProfile Config API source`_, e.g. the environment variable ``DATAVERSE_CORS_HEADERS_EXPOSE``.

+.. _dataverse.api.mdc.min-delay-ms:
+
+dataverse.api.mdc.min-delay-ms
+++++++++++++++++++++++++++++++
+
+Minimum delay in milliseconds between Make Data Count (MDC) requests sent to DataCite by the /api/admin/makeDataCount/{id}/updateCitationsForDataset API.
+This setting helps prevent overloading the MDC service by enforcing a minimum time interval between consecutive requests.
+If a request arrives before this interval has elapsed since the previous request, it will be rate-limited.
+
+Default: ``0`` (no delay enforced)
+
+Example: ``dataverse.api.mdc.min-delay-ms=100`` (enforces a minimum 100 ms delay between MDC API requests)
+
+Can also be set via any `supported MicroProfile Config API source`_, e.g. the environment variable ``DATAVERSE_API_MDC_MIN_DELAY_MS``.

 .. _feature-flags:

 Feature Flags
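
To make the throttling behavior concrete, here is a minimal Java sketch of how a serial queue worker could enforce a minimum delay between consecutive outgoing DataCite calls. This is assumed logic for illustration only, not the actual Dataverse implementation; the class and method names are hypothetical:

    // Enforces a dataverse.api.mdc.min-delay-ms-style gap between consecutive calls.
    public class MinDelayThrottle {
        private final long minDelayMs;
        private long lastCallMs = 0L;

        public MinDelayThrottle(long minDelayMs) { // e.g. 100 for DataCite production
            this.minDelayMs = minDelayMs;
        }

        // Call before each outgoing request; blocks only if the previous call was too recent.
        public synchronized void await() throws InterruptedException {
            long waitMs = (lastCallMs + minDelayMs) - System.currentTimeMillis();
            if (waitMs > 0) {
                Thread.sleep(waitMs);
            }
            lastCallMs = System.currentTimeMillis();
        }
    }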

modules/dataverse-spi/pom.xml

Lines changed: 7 additions & 3 deletions

@@ -13,7 +13,7 @@
     <groupId>io.gdcc</groupId>
     <artifactId>dataverse-spi</artifactId>
-    <version>2.0.0${project.version.suffix}</version>
+    <version>2.1.0${project.version.suffix}</version>
     <packaging>jar</packaging>

     <name>Dataverse SPI Plugin API</name>
@@ -64,11 +64,13 @@
     <distributionManagement>
         <snapshotRepository>
-            <id>ossrh</id>
-            <url>https://s01.oss.sonatype.org/content/repositories/snapshots</url>
+            <id>central</id>
+            <url>https://central.sonatype.com/repository/maven-snapshots/</url>
         </snapshotRepository>
         <repository>
+            <!--TODO: change this from ossrh to central?-->
             <id>ossrh</id>
+            <!--TODO: change this url?-->
             <url>https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/</url>
         </repository>
     </distributionManagement>
@@ -110,7 +112,9 @@
             <artifactId>nexus-staging-maven-plugin</artifactId>
             <extensions>true</extensions>
             <configuration>
+                <!--TODO: change this from ossrh to central?-->
                 <serverId>ossrh</serverId>
+                <!--TODO: change this URL?-->
                 <nexusUrl>https://s01.oss.sonatype.org</nexusUrl>
                 <autoReleaseAfterClose>true</autoReleaseAfterClose>
             </configuration>
Lines changed: 61 additions & 0 deletions

@@ -0,0 +1,61 @@
package io.gdcc.spi.export;

/**
 * Provides an optional mechanism for defining various data retrieval options
 * for the export subsystem, in a way that should allow us to add support for
 * more options going forward with minimal or no changes to the already
 * implemented export plugins.
 *
 * @author landreev
 */
public class ExportDataContext {
    private boolean datasetMetadataOnly = false;
    private boolean publicFilesOnly = false;
    private Integer offset = null;
    private Integer length = null;

    private ExportDataContext() {
    }

    public static ExportDataContext context() {
        return new ExportDataContext();
    }

    public ExportDataContext withDatasetMetadataOnly() {
        this.datasetMetadataOnly = true;
        return this;
    }

    public ExportDataContext withPublicFilesOnly() {
        this.publicFilesOnly = true;
        return this;
    }

    public ExportDataContext withOffset(Integer offset) {
        this.offset = offset;
        return this;
    }

    public ExportDataContext withLength(Integer length) {
        this.length = length;
        return this;
    }

    public boolean isDatasetMetadataOnly() {
        return datasetMetadataOnly;
    }

    public boolean isPublicFilesOnly() {
        return publicFilesOnly;
    }

    public Integer getOffset() {
        return offset;
    }

    public Integer getLength() {
        return length;
    }
}
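
For context, a hypothetical caller could configure the context via the fluent builder like this; the option values are illustrative, and the exact semantics of each option are documented in the Metadata Export guide linked above:

    // Request only public files, paged 100 at a time.
    ExportDataContext ctx = ExportDataContext.context()
            .withPublicFilesOnly() // exclude restricted files from the export
            .withOffset(0)         // start with the first file
            .withLength(100);      // include at most 100 files

    // An export plugin can then branch on the requested options:
    if (ctx.isPublicFilesOnly()) {
        // ... include only public file metadata in the exported record
    }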
