diff --git a/doc/release-notes/11832-DataCite-scaling.md b/doc/release-notes/11832-DataCite-scaling.md new file mode 100644 index 00000000000..574ef05f94c --- /dev/null +++ b/doc/release-notes/11832-DataCite-scaling.md @@ -0,0 +1,14 @@ +This release adds functionality to retry calls to DataCite when their server is overloaded or Dataverse has hit their rate limit. + +It also introduces an option to only update DataCite metadata after checking to see if the current DataCite information is out of date. +(This adds a request to get information from DataCite before any potential write of new information which will be more efficient when +most DOIs have not changed but will result in an extra call to get info when a DOI has changed.) + +Both of these can help when DataCite is being used heavily, e.g. creating and publishing datasets with many datafiles and using file DOIs, +or doing bulk operations that involve DataCite with many datasets. + +### New Settings + +- dataverse.feature.only-update-datacite-when-needed + +The default is false - Dataverse will not check to see if DataCite's information is out of date before sending an update. diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index bbf0a0d2449..91dc63c36cb 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -537,6 +537,8 @@ dataverse.pid.*.datacite.username ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ dataverse.pid.*.datacite.password ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +dataverse.feature.only-update-datacite-when-needed +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PID Providers of type ``datacite`` require four additional parameters that define how the provider connects to DataCite. DataCite has two APIs that are used in Dataverse: @@ -552,6 +554,11 @@ for `Fabrica `_ and their APIs. 
You need to provide the same credentials (``username``, ``password``) to Dataverse software to mint and manage DOIs for you. As noted above, you should use one of the more secure options for setting the password. +The ``only-update-datacite-when-needed`` feature flag is a global option that causes Dataverse to GET the latest metadata from DataCite +for a DOI, compare it with the current metadata in Dataverse, and send a follow-up POST request only if needed. This potentially +substitutes a read for an unnecessary write at DataCite, but would result in extra reads when all metadata in Dataverse is new. +Setting the flag to "true" is recommended when using DataCite file DOIs. + CrossRef-specific Settings ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -3824,6 +3831,9 @@ please find all known feature flags below. Any of these flags can be activated u * - role-assignment-history - Turns on tracking/display of role assignments and revocations for collections, datasets, and files - ``Off`` + * - only-update-datacite-when-needed + - Only contact DataCite to update a DOI after checking to see if DataCite has outdated information (for efficiency, lighter load on DataCite, especially when using file DOIs). + - ``Off`` **Note:** Feature flags can be set via any `supported MicroProfile Config API source`_, e.g. the environment variable ``DATAVERSE_FEATURE_XXX`` (e.g. ``DATAVERSE_FEATURE_API_SESSION_AUTH=1``). These environment variables can be set in your shell before starting Payara. If you are using :doc:`Docker for development `, you can set them in the `docker compose `_ file. 
diff --git a/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DOIDataCiteRegisterService.java b/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DOIDataCiteRegisterService.java index a4d788de4df..d9ddfe04393 100644 --- a/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DOIDataCiteRegisterService.java +++ b/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DOIDataCiteRegisterService.java @@ -95,7 +95,14 @@ public String reRegisterIdentifier(String identifier, Map metada } retString = "metadata:\\r" + client.postMetadata(xmlMetadata) + "\\r"; } - if (!target.equals(client.getUrl(numericIdentifier))) { + String currentUrl = null; + try { + //May get a 204 if the DOI is still draft + currentUrl = client.getUrl(numericIdentifier); + } catch (RuntimeException ex) { + logger.fine("Error getting Url for " + numericIdentifier + ": " + ex.getMessage()); + } + if (!target.equals(currentUrl)) { logger.info("Updating target URL to " + target); client.postUrl(numericIdentifier, target); retString = retString + "url:\\r" + target; diff --git a/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DataCiteDOIProvider.java b/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DataCiteDOIProvider.java index dd64a89dfe6..3c21699d45e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DataCiteDOIProvider.java +++ b/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DataCiteDOIProvider.java @@ -15,6 +15,7 @@ import edu.harvard.iq.dataverse.FileMetadata; import edu.harvard.iq.dataverse.GlobalId; import edu.harvard.iq.dataverse.pidproviders.doi.AbstractDOIProvider; +import edu.harvard.iq.dataverse.settings.FeatureFlags; import edu.harvard.iq.dataverse.util.json.JsonUtil; import jakarta.json.JsonObject; @@ -217,7 +218,11 @@ public boolean publicizeIdentifier(DvObject dvObject) { metadata.put("datacite.publicationyear", generateYear(dvObject)); 
metadata.put("_target", getTargetUrl(dvObject)); try { - doiDataCiteRegisterService.registerIdentifier(identifier, metadata, dvObject); + if (FeatureFlags.ONLY_UPDATE_DATACITE_WHEN_NEEDED.enabled()) { + doiDataCiteRegisterService.reRegisterIdentifier(identifier, metadata, dvObject); + } else { + doiDataCiteRegisterService.registerIdentifier(identifier, metadata, dvObject); + } return true; } catch (Exception e) { logger.log(Level.WARNING, "modifyMetadata failed: " + e.getMessage(), e); diff --git a/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DataCiteRESTfullClient.java b/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DataCiteRESTfullClient.java index 465b10ee407..47394f0ad54 100644 --- a/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DataCiteRESTfullClient.java +++ b/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DataCiteRESTfullClient.java @@ -41,6 +41,10 @@ public class DataCiteRESTfullClient implements Closeable { private static final Logger logger = Logger.getLogger(DataCiteRESTfullClient.class.getCanonicalName()); + // Constants for retry mechanism + private static final int MAX_RETRIES = 5; + private static final long RETRY_DELAY_MS = 10000; // 10 seconds + private String url; private CloseableHttpClient httpClient; private HttpClientContext context; @@ -59,11 +63,78 @@ public DataCiteRESTfullClient(String url, String username, String password) { public void close() { if (this.httpClient != null) { try { - httpClient.close(); + httpClient.close(); } catch (IOException io) { - logger.warning("IOException closing hhtpClient: " + io.getMessage()); - } + logger.warning("IOException closing httpClient: " + io.getMessage()); + } + } + } + + /** + * Execute HTTP request with retry mechanism for specific status codes + * + * @param request The HTTP request to execute + * @param operationName Name of the operation for logging + * @return HttpResponse The response from the server + * @throws 
IOException If an error occurs during the request + */ + private HttpResponse executeWithRetry(org.apache.http.client.methods.HttpRequestBase request, String operationName) throws IOException { + int attempts = 0; + IOException lastException = null; + + while (attempts < MAX_RETRIES) { + try { + HttpResponse response = httpClient.execute(request, context); + int statusCode = response.getStatusLine().getStatusCode(); + + // If we get a retry status code, try again after delay + if (statusCode == 429 || statusCode == 503 || statusCode == 504) { + EntityUtils.consumeQuietly(response.getEntity()); + attempts++; + + if (attempts < MAX_RETRIES) { + logger.warning("DataCite API returned status " + statusCode + + " for " + operationName + ". Retrying in " + + (RETRY_DELAY_MS / 1000) + " seconds (attempt " + attempts + " of " + MAX_RETRIES + ")"); + try { + Thread.sleep(RETRY_DELAY_MS); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + throw new IOException("Retry interrupted", ie); + } + } else { + logger.severe("DataCite API failed with status " + statusCode + + " for " + operationName + " after " + MAX_RETRIES + " attempts"); + return response; // Return the last failed response + } + } else { + // Success or non-retry error code + return response; + } + } catch (IOException ioe) { + lastException = ioe; + attempts++; + + if (attempts < MAX_RETRIES) { + logger.warning("IOException during " + operationName + ": " + ioe.getMessage() + + ". 
Retrying in " + (RETRY_DELAY_MS / 1000) + " seconds (attempt " + + attempts + " of " + MAX_RETRIES + ")"); + try { + Thread.sleep(RETRY_DELAY_MS); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + throw new IOException("Retry interrupted", ie); + } + } else { + logger.severe("DataCite API failed for " + operationName + " after " + + MAX_RETRIES + " attempts due to: " + ioe.getMessage()); + throw lastException; + } + } } + + // This should never happen, but just in case + throw new IOException("Failed to execute request after " + MAX_RETRIES + " attempts"); } /** @@ -75,7 +146,7 @@ public void close() { public String getUrl(String doi) { HttpGet httpGet = new HttpGet(this.url + "/doi/" + doi); try { - HttpResponse response = httpClient.execute(httpGet,context); + HttpResponse response = executeWithRetry(httpGet, "getUrl"); HttpEntity entity = response.getEntity(); String data = null; @@ -104,7 +175,7 @@ public String postUrl(String doi, String url) throws IOException { httpPost.setHeader("Content-Type", "text/plain;charset=UTF-8"); httpPost.setEntity(new StringEntity("doi=" + doi + "\nurl=" + url, "utf-8")); - HttpResponse response = httpClient.execute(httpPost, context); + HttpResponse response = executeWithRetry(httpPost, "postUrl"); String data = EntityUtils.toString(response.getEntity(), encoding); if (response.getStatusLine().getStatusCode() != 201) { String errMsg = "Response from postUrl: " + response.getStatusLine().getStatusCode() + ", " + data; @@ -124,7 +195,7 @@ public String getMetadata(String doi) { HttpGet httpGet = new HttpGet(this.url + "/metadata/" + doi); httpGet.setHeader("Accept", "application/xml"); try { - HttpResponse response = httpClient.execute(httpGet,context); + HttpResponse response = executeWithRetry(httpGet, "getMetadata"); String data = EntityUtils.toString(response.getEntity(), encoding); if (response.getStatusLine().getStatusCode() != 200) { String errMsg = "Response from getMetadata: " + 
response.getStatusLine().getStatusCode() + ", " + data; @@ -133,7 +204,7 @@ public String getMetadata(String doi) { } return data; } catch (IOException ioe) { - logger.log(Level.SEVERE, "IOException when get metadata"); + logger.log(Level.SEVERE, "IOException when get metadata", ioe); throw new RuntimeException("IOException when get metadata", ioe); } } @@ -147,7 +218,7 @@ public String getMetadata(String doi) { public boolean testDOIExists(String doi) throws IOException { HttpGet httpGet = new HttpGet(this.url + "/metadata/" + doi); httpGet.setHeader("Accept", "application/xml"); - HttpResponse response = httpClient.execute(httpGet, context); + HttpResponse response = executeWithRetry(httpGet, "testDOIExists"); if (response.getStatusLine().getStatusCode() != 200) { EntityUtils.consumeQuietly(response.getEntity()); return false; @@ -166,7 +237,7 @@ public String postMetadata(String metadata) throws IOException { HttpPost httpPost = new HttpPost(this.url + "/metadata"); httpPost.setHeader("Content-Type", "application/xml;charset=UTF-8"); httpPost.setEntity(new StringEntity(metadata, "utf-8")); - HttpResponse response = httpClient.execute(httpPost, context); + HttpResponse response = executeWithRetry(httpPost, "postMetadata"); String data = EntityUtils.toString(response.getEntity(), encoding); if (response.getStatusLine().getStatusCode() != 201) { String errMsg = "Response from postMetadata: " + response.getStatusLine().getStatusCode() + ", " + data; @@ -185,7 +256,7 @@ public String postMetadata(String metadata) throws IOException { public String inactiveDataset(String doi) { HttpDelete httpDelete = new HttpDelete(this.url + "/metadata/" + doi); try { - HttpResponse response = httpClient.execute(httpDelete,context); + HttpResponse response = executeWithRetry(httpDelete, "inactiveDataset"); String data = EntityUtils.toString(response.getEntity(), encoding); if (response.getStatusLine().getStatusCode() != 200) { String errMsg = "Response code: " + 
response.getStatusLine().getStatusCode() + ", " + data; @@ -194,7 +265,7 @@ public String inactiveDataset(String doi) { } return data; } catch (IOException ioe) { - logger.log(Level.SEVERE, "IOException when inactive dataset"); + logger.log(Level.SEVERE, "IOException when inactive dataset", ioe); throw new RuntimeException("IOException when inactive dataset", ioe); } } diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java b/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java index 6f513e30dec..2e86fae610e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java @@ -235,6 +235,21 @@ public enum FeatureFlags { * or revoked, at what times, and by whom. */ ROLE_ASSIGNMENT_HISTORY("role-assignment-history"), + + /** + * Only update a DataCite DOI when needed (for efficiency, lighter load on DataCite). + * This flag causes Dataverse to GET the latest metadata from DataCite for a DOI, + * compare it with the current metadata in Dataverse, and send a follow-up POST + * request only if needed. This potentially substitutes a read for an unnecessary write at DataCite, + * but would result in extra reads when all metadata in Dataverse is new. Setting the flag + * to "true" is recommended when using DataCite file DOIs. + * + * @apiNote Raise flag by setting + * "dataverse.feature.only-update-datacite-when-needed" + * @since Dataverse 6.9 + */ + ONLY_UPDATE_DATACITE_WHEN_NEEDED("only-update-datacite-when-needed"), + ; final String flag;