Skip to content
This repository was archived by the owner on Oct 6, 2023. It is now read-only.

Commit b75ccbc

Browse files
author
sowerstl
committed
Normalize DOI values before storage; (DOECODE-591)
1 parent b6466c3 commit b75ccbc

File tree

1 file changed

+44
-9
lines changed

1 file changed

+44
-9
lines changed

src/main/java/gov/osti/services/Metadata.java

Lines changed: 44 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,9 @@ public class Metadata {
168168
// get the project manager email address from the configuration
169169
private static String PM_EMAIL = DoeServletContextListener.getConfigurationProperty("project.manager.email");
170170

171+
// set pattern for DOI normalization
172+
// Captures a DOI at the end of the value: "10." + 4-9 digit registrant code + "/" + suffix.
// The dot after "10" is escaped so it matches a literal '.' rather than any character.
private static final Pattern DOI_TRIM_PATTERN = Pattern.compile("(10\\.\\d{4,9}\\/[-._;()\\/:A-Za-z0-9]+)$");
173+
171174
// create and start a ConnectorFactory for use by "autopopulate" service
172175
static {
173176
try {
@@ -2145,19 +2148,50 @@ private void sendToOsti(EntityManager em, DOECodeMetadata md) throws IOException
21452148
*
21462149
* @param md the Metadata to evaluate
21472150
*/
2148-
private void removeRiDups(DOECodeMetadata md) {
2149-
// remove RI duplicates
2151+
private void normalizeRelatedIdentifiers(DOECodeMetadata md) {
21502152
List<RelatedIdentifier> currentList = md.getRelatedIdentifiers();
21512153

2152-
if (currentList != null) {
2153-
Set<RelatedIdentifier> s = new HashSet<>();
2154+
// nothing to process
2155+
if (currentList == null || currentList.isEmpty())
2156+
return;
2157+
2158+
// trim DOI values
// only identifiers whose type is DOI are rewritten; other identifier types are left untouched
2159+
for(RelatedIdentifier ri : currentList)
2160+
if (RelatedIdentifier.Type.DOI.equals(ri.getIdentifierType()))
2161+
ri.setIdentifierValue(trimDoi(ri.getIdentifierValue()));
21542162

2163+
// remove RI duplicates
// done after trimming, so values that only differed before trimming can collapse into one entry
// NOTE(review): HashSet has no guaranteed iteration order, so the list order may change;
// de-duplication presumably relies on RelatedIdentifier equals/hashCode -- confirm
2164+
Set<RelatedIdentifier> s = new HashSet<>();
2165+
s.addAll(currentList);
2166+
currentList.clear();
2167+
currentList.addAll(s);
2168+
md.setRelatedIdentifiers(currentList);
2169+
}
21582170

2159-
md.setRelatedIdentifiers(currentList);
2171+
/**
2172+
* Trim away unneeded DOI prefixes, etc.
2173+
*
2174+
* @param doi the DOI to trim
* @return the part of the value captured by DOI_TRIM_PATTERN; the whitespace-trimmed
* value if the pattern does not match; or the input unchanged when it is blank
2175+
*/
2176+
private String trimDoi(String doi) {
2177+
// trim DOI down to 10.* variation
2178+
if (!StringUtils.isBlank(doi)) {
2179+
doi = doi.trim();
2180+
Matcher m = DOI_TRIM_PATTERN.matcher(doi);
// find() (not matches()) lets the end-anchored pattern skip any leading text before the DOI
2181+
if (m.find())
2182+
doi = m.group(1);
21602183
}
2184+
return doi;
2185+
}
2186+
2187+
/**
2188+
* Normalize any DOI information.
* Only the main DOI of the metadata is handled here: it is passed through
* trimDoi and the result is written back with setDoi.
2189+
*
2190+
* @param md the Metadata to evaluate
2191+
*/
2192+
private void normalizeDois(DOECodeMetadata md) {
2193+
// trim main DOI
2194+
md.setDoi(trimDoi(md.getDoi()));
21612195
}
21622196

21632197
/**
@@ -2166,7 +2200,8 @@ private void removeRiDups(DOECodeMetadata md) {
21662200
* @param md the Metadata to normalize
21672201
*/
21682202
private void performDataNormalization(DOECodeMetadata md) {
2169-
removeRiDups(md);
2203+
// normalize the main DOI first, then trim and de-duplicate the related identifiers
normalizeDois(md);
2204+
normalizeRelatedIdentifiers(md);
21702205
}
21712206

21722207
/**

0 commit comments

Comments
 (0)