Skip to content
This repository was archived by the owner on Oct 6, 2023. It is now read-only.

Commit c617275

Browse files
author
sowerstl
committed
Add URL normalization; (DOECODE-556/DOECODE-676)
1 parent b75ccbc commit c617275

File tree

1 file changed

+30
-5
lines changed

1 file changed

+30
-5
lines changed

src/main/java/gov/osti/services/Metadata.java

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ public class Metadata {
170170

171171
// set pattern for DOI normalization
172172
// Captures a trailing DOI of the form "10.<4-9 digit registrant>/<suffix>".
// The dot after "10" is escaped so that only a literal '.' is accepted; left
// unescaped it would match any character (e.g. "10X1234/abc").
private static final Pattern DOI_TRIM_PATTERN = Pattern.compile("(10\\.\\d{4,9}\\/[-._;()\\/:A-Za-z0-9]+)$");
173+
// Group 1 captures the whole value minus one optional trailing slash. The
// negative lookbehind requires that group 1 itself not end in '/', so a value
// ending in two or more slashes does not match at all.
private static final Pattern URL_TRIM_PATTERN = Pattern.compile("^(.*)(?<!\\/)\\/?$");
173174

174175
// create and start a ConnectorFactory for use by "autopopulate" service
175176
static {
@@ -2144,7 +2145,7 @@ private void sendToOsti(EntityManager em, DOECodeMetadata md) throws IOException
21442145
}
21452146

21462147
/**
2147-
* Remove duplicate RI entries from metadata.
2148+
* Remove duplicate RI entries and normalize values.
21482149
*
21492150
* @param md the Metadata to evaluate
21502151
*/
@@ -2155,10 +2156,12 @@ private void normalizeRelatedIdentifiers(DOECodeMetadata md) {
21552156
if (currentList == null || currentList.isEmpty())
21562157
return;
21572158

2158-
// trim DOI values
2159+
// trim DOI and URL values
21592160
for(RelatedIdentifier ri : currentList)
21602161
if (RelatedIdentifier.Type.DOI.equals(ri.getIdentifierType()))
21612162
ri.setIdentifierValue(trimDoi(ri.getIdentifierValue()));
2163+
else if (RelatedIdentifier.Type.URL.equals(ri.getIdentifierType()))
2164+
ri.setIdentifierValue(trimUrl(ri.getIdentifierValue()));
21622165

21632166
// remove RI duplicates
21642167
Set<RelatedIdentifier> s = new HashSet<>();
@@ -2185,13 +2188,35 @@ private String trimDoi(String doi) {
21852188
}
21862189

21872190
/**
2188-
* Normalize any DOI information.
2191+
* Trim surrounding whitespace and a single trailing slash from a URL.
2192+
*
2193+
* @param url the URL to trim
2194+
*/
2195+
private String trimUrl(String url) {
2196+
// remove extra spaces and single trailing slash, if exist
2197+
if (!StringUtils.isBlank(url)) {
2198+
url = url.trim();
2199+
Matcher m = URL_TRIM_PATTERN.matcher(url);
2200+
if (m.find())
2201+
url = m.group(1);
2202+
}
2203+
return url;
2204+
}
2205+
2206+
/**
2207+
* Normalize metadata information.
21892208
*
21902209
* @param md the Metadata to evaluate
21912210
*/
2192-
private void normalizeDois(DOECodeMetadata md) {
2211+
private void normalizeMetadata(DOECodeMetadata md) {
21932212
// trim main DOI
21942213
md.setDoi(trimDoi(md.getDoi()));
2214+
2215+
// trim main URLs
2216+
md.setRepositoryLink(trimUrl(md.getRepositoryLink()));
2217+
md.setLandingPage(trimUrl(md.getLandingPage()));
2218+
md.setProprietaryUrl(trimUrl(md.getProprietaryUrl()));
2219+
md.setDocumentationUrl(trimUrl(md.getDocumentationUrl()));
21952220
}
21962221

21972222
/**
@@ -2200,7 +2225,7 @@ private void normalizeDois(DOECodeMetadata md) {
22002225
* @param md the Metadata to normalize
22012226
*/
22022227
private void performDataNormalization(DOECodeMetadata md) {
2203-
normalizeDois(md);
2228+
normalizeMetadata(md);
22042229
normalizeRelatedIdentifiers(md);
22052230
}
22062231

0 commit comments

Comments
 (0)