Skip to content

Commit 8cb18f2

Browse files
authored
Merge pull request #238 from NASA-PDS/i237
Add JSON extraction for citation author and editor metadata
2 parents 5e6afb3 + 0395a03 commit 8cb18f2

File tree

9 files changed

+1158
-1166
lines changed

9 files changed

+1158
-1166
lines changed

harvest-legacy/src/main/java/gov/nasa/pds/harvest/search/crawler/metadata/extractor/Pds4MetExtractor.java

Lines changed: 16 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -190,12 +190,24 @@ protected List<Slot> extractMetadata(List<XPath> xPaths)
190190
name = node.getDisplayName();
191191
}
192192
}
193-
List<String> values = extractor.getValuesFromDoc(xpath.getValue());
193+
194+
// Check if this should be extracted as JSON (slot name ends with "_json")
195+
List<String> values;
196+
if (name != null && name.endsWith("_json")) {
197+
// Extract as JSON
198+
values = extractor.getValuesAsJsonFromDoc(xpath.getValue());
199+
} else {
200+
// Extract as regular text values
201+
values = extractor.getValuesFromDoc(xpath.getValue());
202+
}
203+
194204
if (values != null && (!values.isEmpty())) {
195205
Slot slot = new Slot(name, values);
196-
String unit = node.getAttributeValue("", Constants.UNIT);
197-
if (unit != null) {
198-
slot.setSlotType(unit);
206+
if (node != null) {
207+
String unit = node.getAttributeValue("", Constants.UNIT);
208+
if (unit != null) {
209+
slot.setSlotType(unit);
210+
}
199211
}
200212
slots.add(slot);
201213
}

harvest-legacy/src/main/java/gov/nasa/pds/harvest/search/util/XMLExtractor.java

Lines changed: 87 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,10 @@
11
package gov.nasa.pds.harvest.search.util;
22

33
import java.io.File;
4+
import java.io.StringWriter;
45
import java.util.ArrayList;
56
import java.util.List;
7+
import java.util.logging.Logger;
68

79
import javax.xml.transform.sax.SAXSource;
810
import javax.xml.xpath.XPathConstants;
@@ -16,12 +18,17 @@
1618
import net.sf.saxon.trans.XPathException;
1719
import net.sf.saxon.xpath.XPathEvaluator;
1820

21+
import org.json.JSONObject;
22+
import org.json.XML;
1923
import org.xml.sax.InputSource;
2024

2125
/**
2226
* Class to extract data from an XML file.
2327
*/
2428
public class XMLExtractor {
29+
/** Logger instance */
30+
private static final Logger log = Logger.getLogger(XMLExtractor.class.getName());
31+
2532
/** The DOM source. */
2633
private DocumentInfo xml = null;
2734

@@ -286,4 +293,84 @@ public List<String> getAttributeValuesFromItem(String expression, Object item)
286293
}
287294
return vals;
288295
}
296+
297+
/**
298+
* Gets the values of the given expression as JSON strings.
299+
* Each matching node is converted to a JSON object and returned as a string.
300+
*
301+
* @param expression An XPath expression.
302+
*
303+
* @return A list of JSON strings, one for each matching node.
304+
*
305+
* @throws XPathExpressionException If the given expression was malformed.
306+
*/
307+
public List<String> getValuesAsJsonFromDoc(String expression)
308+
throws XPathExpressionException {
309+
return getValuesAsJsonFromItem(expression, xml);
310+
}
311+
312+
/**
313+
* Gets the values of the given expression as JSON strings.
314+
* Each matching node is converted to a JSON object and returned as a string.
315+
*
316+
* @param expression An XPath expression.
317+
* @param item The starting point from which to evaluate the XPath expression.
318+
*
319+
* @return A list of JSON strings, one for each matching node.
320+
*
321+
* @throws XPathExpressionException If the given expression was malformed.
322+
*/
323+
public List<String> getValuesAsJsonFromItem(String expression, Object item)
324+
throws XPathExpressionException {
325+
List<String> jsonStrings = new ArrayList<String>();
326+
List<TinyElementImpl> nList = (List<TinyElementImpl>) xpath.evaluate(
327+
expression, item, XPathConstants.NODESET);
328+
329+
if (nList != null) {
330+
for (int i = 0, sz = nList.size(); i < sz; i++) {
331+
TinyElementImpl node = nList.get(i);
332+
try {
333+
// Convert the node to an XML string
334+
String xmlString = nodeToString(node);
335+
336+
// Convert XML to JSON using org.json library
337+
JSONObject jsonObject = XML.toJSONObject(xmlString);
338+
339+
// Add the JSON string to the result list
340+
jsonStrings.add(jsonObject.toString());
341+
} catch (RuntimeException e) {
342+
// Let RuntimeExceptions (programming errors) propagate
343+
throw e;
344+
} catch (Exception e) {
345+
// If conversion fails, log and skip this node
346+
log.warning("Failed to convert XML node to JSON: " + e.getMessage());
347+
}
348+
}
349+
}
350+
return jsonStrings;
351+
}
352+
353+
/**
354+
* Converts a TinyElementImpl node to an XML string.
355+
*
356+
* @param node The node to convert.
357+
*
358+
* @return An XML string representation of the node.
359+
*
360+
* @throws Exception If conversion fails.
361+
*/
362+
private String nodeToString(TinyElementImpl node) throws Exception {
363+
// Use Saxon's built-in serialization
364+
net.sf.saxon.s9api.Processor processor = new net.sf.saxon.s9api.Processor(false);
365+
net.sf.saxon.s9api.Serializer serializer = processor.newSerializer();
366+
367+
StringWriter writer = new StringWriter();
368+
serializer.setOutputWriter(writer);
369+
serializer.setOutputProperty(net.sf.saxon.s9api.Serializer.Property.OMIT_XML_DECLARATION, "yes");
370+
serializer.setOutputProperty(net.sf.saxon.s9api.Serializer.Property.INDENT, "no");
371+
372+
serializer.serializeNode(new net.sf.saxon.s9api.XdmNode(node));
373+
374+
return writer.toString();
375+
}
289376
}

harvest-legacy/src/main/resources/conf/search/defaults/pds/pds4/bundle.xml

Lines changed: 9 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -75,65 +75,18 @@
7575
<field name="citation_editor_list" type="string">
7676
<registryPath>citation_editor_list</registryPath>
7777
</field>
78-
<field name="citation_author_organization_name" type="string">
79-
<registryPath>citation_author_organization_name</registryPath>
78+
<!-- JSON fields for authors and editors to preserve nested structure -->
79+
<field name="citation_author_person_json" type="string">
80+
<registryPath>citation_author_person_json</registryPath>
8081
</field>
81-
<field name="citation_author_organization_rorid" type="string">
82-
<registryPath>citation_author_organization_rorid</registryPath>
82+
<field name="citation_author_organization_json" type="string">
83+
<registryPath>citation_author_organization_json</registryPath>
8384
</field>
84-
<field name="citation_author_organization_contributor_type" type="string">
85-
<registryPath>citation_author_organization_contributor_type</registryPath>
85+
<field name="citation_editor_person_json" type="string">
86+
<registryPath>citation_editor_person_json</registryPath>
8687
</field>
87-
<field name="citation_author_person_contributor_type" type="string">
88-
<registryPath>citation_author_person_contributor_type</registryPath>
89-
</field>
90-
<field name="citation_author_person_display_full_name" type="string">
91-
<registryPath>citation_author_person_display_full_name</registryPath>
92-
</field>
93-
<field name="citation_author_person_given_name" type="string">
94-
<registryPath>citation_author_person_given_name</registryPath>
95-
</field>
96-
<field name="citation_author_person_family_name" type="string">
97-
<registryPath>citation_author_person_family_name</registryPath>
98-
</field>
99-
<field name="citation_author_person_orcid" type="string">
100-
<registryPath>citation_author_person_orcid</registryPath>
101-
</field>
102-
<field name="citation_author_person_affiliation_organization_name" type="string">
103-
<registryPath>citation_author_person_affiliation_organization_name</registryPath>
104-
</field>
105-
<field name="citation_author_person_affiliation_organization_rorid" type="string">
106-
<registryPath>citation_author_person_affiliation_organization_rorid</registryPath>
107-
</field>
108-
<field name="citation_editor_organization_name" type="string">
109-
<registryPath>citation_editor_organization_name</registryPath>
110-
</field>
111-
<field name="citation_editor_organization_rorid" type="string">
112-
<registryPath>citation_editor_organization_rorid</registryPath>
113-
</field>
114-
<field name="citation_editor_organization_contributor_type" type="string">
115-
<registryPath>citation_editor_organization_contributor_type</registryPath>
116-
</field>
117-
<field name="citation_editor_person_contributor_type" type="string">
118-
<registryPath>citation_editor_person_contributor_type</registryPath>
119-
</field>
120-
<field name="citation_editor_person_display_full_name" type="string">
121-
<registryPath>citation_editor_person_display_full_name</registryPath>
122-
</field>
123-
<field name="citation_editor_person_given_name" type="string">
124-
<registryPath>citation_editor_person_given_name</registryPath>
125-
</field>
126-
<field name="citation_editor_person_family_name" type="string">
127-
<registryPath>citation_editor_person_family_name</registryPath>
128-
</field>
129-
<field name="citation_editor_person_orcid" type="string">
130-
<registryPath>citation_editor_person_orcid</registryPath>
131-
</field>
132-
<field name="citation_editor_person_affiliation_organization_name" type="string">
133-
<registryPath>citation_editor_person_affiliation_organization_name</registryPath>
134-
</field>
135-
<field name="citation_editor_person_affiliation_organization_rorid" type="string">
136-
<registryPath>citation_editor_person_affiliation_organization_rorid</registryPath>
88+
<field name="citation_editor_organization_json" type="string">
89+
<registryPath>citation_editor_organization_json</registryPath>
13790
</field>
13891
<field name="citation_publication_year" type="string">
13992
<registryPath>citation_publication_year</registryPath>

0 commit comments

Comments
 (0)