Skip to content

Commit fd457de

Browse files
authored
Merge pull request #361 from metafacture/350-subfields
Support HTML attribute values as subfields
2 parents 25f4bb1 + 5786446 commit fd457de

File tree

2 files changed

+97
-4
lines changed

2 files changed

+97
-4
lines changed

metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java

Lines changed: 60 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2020 Fabian Steeg, hbz
2+
* Copyright 2020, 2021 Fabian Steeg, hbz
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -17,6 +17,11 @@
1717

1818
import java.io.IOException;
1919
import java.io.Reader;
20+
import java.io.UnsupportedEncodingException;
21+
import java.net.URLDecoder;
22+
import java.nio.charset.StandardCharsets;
23+
import java.util.HashMap;
24+
import java.util.Map;
2025
import java.util.UUID;
2126

2227
import org.apache.commons.io.IOUtils;
@@ -38,12 +43,28 @@
3843
* @author Fabian Steeg (fsteeg)
3944
*
4045
*/
41-
@Description("Decode HTML to metadata events")
46+
@Description("Decode HTML to metadata events. The attrValsAsSubfields option can be used to override "
47+
+ "the default attribute values to be used as subfields (e.g. by default "
48+
+ "`link rel=\"canonical\" href=\"http://example.org\"` becomes `link.canonical`). "
49+
+ "It expects an HTTP-style query string specifying as key the attributes whose value should "
50+
+ "be used as a subfield, and as value the attribute whose value should be the subfield value, "
51+
+ "e.g. the default contains `link.rel=href`. To use the HTML element text as the value "
52+
+ "(instead of another attribute), omit the value of the query-string key-value pair, "
53+
+ "e.g. `title.lang`. To add to the defaults, instead of replacing them, start with an `&`, "
54+
+ "e.g. `&h3.class`")
4255
@In(Reader.class)
4356
@Out(StreamReceiver.class)
4457
@FluxCommand("decode-html")
4558
public class HtmlDecoder extends DefaultObjectPipe<Reader, StreamReceiver> {
4659

60+
private static final String DEFAULT_ATTR_VALS_AS_SUBFIELDS = //
61+
"meta.name=content&meta.property=content&link.rel=href&a.rel=href";
62+
private Map<String, String> attrValsAsSubfields;
63+
64+
/**
 * Creates a decoder whose attribute-value-to-subfield configuration is initialised
 * from {@code DEFAULT_ATTR_VALS_AS_SUBFIELDS}.
 */
public HtmlDecoder() {
    this.setAttrValsAsSubfields(DEFAULT_ATTR_VALS_AS_SUBFIELDS);
}
67+
4768
@Override
4869
public void process(final Reader reader) {
4970
try {
@@ -61,18 +82,54 @@ private void process(Element parent, StreamReceiver receiver) {
6182
for (Element element : parent.children()) {
6283
receiver.startEntity(element.nodeName());
6384
Attributes attributes = element.attributes();
85+
boolean addedValueAsSubfield = false;
6486
for (Attribute attribute : attributes) {
87+
addedValueAsSubfield = handleAttributeValuesAsSubfields(receiver, element, attributes, attribute);
6588
receiver.literal(attribute.getKey(), attribute.getValue());
6689
}
6790
if (element.children().isEmpty()) {
6891
String text = element.text().trim();
6992
String value = text.isEmpty() ? element.data() : text;
70-
if (!value.isEmpty()) {
93+
if (!value.isEmpty() && !addedValueAsSubfield) {
7194
receiver.literal("value", value);
7295
}
7396
}
7497
process(element, receiver);
7598
receiver.endEntity();
7699
}
77100
}
101+
102+
private boolean handleAttributeValuesAsSubfields(StreamReceiver receiver, Element element,
103+
Attributes attributes, Attribute attribute) {
104+
String fullFieldKey = element.nodeName() + "." + attribute.getKey();
105+
if (attrValsAsSubfields.containsKey(fullFieldKey)) {
106+
String configValue = attrValsAsSubfields.get(fullFieldKey);
107+
if (configValue.trim().isEmpty()) {
108+
receiver.literal(attribute.getValue(), element.text().trim());
109+
return true;
110+
} else {
111+
String value = attributes.get(configValue);
112+
receiver.literal(attribute.getValue(), value);
113+
}
114+
}
115+
return false;
116+
}
117+
118+
public void setAttrValsAsSubfields(String mapString) {
119+
this.attrValsAsSubfields = new HashMap<String, String>();
120+
String input = mapString.startsWith("&") ? DEFAULT_ATTR_VALS_AS_SUBFIELDS + mapString
121+
: mapString;
122+
for (String nameValuePair : input.split("&")) {
123+
String[] nameValue = nameValuePair.split("=");
124+
try {
125+
String utf8 = StandardCharsets.UTF_8.name();
126+
String key = URLDecoder.decode(nameValue[0], utf8);
127+
String val = nameValue.length > 1 ? URLDecoder.decode(nameValue[1], utf8) : "";
128+
attrValsAsSubfields.put(key, val);
129+
} catch (UnsupportedEncodingException e) {
130+
e.printStackTrace();
131+
}
132+
}
133+
}
134+
78135
}

metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2020 Fabian Steeg, hbz
2+
* Copyright 2020, 2021 Fabian Steeg, hbz
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
1717

1818
import static org.mockito.Mockito.inOrder;
1919
import static org.mockito.Mockito.times;
20+
import static org.mockito.Mockito.never;
2021

2122
import java.io.StringReader;
2223

@@ -97,4 +98,39 @@ public void htmlScriptElementData() {
9798
ordered.verify(receiver, times(4)).endEntity();
9899
}
99100

101+
@Test
102+
public void htmlAttributesAsSubfieldsDefault() {
103+
htmlDecoder.process(new StringReader("<meta name=\"language\" content=\"DE\"/>"));
104+
final InOrder ordered = inOrder(receiver);
105+
ordered.verify(receiver).startEntity("meta");
106+
ordered.verify(receiver).literal("language", "DE");
107+
ordered.verify(receiver).literal("name", "language");
108+
ordered.verify(receiver).literal("content", "DE");
109+
ordered.verify(receiver, times(4)).endEntity();
110+
}
111+
112+
@Test
113+
public void htmlAttributesAsSubfieldsCustom() {
114+
htmlDecoder.setAttrValsAsSubfields("mods:url.access");
115+
htmlDecoder.process(new StringReader("<mods:url access=\"preview\">file:///img.png</mods:url>"));
116+
final InOrder ordered = inOrder(receiver);
117+
ordered.verify(receiver).startEntity("mods:url");
118+
ordered.verify(receiver).literal("preview", "file:///img.png");
119+
ordered.verify(receiver, never()).literal("value", "file:///img.png");
120+
ordered.verify(receiver, times(3)).endEntity();
121+
}
122+
123+
@Test
124+
public void htmlAttributesAsSubfieldsDefaultPlusCustom() {
125+
htmlDecoder.setAttrValsAsSubfields("&mods:url.access");
126+
htmlDecoder.process(new StringReader("<meta name=\"language\" content=\"DE\"/>"
127+
+ "<mods:url access=\"preview\">file:///img.png</mods:url>"));
128+
final InOrder ordered = inOrder(receiver);
129+
ordered.verify(receiver).startEntity("meta");
130+
ordered.verify(receiver).literal("language", "DE");
131+
ordered.verify(receiver).startEntity("mods:url");
132+
ordered.verify(receiver).literal("preview", "file:///img.png");
133+
ordered.verify(receiver, never()).literal("value", "file:///img.png");
134+
ordered.verify(receiver, times(3)).endEntity();
135+
}
100136
}

0 commit comments

Comments
 (0)