Skip to content

Commit 252f9f5

Browse files
committed
Support HTML attribute values as subfields
See #350
1 parent 28ded4e commit 252f9f5

File tree

2 files changed

+87
-3
lines changed

2 files changed

+87
-3
lines changed

metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2020 Fabian Steeg, hbz
2+
* Copyright 2020, 2021 Fabian Steeg, hbz
33
*
44
* Licensed under the Apache License, Version 2.0 the "License";
55
* you may not use this file except in compliance with the License.
@@ -17,6 +17,11 @@
1717

1818
import java.io.IOException;
1919
import java.io.Reader;
20+
import java.io.UnsupportedEncodingException;
21+
import java.net.URLDecoder;
22+
import java.nio.charset.StandardCharsets;
23+
import java.util.HashMap;
24+
import java.util.Map;
2025
import java.util.UUID;
2126

2227
import org.apache.commons.io.IOUtils;
@@ -38,12 +43,28 @@
3843
* @author Fabian Steeg (fsteeg)
3944
*
4045
*/
41-
@Description("Decode HTML to metadata events")
46+
@Description("Decode HTML to metadata events. The attrValsAsSubfields option can be used to override "
47+
+ "the default attribute values to be used as subfields (e.g. by default "
48+
+ "`link rel=\"canonical\" href=\"http://example.org\"` becomes `link.canonical`). "
49+
+ "It expects an HTTP-style query string specifying as key the attributes whose value should "
50+
+ "be used as a subfield, and as value the attribute whose value should be the subfield value, "
51+
+ "e.g. the default contains `link.rel=href`. To use the HTML element text as the value "
52+
+ "(instead of another attribute), omit the value of the query-string key-value pair, "
53+
+ "e.g. `title.lang`. To add to the defaults, instead of replacing them, start with an `&`, "
54+
+ "e.g. `&h3.class`")
4255
@In(Reader.class)
4356
@Out(StreamReceiver.class)
4457
@FluxCommand("decode-html")
4558
public class HtmlDecoder extends DefaultObjectPipe<Reader, StreamReceiver> {
4659

60+
private static final String DEFAULT_ATTR_VALS_AS_SUBFIELDS = //
61+
"meta.name=content&meta.property=content&link.rel=href&a.rel=href";
62+
private Map<String, String> attrValsAsSubfields;
63+
64+
/**
 * Creates a decoder that is initially configured with the default
 * attribute-values-as-subfields mapping.
 */
public HtmlDecoder() {
    this.setAttrValsAsSubfields(DEFAULT_ATTR_VALS_AS_SUBFIELDS);
}
67+
4768
@Override
4869
public void process(final Reader reader) {
4970
try {
@@ -62,6 +83,7 @@ private void process(Element parent, StreamReceiver receiver) {
6283
receiver.startEntity(element.nodeName());
6384
Attributes attributes = element.attributes();
6485
for (Attribute attribute : attributes) {
86+
handleAttributeValuesAsSubfields(receiver, element, attributes, attribute);
6587
receiver.literal(attribute.getKey(), attribute.getValue());
6688
}
6789
if (element.children().isEmpty()) {
@@ -75,4 +97,35 @@ private void process(Element parent, StreamReceiver receiver) {
7597
receiver.endEntity();
7698
}
7799
}
100+
101+
private void handleAttributeValuesAsSubfields(StreamReceiver receiver, Element element,
102+
Attributes attributes, Attribute attribute) {
103+
String fullFieldKey = element.nodeName() + "." + attribute.getKey();
104+
if (attrValsAsSubfields.containsKey(fullFieldKey)) {
105+
String configValue = attrValsAsSubfields.get(fullFieldKey);
106+
if (configValue.trim().isEmpty()) {
107+
receiver.literal(attribute.getValue(), element.text().trim());
108+
} else {
109+
String value = attributes.get(configValue);
110+
receiver.literal(attribute.getValue(), value);
111+
}
112+
}
113+
}
114+
115+
public void setAttrValsAsSubfields(String mapString) {
116+
this.attrValsAsSubfields = new HashMap<String, String>();
117+
String input = mapString.startsWith("&") ? DEFAULT_ATTR_VALS_AS_SUBFIELDS + mapString
118+
: mapString;
119+
for (String nameValuePair : input.split("&")) {
120+
String[] nameValue = nameValuePair.split("=");
121+
try {
122+
String utf8 = StandardCharsets.UTF_8.name();
123+
String key = URLDecoder.decode(nameValue[0], utf8);
124+
String val = nameValue.length > 1 ? URLDecoder.decode(nameValue[1], utf8) : "";
125+
attrValsAsSubfields.put(key, val);
126+
} catch (UnsupportedEncodingException e) {
127+
e.printStackTrace();
128+
}
129+
}
130+
}
78131
}

metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2020 Fabian Steeg, hbz
2+
* Copyright 2020, 2021 Fabian Steeg, hbz
33
*
44
* Licensed under the Apache License, Version 2.0 the "License";
55
* you may not use this file except in compliance with the License.
@@ -97,4 +97,35 @@ public void htmlScriptElementData() {
9797
ordered.verify(receiver, times(4)).endEntity();
9898
}
9999

100+
@Test
101+
public void htmlAttributesAsSubfieldsDefault() {
102+
htmlDecoder.process(new StringReader("<meta name=\"language\" content=\"DE\"/>"));
103+
final InOrder ordered = inOrder(receiver);
104+
ordered.verify(receiver).startEntity("meta");
105+
ordered.verify(receiver).literal("language", "DE");
106+
ordered.verify(receiver, times(4)).endEntity();
107+
}
108+
109+
@Test
110+
public void htmlAttributesAsSubfieldsCustom() {
111+
htmlDecoder.setAttrValsAsSubfields("mods:url.access");
112+
htmlDecoder.process(new StringReader("<mods:url access=\"preview\">file:///img.png</mods:url>"));
113+
final InOrder ordered = inOrder(receiver);
114+
ordered.verify(receiver).startEntity("mods:url");
115+
ordered.verify(receiver).literal("preview", "file:///img.png");
116+
ordered.verify(receiver, times(3)).endEntity();
117+
}
118+
119+
@Test
120+
public void htmlAttributesAsSubfieldsDefaultPlusCustom() {
121+
htmlDecoder.setAttrValsAsSubfields("&mods:url.access");
122+
htmlDecoder.process(new StringReader("<meta name=\"language\" content=\"DE\"/>"
123+
+ "<mods:url access=\"preview\">file:///img.png</mods:url>"));
124+
final InOrder ordered = inOrder(receiver);
125+
ordered.verify(receiver).startEntity("meta");
126+
ordered.verify(receiver).literal("language", "DE");
127+
ordered.verify(receiver).startEntity("mods:url");
128+
ordered.verify(receiver).literal("preview", "file:///img.png");
129+
ordered.verify(receiver, times(3)).endEntity();
130+
}
100131
}

0 commit comments

Comments
 (0)