Skip to content

Commit f2ae2e9

Browse files
committed
Add HtmlDecoder and tests
With `decode-html` flux command. See #312.
1 parent 4cc023d commit f2ae2e9

File tree

3 files changed

+159
-0
lines changed

3 files changed

+159
-0
lines changed
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/*
2+
* Copyright 2020 Fabian Steeg, hbz
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.metafacture.html;
17+
18+
import java.io.IOException;
19+
import java.io.Reader;
20+
21+
import org.apache.commons.io.IOUtils;
22+
import org.jsoup.Jsoup;
23+
import org.jsoup.nodes.Attribute;
24+
import org.jsoup.nodes.Attributes;
25+
import org.jsoup.nodes.Element;
26+
import org.metafacture.framework.FluxCommand;
27+
import org.metafacture.framework.StreamReceiver;
28+
import org.metafacture.framework.annotations.Description;
29+
import org.metafacture.framework.annotations.In;
30+
import org.metafacture.framework.annotations.Out;
31+
import org.metafacture.framework.helpers.DefaultObjectPipe;
32+
33+
/**
34+
* Decode HTML to metadata events. Each input document represents one record.
35+
*
36+
* @author Fabian Steeg (fsteeg)
37+
*
38+
*/
39+
@Description("Decode HTML to metadata events")
40+
@In(Reader.class)
41+
@Out(StreamReceiver.class)
42+
@FluxCommand("decode-html")
43+
public class HtmlDecoder extends DefaultObjectPipe<Reader, StreamReceiver> {
44+
45+
@Override
46+
public void process(final Reader reader) {
47+
try {
48+
StreamReceiver receiver = getReceiver();
49+
receiver.startRecord(null);
50+
String html = IOUtils.toString(reader);
51+
for (Element element : Jsoup.parse(html).getAllElements()) {
52+
receiver.startEntity(element.nodeName());
53+
Attributes attributes = element.attributes();
54+
for (Attribute attribute : attributes) {
55+
receiver.literal(attribute.getKey(), attribute.getValue());
56+
}
57+
String text = element.text().trim();
58+
receiver.literal("value", text.isEmpty() ? element.data() : text);
59+
receiver.endEntity();
60+
}
61+
receiver.endRecord();
62+
} catch (IOException e) {
63+
e.printStackTrace();
64+
}
65+
}
66+
}

metafacture-html/src/main/resources/flux-commands.properties

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@
1414
# limitations under the License.
1515
#
1616
html-to-xml org.metafacture.html.HtmlReader
17+
decode-html org.metafacture.html.HtmlDecoder
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
/*
2+
* Copyright 2020 Fabian Steeg, hbz
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.metafacture.html;
17+
18+
import static org.mockito.Mockito.inOrder;
19+
20+
import java.io.StringReader;
21+
22+
import org.junit.Before;
23+
import org.junit.Test;
24+
import org.metafacture.framework.StreamReceiver;
25+
import org.mockito.InOrder;
26+
import org.mockito.Mock;
27+
import org.mockito.MockitoAnnotations;
28+
29+
/**
30+
* Tests for class {@link HtmlDecoder}.
31+
*
32+
* @author Fabian Steeg
33+
*
34+
*/
35+
public final class HtmlDecoderTest {
36+
37+
@Mock
38+
private StreamReceiver receiver;
39+
40+
private HtmlDecoder htmlDecoder;
41+
42+
@Before
43+
public void setup() {
44+
MockitoAnnotations.initMocks(this);
45+
htmlDecoder = new HtmlDecoder();
46+
htmlDecoder.setReceiver(receiver);
47+
}
48+
49+
@Test
50+
public void htmlElementsAsEntities() {
51+
htmlDecoder.process(new StringReader("<h1>Header</h1><p>Paragraph</p>"));
52+
final InOrder ordered = inOrder(receiver);
53+
ordered.verify(receiver).startEntity("h1");
54+
ordered.verify(receiver).literal("value", "Header");
55+
ordered.verify(receiver).endEntity();
56+
ordered.verify(receiver).startEntity("p");
57+
ordered.verify(receiver).literal("value", "Paragraph");
58+
ordered.verify(receiver).endEntity();
59+
}
60+
61+
@Test
62+
public void nestedEntities() {
63+
htmlDecoder.process(new StringReader("<ul><li>Item</li><ul>"));
64+
final InOrder ordered = inOrder(receiver);
65+
ordered.verify(receiver).startEntity("ul");
66+
ordered.verify(receiver).startEntity("li");
67+
ordered.verify(receiver).literal("value", "Item");
68+
ordered.verify(receiver).endEntity();
69+
ordered.verify(receiver).endEntity();
70+
}
71+
72+
@Test
73+
public void htmlAttributesAsLiterals() {
74+
htmlDecoder.process(new StringReader("<p class=lead>Text"));
75+
final InOrder ordered = inOrder(receiver);
76+
ordered.verify(receiver).startEntity("p");
77+
ordered.verify(receiver).literal("class", "lead");
78+
ordered.verify(receiver).literal("value", "Text");
79+
ordered.verify(receiver).endEntity();
80+
}
81+
82+
@Test
83+
public void htmlScriptElementData() {
84+
htmlDecoder.process(new StringReader("<script type=application/ld+json>{\"id\":\"theId\"}</script>"));
85+
final InOrder ordered = inOrder(receiver);
86+
ordered.verify(receiver).startEntity("script");
87+
ordered.verify(receiver).literal("type", "application/ld+json");
88+
ordered.verify(receiver).literal("value", "{\"id\":\"theId\"}");
89+
ordered.verify(receiver).endEntity();
90+
}
91+
92+
}

0 commit comments

Comments
 (0)