Skip to content

Commit 6605128

Browse files
committed
Process document recursively
Set generated record ID, only process content of leaf nodes. See #312
1 parent f2ae2e9 commit 6605128

File tree

2 files changed

+39
-19
lines changed

2 files changed

+39
-19
lines changed

metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,13 @@
1717

1818
import java.io.IOException;
1919
import java.io.Reader;
20+
import java.util.UUID;
2021

2122
import org.apache.commons.io.IOUtils;
2223
import org.jsoup.Jsoup;
2324
import org.jsoup.nodes.Attribute;
2425
import org.jsoup.nodes.Attributes;
26+
import org.jsoup.nodes.Document;
2527
import org.jsoup.nodes.Element;
2628
import org.metafacture.framework.FluxCommand;
2729
import org.metafacture.framework.StreamReceiver;
@@ -46,21 +48,31 @@ public class HtmlDecoder extends DefaultObjectPipe<Reader, StreamReceiver> {
4648
public void process(final Reader reader) {
4749
try {
4850
StreamReceiver receiver = getReceiver();
49-
receiver.startRecord(null);
50-
String html = IOUtils.toString(reader);
51-
for (Element element : Jsoup.parse(html).getAllElements()) {
52-
receiver.startEntity(element.nodeName());
53-
Attributes attributes = element.attributes();
54-
for (Attribute attribute : attributes) {
55-
receiver.literal(attribute.getKey(), attribute.getValue());
56-
}
57-
String text = element.text().trim();
58-
receiver.literal("value", text.isEmpty() ? element.data() : text);
59-
receiver.endEntity();
60-
}
51+
receiver.startRecord(UUID.randomUUID().toString());
52+
Document document = Jsoup.parse(IOUtils.toString(reader));
53+
process(document, receiver);
6154
receiver.endRecord();
6255
} catch (IOException e) {
6356
e.printStackTrace();
6457
}
6558
}
59+
60+
private void process(Element parent, StreamReceiver receiver) {
61+
for (Element element : parent.children()) {
62+
receiver.startEntity(element.nodeName());
63+
Attributes attributes = element.attributes();
64+
for (Attribute attribute : attributes) {
65+
receiver.literal(attribute.getKey(), attribute.getValue());
66+
}
67+
if (element.children().isEmpty()) {
68+
String text = element.text().trim();
69+
String value = text.isEmpty() ? element.data() : text;
70+
if (!value.isEmpty()) {
71+
receiver.literal("value", value);
72+
}
73+
}
74+
process(element, receiver);
75+
receiver.endEntity();
76+
}
77+
}
6678
}

metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
package org.metafacture.html;
1717

1818
import static org.mockito.Mockito.inOrder;
19+
import static org.mockito.Mockito.times;
1920

2021
import java.io.StringReader;
2122

@@ -50,23 +51,28 @@ public void setup() {
5051
public void htmlElementsAsEntities() {
5152
htmlDecoder.process(new StringReader("<h1>Header</h1><p>Paragraph</p>"));
5253
final InOrder ordered = inOrder(receiver);
54+
ordered.verify(receiver).startEntity("html");
55+
ordered.verify(receiver).startEntity("head");
56+
ordered.verify(receiver).endEntity();
57+
ordered.verify(receiver).startEntity("body");
5358
ordered.verify(receiver).startEntity("h1");
5459
ordered.verify(receiver).literal("value", "Header");
5560
ordered.verify(receiver).endEntity();
5661
ordered.verify(receiver).startEntity("p");
5762
ordered.verify(receiver).literal("value", "Paragraph");
58-
ordered.verify(receiver).endEntity();
63+
ordered.verify(receiver, times(3)).endEntity();
5964
}
6065

6166
@Test
6267
public void nestedEntities() {
63-
htmlDecoder.process(new StringReader("<ul><li>Item</li><ul>"));
68+
htmlDecoder.process(new StringReader("<ul><li>Item</li></ul>"));
6469
final InOrder ordered = inOrder(receiver);
6570
ordered.verify(receiver).startEntity("ul");
6671
ordered.verify(receiver).startEntity("li");
6772
ordered.verify(receiver).literal("value", "Item");
68-
ordered.verify(receiver).endEntity();
69-
ordered.verify(receiver).endEntity();
73+
// elements above plus body, html
74+
ordered.verify(receiver, times(4)).endEntity();
75+
7076
}
7177

7278
@Test
@@ -76,17 +82,19 @@ public void htmlAttributesAsLiterals() {
7682
ordered.verify(receiver).startEntity("p");
7783
ordered.verify(receiver).literal("class", "lead");
7884
ordered.verify(receiver).literal("value", "Text");
79-
ordered.verify(receiver).endEntity();
85+
// elements above plus body, html
86+
ordered.verify(receiver, times(3)).endEntity();
8087
}
81-
88+
8289
@Test
8390
public void htmlScriptElementData() {
8491
htmlDecoder.process(new StringReader("<script type=application/ld+json>{\"id\":\"theId\"}</script>"));
8592
final InOrder ordered = inOrder(receiver);
8693
ordered.verify(receiver).startEntity("script");
8794
ordered.verify(receiver).literal("type", "application/ld+json");
8895
ordered.verify(receiver).literal("value", "{\"id\":\"theId\"}");
89-
ordered.verify(receiver).endEntity();
96+
// elements above plus body, html
97+
ordered.verify(receiver, times(4)).endEntity();
9098
}
9199

92100
}

0 commit comments

Comments
 (0)