Skip to content

Commit 3c9998c

Browse files
authored
Merge pull request #313 from metafacture/312-html
Add HTML input support
2 parents e0fdd23 + 3427a67 commit 3c9998c

File tree

9 files changed

+358
-0
lines changed

9 files changed

+358
-0
lines changed

metafacture-html/build.gradle

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/*
2+
* Copyright 2020 Fabian Steeg, hbz
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
ext.mavenName = 'Metafacture HTML'
18+
description = 'Modules for processing HTML documents'
19+
20+
// Dependencies of the HTML module: the framework project is declared with 'api',
// all other libraries with 'implementation' or test-only configurations.
// NOTE(review): the Java sources visible for this module only reference
// commons-io (IOUtils) and jsoup; metafacture-commons, slf4j-api and
// commons-compress look unused here - confirm before keeping them.
dependencies {
21+
api project(':metafacture-framework')
22+
implementation project(':metafacture-commons')
23+
implementation 'org.slf4j:slf4j-api:1.7.21'
24+
implementation 'org.apache.commons:commons-compress:1.12'
25+
implementation 'commons-io:commons-io:2.6'
26+
implementation 'org.jsoup:jsoup:1.12.1'
27+
testImplementation 'junit:junit:4.12'
28+
testImplementation 'org.mockito:mockito-core:2.5.5'
29+
testRuntimeOnly 'org.slf4j:slf4j-simple:1.7.21'
30+
}
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/*
2+
* Copyright 2020 Fabian Steeg, hbz
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.metafacture.html;
17+
18+
import java.io.IOException;
19+
import java.io.Reader;
20+
import java.util.UUID;
21+
22+
import org.apache.commons.io.IOUtils;
23+
import org.jsoup.Jsoup;
24+
import org.jsoup.nodes.Attribute;
25+
import org.jsoup.nodes.Attributes;
26+
import org.jsoup.nodes.Document;
27+
import org.jsoup.nodes.Element;
28+
import org.metafacture.framework.FluxCommand;
29+
import org.metafacture.framework.StreamReceiver;
30+
import org.metafacture.framework.annotations.Description;
31+
import org.metafacture.framework.annotations.In;
32+
import org.metafacture.framework.annotations.Out;
33+
import org.metafacture.framework.helpers.DefaultObjectPipe;
34+
35+
/**
36+
* Decode HTML to metadata events. Each input document represents one record.
37+
*
38+
* @author Fabian Steeg (fsteeg)
39+
*
40+
*/
41+
@Description("Decode HTML to metadata events")
42+
@In(Reader.class)
43+
@Out(StreamReceiver.class)
44+
@FluxCommand("decode-html")
45+
public class HtmlDecoder extends DefaultObjectPipe<Reader, StreamReceiver> {
46+
47+
@Override
48+
public void process(final Reader reader) {
49+
try {
50+
StreamReceiver receiver = getReceiver();
51+
receiver.startRecord(UUID.randomUUID().toString());
52+
Document document = Jsoup.parse(IOUtils.toString(reader));
53+
process(document, receiver);
54+
receiver.endRecord();
55+
} catch (IOException e) {
56+
e.printStackTrace();
57+
}
58+
}
59+
60+
private void process(Element parent, StreamReceiver receiver) {
61+
for (Element element : parent.children()) {
62+
receiver.startEntity(element.nodeName());
63+
Attributes attributes = element.attributes();
64+
for (Attribute attribute : attributes) {
65+
receiver.literal(attribute.getKey(), attribute.getValue());
66+
}
67+
if (element.children().isEmpty()) {
68+
String text = element.text().trim();
69+
String value = text.isEmpty() ? element.data() : text;
70+
if (!value.isEmpty()) {
71+
receiver.literal("value", value);
72+
}
73+
}
74+
process(element, receiver);
75+
receiver.endEntity();
76+
}
77+
}
78+
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
/*
2+
* Copyright 2020 Fabian Steeg, hbz
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.metafacture.html;
17+
18+
import java.io.IOException;
19+
import java.io.Reader;
20+
21+
import org.apache.commons.io.IOUtils;
22+
import org.jsoup.Jsoup;
23+
import org.jsoup.nodes.Document;
24+
import org.jsoup.nodes.Element;
25+
import org.metafacture.framework.FluxCommand;
26+
import org.metafacture.framework.ObjectReceiver;
27+
import org.metafacture.framework.annotations.Description;
28+
import org.metafacture.framework.annotations.In;
29+
import org.metafacture.framework.annotations.Out;
30+
import org.metafacture.framework.helpers.DefaultObjectPipe;
31+
32+
/**
33+
* Extracts the first script from an HTML document
34+
*
35+
* @author Fabian Steeg
36+
*/
37+
@Description("Extracts the first script from an HTML document")
38+
@In(Reader.class)
39+
@Out(String.class)
40+
@FluxCommand("extract-script")
41+
public class ScriptExtractor extends DefaultObjectPipe<Reader, ObjectReceiver<String>> {
42+
@Override
43+
public void process(final Reader reader) {
44+
try {
45+
Document document = Jsoup.parse(IOUtils.toString(reader));
46+
Element firstScript = document.select("script").first();
47+
getReceiver().process(firstScript.data());
48+
} catch (IOException e) {
49+
e.printStackTrace();
50+
}
51+
}
52+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#
2+
# Copyright 2020 Fabian Steeg, hbz
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
decode-html org.metafacture.html.HtmlDecoder
17+
extract-script org.metafacture.html.ScriptExtractor
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
/*
2+
* Copyright 2020 Fabian Steeg, hbz
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.metafacture.html;
17+
18+
import static org.mockito.Mockito.inOrder;
19+
import static org.mockito.Mockito.times;
20+
21+
import java.io.StringReader;
22+
23+
import org.junit.Before;
24+
import org.junit.Test;
25+
import org.metafacture.framework.StreamReceiver;
26+
import org.mockito.InOrder;
27+
import org.mockito.Mock;
28+
import org.mockito.MockitoAnnotations;
29+
30+
/**
31+
* Tests for class {@link HtmlDecoder}.
32+
*
33+
* @author Fabian Steeg
34+
*
35+
*/
36+
public final class HtmlDecoderTest {
37+
38+
@Mock
39+
private StreamReceiver receiver;
40+
41+
private HtmlDecoder htmlDecoder;
42+
43+
@Before
44+
public void setup() {
45+
MockitoAnnotations.initMocks(this);
46+
htmlDecoder = new HtmlDecoder();
47+
htmlDecoder.setReceiver(receiver);
48+
}
49+
50+
@Test
51+
public void htmlElementsAsEntities() {
52+
htmlDecoder.process(new StringReader("<h1>Header</h1><p>Paragraph</p>"));
53+
final InOrder ordered = inOrder(receiver);
54+
ordered.verify(receiver).startEntity("html");
55+
ordered.verify(receiver).startEntity("head");
56+
ordered.verify(receiver).endEntity();
57+
ordered.verify(receiver).startEntity("body");
58+
ordered.verify(receiver).startEntity("h1");
59+
ordered.verify(receiver).literal("value", "Header");
60+
ordered.verify(receiver).endEntity();
61+
ordered.verify(receiver).startEntity("p");
62+
ordered.verify(receiver).literal("value", "Paragraph");
63+
ordered.verify(receiver, times(3)).endEntity();
64+
}
65+
66+
@Test
67+
public void nestedEntities() {
68+
htmlDecoder.process(new StringReader("<ul><li>Item</li></ul>"));
69+
final InOrder ordered = inOrder(receiver);
70+
ordered.verify(receiver).startEntity("ul");
71+
ordered.verify(receiver).startEntity("li");
72+
ordered.verify(receiver).literal("value", "Item");
73+
// elements above plus body, html
74+
ordered.verify(receiver, times(4)).endEntity();
75+
76+
}
77+
78+
@Test
79+
public void htmlAttributesAsLiterals() {
80+
htmlDecoder.process(new StringReader("<p class=lead>Text"));
81+
final InOrder ordered = inOrder(receiver);
82+
ordered.verify(receiver).startEntity("p");
83+
ordered.verify(receiver).literal("class", "lead");
84+
ordered.verify(receiver).literal("value", "Text");
85+
// elements above plus body, html
86+
ordered.verify(receiver, times(3)).endEntity();
87+
}
88+
89+
@Test
90+
public void htmlScriptElementData() {
91+
htmlDecoder.process(new StringReader("<script type=application/ld+json>{\"id\":\"theId\"}</script>"));
92+
final InOrder ordered = inOrder(receiver);
93+
ordered.verify(receiver).startEntity("script");
94+
ordered.verify(receiver).literal("type", "application/ld+json");
95+
ordered.verify(receiver).literal("value", "{\"id\":\"theId\"}");
96+
// elements above plus body, html
97+
ordered.verify(receiver, times(4)).endEntity();
98+
}
99+
100+
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* Copyright 2020 Fabian Steeg, hbz
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.metafacture.html;
17+
18+
import static org.mockito.Mockito.verify;
19+
import static org.mockito.Mockito.verifyNoMoreInteractions;
20+
21+
import java.io.StringReader;
22+
23+
import org.junit.After;
24+
import org.junit.Before;
25+
import org.junit.Test;
26+
import org.metafacture.framework.ObjectReceiver;
27+
import org.mockito.Mock;
28+
import org.mockito.MockitoAnnotations;
29+
30+
/**
31+
* Tests for {@link ScriptExtractor}.
32+
*
33+
* @author Fabian Steeg
34+
*
35+
*/
36+
public final class ScriptExtractorTest {
37+
38+
private static final StringReader IN = new StringReader("<html><script>{\"code\":\"yo\"}");
39+
private static final String OUT = "{\"code\":\"yo\"}";
40+
41+
private ScriptExtractor scriptExtractor;
42+
43+
@Mock
44+
private ObjectReceiver<String> receiver;
45+
46+
@Before
47+
public void setup() {
48+
MockitoAnnotations.initMocks(this);
49+
scriptExtractor = new ScriptExtractor();
50+
scriptExtractor.setReceiver(receiver);
51+
}
52+
53+
@Test
54+
public void testShouldProcessRecordsFollowedbySeparator() {
55+
scriptExtractor.process(IN);
56+
verify(receiver).process(OUT);
57+
verifyNoMoreInteractions(receiver);
58+
}
59+
60+
@After
61+
public void cleanup() {
62+
scriptExtractor.closeStream();
63+
}
64+
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Copyright 2020 Fabian Steeg, hbz
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
org.slf4j.simpleLogger.defaultLogLevel = DEBUG

metafacture-json/src/main/resources/flux-commands.properties

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@
1414
# limitations under the License.
1515
#
1616
encode-json org.metafacture.json.JsonEncoder
17+
decode-json org.metafacture.json.JsonDecoder

settings.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ include ':metafacture-strings'
2828
include ':metafacture-formeta'
2929
include ':metafacture-formatting'
3030
include ':metafacture-xml'
31+
include ':metafacture-html'
3132
include ':metafacture-triples'
3233
include ':metafacture-statistics'
3334
include ':metafacture-io'

0 commit comments

Comments
 (0)