Skip to content

Commit 1dfddca

Browse files
committed
Basic HtmlReader with html-to-xml flux command
Parse HTML with jsoup, write XML. See example in test. See #312
1 parent e0fdd23 commit 1dfddca

File tree

6 files changed

+177
-0
lines changed

6 files changed

+177
-0
lines changed

metafacture-html/build.gradle

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/*
2+
* Copyright 2020 Fabian Steeg, hbz
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
// Module metadata for the metafacture-html project.
// NOTE(review): mavenName is presumably consumed by the root build script when
// publishing the artifact -- confirm, it is not read anywhere in this file.
ext.mavenName = 'Metafacture HTML'
18+
description = 'Modules for processing HTML documents'
19+
20+
// Dependencies of the HTML module. The framework is exposed as 'api' because
// HtmlReader's public signature uses framework types (DefaultObjectPipe,
// ObjectReceiver); everything else is an implementation detail.
// NOTE(review): the HtmlReader source in this commit imports only commons-io
// and jsoup; verify that metafacture-commons, slf4j-api, commons-compress and
// the slf4j-simple test runtime are actually needed here.
dependencies {
21+
api project(':metafacture-framework')
22+
implementation project(':metafacture-commons')
23+
implementation 'org.slf4j:slf4j-api:1.7.21'
24+
implementation 'org.apache.commons:commons-compress:1.12'
25+
implementation 'commons-io:commons-io:2.6'
26+
implementation 'org.jsoup:jsoup:1.12.1'
27+
testImplementation 'junit:junit:4.12'
28+
testImplementation 'org.mockito:mockito-core:2.5.5'
29+
testRuntimeOnly 'org.slf4j:slf4j-simple:1.7.21'
30+
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
/*
2+
* Copyright 2020 Fabian Steeg, hbz
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.metafacture.html;
17+
18+
import java.io.IOException;
19+
import java.io.Reader;
20+
21+
import org.apache.commons.io.IOUtils;
22+
import org.jsoup.Jsoup;
23+
import org.jsoup.nodes.Document;
24+
import org.metafacture.framework.FluxCommand;
25+
import org.metafacture.framework.ObjectReceiver;
26+
import org.metafacture.framework.annotations.Description;
27+
import org.metafacture.framework.annotations.In;
28+
import org.metafacture.framework.annotations.Out;
29+
import org.metafacture.framework.helpers.DefaultObjectPipe;
30+
31+
/**
32+
* Parses HTML to X(HT)ML
33+
*
34+
* @author Fabian Steeg
35+
*/
36+
@Description("Parses HTML to X(HT)ML")
37+
@In(Reader.class)
38+
@Out(String.class)
39+
@FluxCommand("html-to-xml")
40+
public class HtmlReader extends DefaultObjectPipe<Reader, ObjectReceiver<String>> {
41+
@Override
42+
public void process(final Reader reader) {
43+
try {
44+
Document document = Jsoup.parse(IOUtils.toString(reader));
45+
document.outputSettings().prettyPrint(false).syntax(Document.OutputSettings.Syntax.xml);
46+
getReceiver().process(document.html());
47+
} catch (IOException e) {
48+
e.printStackTrace();
49+
}
50+
}
51+
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#
2+
# Copyright 2020 Fabian Steeg, hbz
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
html-to-xml org.metafacture.html.HtmlReader
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* Copyright 2020 Fabian Steeg, hbz
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.metafacture.html;
17+
18+
import static org.mockito.Mockito.verify;
19+
import static org.mockito.Mockito.verifyNoMoreInteractions;
20+
21+
import java.io.StringReader;
22+
23+
import org.junit.After;
24+
import org.junit.Before;
25+
import org.junit.Test;
26+
import org.metafacture.framework.ObjectReceiver;
27+
import org.mockito.Mock;
28+
import org.mockito.MockitoAnnotations;
29+
30+
/**
31+
* Tests for {@link HtmlReader}.
32+
*
33+
* @author Fabian Steeg
34+
*
35+
*/
36+
public final class HtmlReaderTest {
37+
38+
private static final StringReader IN = new StringReader("<html><i>hi");
39+
private static final String OUT = "<html><head></head><body><i>hi</i></body></html>";
40+
41+
private HtmlReader htmlReader;
42+
43+
@Mock
44+
private ObjectReceiver<String> receiver;
45+
46+
@Before
47+
public void setup() {
48+
MockitoAnnotations.initMocks(this);
49+
htmlReader = new HtmlReader();
50+
htmlReader.setReceiver(receiver);
51+
}
52+
53+
@Test
54+
public void testShouldProcessRecordsFollowedbySeparator() {
55+
htmlReader.process(IN);
56+
verify(receiver).process(OUT);
57+
verifyNoMoreInteractions(receiver);
58+
}
59+
60+
@After
61+
public void cleanup() {
62+
htmlReader.closeStream();
63+
}
64+
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Copyright 2020 Fabian Steeg, hbz
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
org.slf4j.simpleLogger.defaultLogLevel = DEBUG

settings.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ include ':metafacture-strings'
2828
include ':metafacture-formeta'
2929
include ':metafacture-formatting'
3030
include ':metafacture-xml'
31+
include ':metafacture-html'
3132
include ':metafacture-triples'
3233
include ':metafacture-statistics'
3334
include ':metafacture-io'

0 commit comments

Comments
 (0)