13
13
* See the License for the specific language governing permissions and
14
14
* limitations under the License.
15
15
*/
16
+
16
17
package org .metafacture .html ;
17
18
18
- import java .io .IOException ;
19
- import java .io .Reader ;
20
- import java .io .UnsupportedEncodingException ;
21
- import java .net .URLDecoder ;
22
- import java .nio .charset .StandardCharsets ;
23
- import java .util .HashMap ;
24
- import java .util .Map ;
25
- import java .util .UUID ;
19
+ import org .metafacture .framework .FluxCommand ;
20
+ import org .metafacture .framework .StreamReceiver ;
21
+ import org .metafacture .framework .annotations .Description ;
22
+ import org .metafacture .framework .annotations .In ;
23
+ import org .metafacture .framework .annotations .Out ;
24
+ import org .metafacture .framework .helpers .DefaultObjectPipe ;
26
25
27
26
import org .apache .commons .io .IOUtils ;
28
27
import org .jsoup .Jsoup ;
29
28
import org .jsoup .nodes .Attribute ;
30
29
import org .jsoup .nodes .Attributes ;
31
30
import org .jsoup .nodes .Document ;
32
31
import org .jsoup .nodes .Element ;
33
- import org .metafacture .framework .FluxCommand ;
34
- import org .metafacture .framework .StreamReceiver ;
35
- import org .metafacture .framework .annotations .Description ;
36
- import org .metafacture .framework .annotations .In ;
37
- import org .metafacture .framework .annotations .Out ;
38
- import org .metafacture .framework .helpers .DefaultObjectPipe ;
39
32
import org .slf4j .Logger ;
40
33
import org .slf4j .LoggerFactory ;
41
34
35
+ import java .io .IOException ;
36
+ import java .io .Reader ;
37
+ import java .io .UnsupportedEncodingException ;
38
+ import java .net .URLDecoder ;
39
+ import java .nio .charset .StandardCharsets ;
40
+ import java .util .HashMap ;
41
+ import java .util .Map ;
42
+ import java .util .UUID ;
43
+
42
44
/**
43
45
* Decode HTML to metadata events. Each input document represents one record.
44
46
*
45
47
* @author Fabian Steeg (fsteeg)
46
48
*
47
49
*/
48
- @ Description ("Decode HTML to metadata events. The attrValsAsSubfields option can be used to override "
49
- + "the default attribute values to be used as subfields (e.g. by default "
50
- + "`link rel=\" canonical\" href=\" http://example.org\" ` becomes `link.canonical`). "
51
- + "It expects an HTTP-style query string specifying as key the attributes whose value should "
52
- + "be used as a subfield, and as value the attribute whose value should be the subfield value, "
53
- + "e.g. the default contains `link.rel=href`. To use the HTML element text as the value "
54
- + "(instead of another attribute), omit the value of the query-string key-value pair, "
55
- + "e.g. `title.lang`. To add to the defaults, instead of replacing them, start with an `&`, "
56
- + "e.g. `&h3.class`" )
50
+ @ Description ("Decode HTML to metadata events. The attrValsAsSubfields option can be used to override " +
51
+ "the default attribute values to be used as subfields (e.g. by default " +
52
+ "`link rel=\" canonical\" href=\" http://example.org\" ` becomes `link.canonical`). " +
53
+ "It expects an HTTP-style query string specifying as key the attributes whose value should " +
54
+ "be used as a subfield, and as value the attribute whose value should be the subfield value, " +
55
+ "e.g. the default contains `link.rel=href`. To use the HTML element text as the value " +
56
+ "(instead of another attribute), omit the value of the query-string key-value pair, " +
57
+ "e.g. `title.lang`. To add to the defaults, instead of replacing them, start with an `&`, " +
58
+ "e.g. `&h3.class`" )
57
59
@ In (Reader .class )
58
60
@ Out (StreamReceiver .class )
59
61
@ FluxCommand ("decode-html" )
60
62
public class HtmlDecoder extends DefaultObjectPipe <Reader , StreamReceiver > {
61
63
64
+ private static final Logger LOG = LoggerFactory .getLogger (HtmlDecoder .class );
65
+
62
66
private static final String DEFAULT_ATTR_VALS_AS_SUBFIELDS = //
63
67
"meta.name=content&meta.property=content&link.rel=href&a.rel=href" ;
68
+
64
69
private Map <String , String > attrValsAsSubfields ;
65
- private static final Logger LOG =
66
- LoggerFactory .getLogger (HtmlDecoder .class );
67
70
68
71
public HtmlDecoder () {
69
72
setAttrValsAsSubfields (DEFAULT_ATTR_VALS_AS_SUBFIELDS );
@@ -72,28 +75,29 @@ public HtmlDecoder() {
72
75
@ Override
73
76
public void process (final Reader reader ) {
74
77
try {
75
- StreamReceiver receiver = getReceiver ();
78
+ final StreamReceiver receiver = getReceiver ();
76
79
receiver .startRecord (UUID .randomUUID ().toString ());
77
- Document document = Jsoup .parse (IOUtils .toString (reader ));
80
+ final Document document = Jsoup .parse (IOUtils .toString (reader ));
78
81
process (document , receiver );
79
82
receiver .endRecord ();
80
- } catch (IOException e ) {
83
+ }
84
+ catch (final IOException e ) {
81
85
LOG .error (e .getMessage (), e );
82
86
}
83
87
}
84
88
85
- private void process (Element parent , StreamReceiver receiver ) {
86
- for (Element element : parent .children ()) {
89
+ private void process (final Element parent , final StreamReceiver receiver ) {
90
+ for (final Element element : parent .children ()) {
87
91
receiver .startEntity (element .nodeName ());
88
- Attributes attributes = element .attributes ();
92
+ final Attributes attributes = element .attributes ();
89
93
boolean addedValueAsSubfield = false ;
90
- for (Attribute attribute : attributes ) {
94
+ for (final Attribute attribute : attributes ) {
91
95
addedValueAsSubfield = handleAttributeValuesAsSubfields (receiver , element , attributes , attribute );
92
96
receiver .literal (attribute .getKey (), attribute .getValue ());
93
97
}
94
98
if (element .children ().isEmpty ()) {
95
- String text = element .text ().trim ();
96
- String value = text .isEmpty () ? element .data () : text ;
99
+ final String text = element .text ().trim ();
100
+ final String value = text .isEmpty () ? element .data () : text ;
97
101
if (!value .isEmpty () && !addedValueAsSubfield ) {
98
102
receiver .literal ("value" , value );
99
103
}
@@ -103,34 +107,34 @@ private void process(Element parent, StreamReceiver receiver) {
103
107
}
104
108
}
105
109
106
- private boolean handleAttributeValuesAsSubfields (StreamReceiver receiver , Element element ,
107
- Attributes attributes , Attribute attribute ) {
108
- String fullFieldKey = element .nodeName () + "." + attribute .getKey ();
110
+ private boolean handleAttributeValuesAsSubfields (final StreamReceiver receiver , final Element element , final Attributes attributes , final Attribute attribute ) {
111
+ final String fullFieldKey = element .nodeName () + "." + attribute .getKey ();
109
112
if (attrValsAsSubfields .containsKey (fullFieldKey )) {
110
- String configValue = attrValsAsSubfields .get (fullFieldKey );
113
+ final String configValue = attrValsAsSubfields .get (fullFieldKey );
111
114
if (configValue .trim ().isEmpty ()) {
112
115
receiver .literal (attribute .getValue (), element .text ().trim ());
113
116
return true ;
114
- } else {
115
- String value = attributes .get (configValue );
117
+ }
118
+ else {
119
+ final String value = attributes .get (configValue );
116
120
receiver .literal (attribute .getValue (), value );
117
121
}
118
122
}
119
123
return false ;
120
124
}
121
125
122
- public void setAttrValsAsSubfields (String mapString ) {
126
+ public void setAttrValsAsSubfields (final String mapString ) {
123
127
this .attrValsAsSubfields = new HashMap <String , String >();
124
- String input = mapString .startsWith ("&" ) ? DEFAULT_ATTR_VALS_AS_SUBFIELDS + mapString
125
- : mapString ;
126
- for (String nameValuePair : input .split ("&" )) {
127
- String [] nameValue = nameValuePair .split ("=" );
128
+ final String input = mapString .startsWith ("&" ) ? DEFAULT_ATTR_VALS_AS_SUBFIELDS + mapString : mapString ;
129
+ for (final String nameValuePair : input .split ("&" )) {
130
+ final String [] nameValue = nameValuePair .split ("=" );
128
131
try {
129
- String utf8 = StandardCharsets .UTF_8 .name ();
130
- String key = URLDecoder .decode (nameValue [0 ], utf8 );
131
- String val = nameValue .length > 1 ? URLDecoder .decode (nameValue [1 ], utf8 ) : "" ;
132
+ final String utf8 = StandardCharsets .UTF_8 .name ();
133
+ final String key = URLDecoder .decode (nameValue [0 ], utf8 );
134
+ final String val = nameValue .length > 1 ? URLDecoder .decode (nameValue [1 ], utf8 ) : "" ;
132
135
attrValsAsSubfields .put (key , val );
133
- } catch (UnsupportedEncodingException e ) {
136
+ }
137
+ catch (final UnsupportedEncodingException e ) {
134
138
LOG .error (e .getMessage (), e );
135
139
}
136
140
}
0 commit comments