1
1
/*
2
- * Copyright 2020 Fabian Steeg, hbz
2
+ * Copyright 2020, 2021 Fabian Steeg, hbz
3
3
*
4
4
* Licensed under the Apache License, Version 2.0 (the "License");
5
5
* you may not use this file except in compliance with the License.
17
17
18
18
import java .io .IOException ;
19
19
import java .io .Reader ;
20
+ import java .io .UnsupportedEncodingException ;
21
+ import java .net .URLDecoder ;
22
+ import java .nio .charset .StandardCharsets ;
23
+ import java .util .HashMap ;
24
+ import java .util .Map ;
20
25
import java .util .UUID ;
21
26
22
27
import org .apache .commons .io .IOUtils ;
38
43
* @author Fabian Steeg (fsteeg)
39
44
*
40
45
*/
41
- @ Description ("Decode HTML to metadata events" )
46
+ @ Description ("Decode HTML to metadata events. The attrValsAsSubfields option can be used to override "
47
+ + "the default attribute values to be used as subfields (e.g. by default "
48
+ + "`link rel=\" canonical\" href=\" http://example.org\" ` becomes `link.canonical`). "
49
+ + "It expects an HTTP-style query string specifying as key the attributes whose value should "
50
+ + "be used as a subfield, and as value the attribute whose value should be the subfield value, "
51
+ + "e.g. the default contains `link.rel=href`. To use the HTML element text as the value "
52
+ + "(instead of another attribute), omit the value of the query-string key-value pair, "
53
+ + "e.g. `title.lang`. To add to the defaults, instead of replacing them, start with an `&`, "
54
+ + "e.g. `&h3.class`" )
42
55
@ In (Reader .class )
43
56
@ Out (StreamReceiver .class )
44
57
@ FluxCommand ("decode-html" )
45
58
public class HtmlDecoder extends DefaultObjectPipe <Reader , StreamReceiver > {
46
59
60
/**
 * Query-string style configuration applied when no custom mapping is set:
 * each pair names an {@code element.attribute} key and the attribute
 * supplying the subfield value.
 */
private static final String DEFAULT_ATTR_VALS_AS_SUBFIELDS =
        "meta.name=content&meta.property=content&link.rel=href&a.rel=href";

/**
 * Parsed mapping from {@code element.attribute} keys to the name of the
 * attribute whose value is emitted; an empty value selects the element text.
 */
private Map<String, String> attrValsAsSubfields;

/**
 * Creates a decoder that starts out with the default
 * attribute-values-as-subfields configuration.
 */
public HtmlDecoder() {
    this.setAttrValsAsSubfields(DEFAULT_ATTR_VALS_AS_SUBFIELDS);
}
67
+
47
68
@ Override
48
69
public void process (final Reader reader ) {
49
70
try {
@@ -62,6 +83,7 @@ private void process(Element parent, StreamReceiver receiver) {
62
83
receiver .startEntity (element .nodeName ());
63
84
Attributes attributes = element .attributes ();
64
85
for (Attribute attribute : attributes ) {
86
+ handleAttributeValuesAsSubfields (receiver , element , attributes , attribute );
65
87
receiver .literal (attribute .getKey (), attribute .getValue ());
66
88
}
67
89
if (element .children ().isEmpty ()) {
@@ -75,4 +97,35 @@ private void process(Element parent, StreamReceiver receiver) {
75
97
receiver .endEntity ();
76
98
}
77
99
}
100
+
101
+ private void handleAttributeValuesAsSubfields (StreamReceiver receiver , Element element ,
102
+ Attributes attributes , Attribute attribute ) {
103
+ String fullFieldKey = element .nodeName () + "." + attribute .getKey ();
104
+ if (attrValsAsSubfields .containsKey (fullFieldKey )) {
105
+ String configValue = attrValsAsSubfields .get (fullFieldKey );
106
+ if (configValue .trim ().isEmpty ()) {
107
+ receiver .literal (attribute .getValue (), element .text ().trim ());
108
+ } else {
109
+ String value = attributes .get (configValue );
110
+ receiver .literal (attribute .getValue (), value );
111
+ }
112
+ }
113
+ }
114
+
115
+ public void setAttrValsAsSubfields (String mapString ) {
116
+ this .attrValsAsSubfields = new HashMap <String , String >();
117
+ String input = mapString .startsWith ("&" ) ? DEFAULT_ATTR_VALS_AS_SUBFIELDS + mapString
118
+ : mapString ;
119
+ for (String nameValuePair : input .split ("&" )) {
120
+ String [] nameValue = nameValuePair .split ("=" );
121
+ try {
122
+ String utf8 = StandardCharsets .UTF_8 .name ();
123
+ String key = URLDecoder .decode (nameValue [0 ], utf8 );
124
+ String val = nameValue .length > 1 ? URLDecoder .decode (nameValue [1 ], utf8 ) : "" ;
125
+ attrValsAsSubfields .put (key , val );
126
+ } catch (UnsupportedEncodingException e ) {
127
+ e .printStackTrace ();
128
+ }
129
+ }
130
+ }
78
131
}
0 commit comments