1
1
/*
2
- * Copyright 2020 Fabian Steeg, hbz
2
+ * Copyright 2020, 2021 Fabian Steeg, hbz
3
3
*
4
4
* Licensed under the Apache License, Version 2.0 (the "License");
5
5
* you may not use this file except in compliance with the License.
17
17
18
18
import java .io .IOException ;
19
19
import java .io .Reader ;
20
+ import java .io .UnsupportedEncodingException ;
21
+ import java .net .URLDecoder ;
22
+ import java .nio .charset .StandardCharsets ;
23
+ import java .util .HashMap ;
24
+ import java .util .Map ;
20
25
import java .util .UUID ;
21
26
22
27
import org .apache .commons .io .IOUtils ;
38
43
* @author Fabian Steeg (fsteeg)
39
44
*
40
45
*/
41
- @ Description ("Decode HTML to metadata events" )
46
+ @ Description ("Decode HTML to metadata events. The attrValsAsSubfields option can be used to override "
47
+ + "the default attribute values to be used as subfields (e.g. by default "
48
+ + "`link rel=\" canonical\" href=\" http://example.org\" ` becomes `link.canonical`). "
49
+ + "It expects an HTTP-style query string specifying as key the attributes whose value should "
50
+ + "be used as a subfield, and as value the attribute whose value should be the subfield value, "
51
+ + "e.g. the default contains `link.rel=href`. To use the HTML element text as the value "
52
+ + "(instead of another attribute), omit the value of the query-string key-value pair, "
53
+ + "e.g. `title.lang`. To add to the defaults, instead of replacing them, start with an `&`, "
54
+ + "e.g. `&h3.class`" )
42
55
@ In (Reader .class )
43
56
@ Out (StreamReceiver .class )
44
57
@ FluxCommand ("decode-html" )
45
58
public class HtmlDecoder extends DefaultObjectPipe <Reader , StreamReceiver > {
46
59
60
+ private static final String DEFAULT_ATTR_VALS_AS_SUBFIELDS = //
61
+ "meta.name=content&meta.property=content&link.rel=href&a.rel=href" ;
62
+ private Map <String , String > attrValsAsSubfields ;
63
+
64
/**
 * Creates a decoder whose attribute-value-to-subfield configuration is
 * initialised from DEFAULT_ATTR_VALS_AS_SUBFIELDS by passing that constant
 * to setAttrValsAsSubfields.
 */
+ public HtmlDecoder () {
65
+ setAttrValsAsSubfields (DEFAULT_ATTR_VALS_AS_SUBFIELDS );
66
+ }
67
+
47
68
@ Override
48
69
public void process (final Reader reader ) {
49
70
try {
@@ -61,18 +82,54 @@ private void process(Element parent, StreamReceiver receiver) {
61
82
for (Element element : parent .children ()) {
62
83
receiver .startEntity (element .nodeName ());
63
84
Attributes attributes = element .attributes ();
85
+ boolean addedValueAsSubfield = false ;
64
86
for (Attribute attribute : attributes ) {
87
+ addedValueAsSubfield = handleAttributeValuesAsSubfields (receiver , element , attributes , attribute );
65
88
receiver .literal (attribute .getKey (), attribute .getValue ());
66
89
}
67
90
if (element .children ().isEmpty ()) {
68
91
String text = element .text ().trim ();
69
92
String value = text .isEmpty () ? element .data () : text ;
70
- if (!value .isEmpty ()) {
93
+ if (!value .isEmpty () && ! addedValueAsSubfield ) {
71
94
receiver .literal ("value" , value );
72
95
}
73
96
}
74
97
process (element , receiver );
75
98
receiver .endEntity ();
76
99
}
77
100
}
101
+
102
+ private boolean handleAttributeValuesAsSubfields (StreamReceiver receiver , Element element ,
103
+ Attributes attributes , Attribute attribute ) {
104
+ String fullFieldKey = element .nodeName () + "." + attribute .getKey ();
105
+ if (attrValsAsSubfields .containsKey (fullFieldKey )) {
106
+ String configValue = attrValsAsSubfields .get (fullFieldKey );
107
+ if (configValue .trim ().isEmpty ()) {
108
+ receiver .literal (attribute .getValue (), element .text ().trim ());
109
+ return true ;
110
+ } else {
111
+ String value = attributes .get (configValue );
112
+ receiver .literal (attribute .getValue (), value );
113
+ }
114
+ }
115
+ return false ;
116
+ }
117
+
118
+ public void setAttrValsAsSubfields (String mapString ) {
119
+ this .attrValsAsSubfields = new HashMap <String , String >();
120
+ String input = mapString .startsWith ("&" ) ? DEFAULT_ATTR_VALS_AS_SUBFIELDS + mapString
121
+ : mapString ;
122
+ for (String nameValuePair : input .split ("&" )) {
123
+ String [] nameValue = nameValuePair .split ("=" );
124
+ try {
125
+ String utf8 = StandardCharsets .UTF_8 .name ();
126
+ String key = URLDecoder .decode (nameValue [0 ], utf8 );
127
+ String val = nameValue .length > 1 ? URLDecoder .decode (nameValue [1 ], utf8 ) : "" ;
128
+ attrValsAsSubfields .put (key , val );
129
+ } catch (UnsupportedEncodingException e ) {
130
+ e .printStackTrace ();
131
+ }
132
+ }
133
+ }
134
+
78
135
}
0 commit comments