11package cz .metacentrum .perun .core .impl ;
22
3+ import cz .metacentrum .perun .core .api .BeansUtils ;
34import org .owasp .html .CssSchema ;
45import org .owasp .html .HtmlChangeListener ;
56import org .owasp .html .HtmlPolicyBuilder ;
1819 * @author: Matej Hakoš <[email protected] > 1920 */
2021public class HTMLParser {
21- private static final String [] allowedTags = {"a" , "article" , "aside" , "b" , "blockquote" , "br" , "button" , "caption" , "center" , "cite" , "decorator" , "del" , "details" , "div" , "em" , "footer" , "h1" , "h2" , "h3" , "h4" , "h5" , "h6" , "header" , "hr" , "i" , "img" , "kbd" , "label" , "li" , "ol" , "p" , "pre" , "section" , "select" , "span" , "strong" , "sup" , "table" , "tbody" , "td" , "textarea" , "tfoot" , "th" , "thead" , "tr" , "ul" };
22- private static final String [] allowedAttributes = {"align" , "class" , "color" , "disabled" , "height" , "hidden" , "href" , "id" , "label" , "size" , "span" , "src" , "srcset" , "style" , "width" };
23- private static final String [] allowedStyles = {"color" , "background-color" , "font-size" , "font-family" , "text-align" , "margin" , "padding" , "border" , "width" , "height" , "display" , "position" , "top" , "bottom" , "left" , "right" , "overflow" , "float" , "clear" , "z-index" };
22+ private static final String [] allowedTags = {"a" , "article" , "aside" , "b" , "blockquote" , "br" , "button" , "caption" , "center" , "cite" , "decorator" , "del" , "details" , "div" , "em" , "footer" , "h1" , "h2" , "h3" , "h4" , "h5" , "h6" , "header" , "hr" , "i" , "img" , "kbd" , "label" , "li" , "ol" , "p" , "pre" , "section" , "select" , "span" , "strong" , "sup" , "table" , "tbody" , "td" , "textarea" , "tfoot" , "th" , "thead" , "tr" , "u" , " ul" };
23+ private static final String [] allowedAttributes = {"align" , "class" , "color" , "data-lang" , " disabled" , "height" , "hidden" , "href" , "id" , "label" , "rel" , " size" , "span" , "src" , "srcset" , "style" , "type " , "width" };
24+ private static final String [] allowedStyles = {"color" , "background-color" , "font-size" , "font-family" , "text-align" , "margin" , "padding" , "border" , "width" , "max-height" , " height" , "display" , "position" , "top" , "bottom" , "left" , "right" , "overflow" , "float" , "clear" , "z-index" };
2425 private static final String [] allowedUrlProtocols = {"https" , "mailto" };
2526 private static PolicyFactory policy = null ;
2627
2728 private List <String > escapedTags = new ArrayList <>();
2829 private List <String > escapedAttributes = new ArrayList <>();
2930 private List <String > escapedStyles = new ArrayList <>();
31+ private List <String > escapedLinks = new ArrayList <>();
3032
3133 private String rawHTML = "" ;
3234 private String escapedHTML = "" ;
33- private String [] escapedStrings = new String []{"" , "" , "" };
35+ private String [] escapedStrings = new String []{"" , "" , "" , "" };
3436 private boolean isInputValid = true ;
3537
3638 public HTMLParser () {
@@ -54,6 +56,10 @@ private static PolicyFactory generatePolicy() {
5456 }
5557 CssSchema cssWhitelist = CssSchema .withProperties (Arrays .stream (allowedStyles ).toList ());
5658 p .allowStyling (cssWhitelist );
59+
60+ // allow 'href' and 'target' attribute on 'a' tags
61+ p .allowAttributes ("href" ).onElements ("a" );
62+ p .allowAttributes ("target" ).onElements ("a" );
5763 return p .toFactory ();
5864 }
5965
@@ -64,11 +70,11 @@ private static PolicyFactory generatePolicy() {
6470 * @param escaped - escaped input
6571 */
6672 private void computeEscapedStyles (String input , String escaped ) {
67- Pattern pattern = Pattern .compile ("style=\" (.*?)\" " );
73+ Pattern pattern = Pattern .compile ("style=( \" |') (.*?)( \" |') " );
6874 if (input == null || escaped == null ) return ;
6975 Matcher matcher = pattern .matcher (input );
7076 while (matcher .find ()) {
71- String style = matcher .group (1 );
77+ String style = matcher .group (2 );
7278 String [] styles = style .split (";" );
7379 for (String s : styles ) {
7480 String [] split = s .split (":" );
@@ -79,7 +85,7 @@ private void computeEscapedStyles(String input, String escaped) {
7985 }
8086 matcher = pattern .matcher (escaped );
8187 while (matcher .find ()) {
82- String style = matcher .group (1 );
88+ String style = matcher .group (2 );
8389 String [] styles = style .split (";" );
8490 for (String s : styles ) {
8591 String [] split = s .split (":" );
@@ -90,6 +96,27 @@ private void computeEscapedStyles(String input, String escaped) {
9096 }
9197 }
9298
99+ /**
100+ * Computes the difference between all links in the escaped and unescaped input.
101+ *
102+ * @param input - unescaped input
103+ * @param escaped - escaped input
104+ */
105+ public void computeInvalidLink (String input , String escaped ) {
106+ Pattern pattern = Pattern .compile ("href=(\" |')(.*?)(\" |')" );
107+ if (input == null || escaped == null ) return ;
108+ Matcher matcher = pattern .matcher (input );
109+ while (matcher .find ()) {
110+ // Multiple groups, url is in group 2
111+ String link = matcher .group (2 ).trim ();
112+ escapedLinks .add (link );
113+ }
114+
115+ for (String protocol : allowedUrlProtocols ) {
116+ escapedLinks .removeIf (link -> link .startsWith (protocol ));
117+ }
118+ }
119+
93120 /**
94121 * Clears the list of escaped tags and attributes.
95122 * Recomputes the policy and resets the escapedHTML/unescapedHTML and escapedStrings.
@@ -99,9 +126,10 @@ public HTMLParser clear() {
99126 escapedTags .clear ();
100127 escapedAttributes .clear ();
101128 escapedStyles .clear ();
129+ escapedLinks .clear ();
102130 escapedHTML = "" ;
103131 rawHTML = "" ;
104- escapedStrings = new String []{"" , "" , "" };
132+ escapedStrings = new String []{"" , "" , "" , "" };
105133 isInputValid = true ;
106134 policy = generatePolicy ();
107135 return this ;
@@ -112,8 +140,10 @@ public HTMLParser clear() {
112140 * @return escapedHTML - sanitized HTML input
113141 */
114142 public String getEscapedHTML () {
115- return rawHTML ;
116- // return escapedHTML;
143+ if (!BeansUtils .getCoreConfig ().getForceHTMLSanitization ()) {
144+ return rawHTML ;
145+ }
146+ return escapedHTML ;
117147 }
118148
119149 /**
@@ -129,8 +159,10 @@ public String getRawInput() {
129159 * Returns validity of the last input.
130160 */
131161 public boolean isInputValid () {
132- return true ;
133- // return isInputValid;
162+ if (!BeansUtils .getCoreConfig ().getForceHTMLSanitization ()){
163+ return true ;
164+ }
165+ return isInputValid ;
134166 }
135167
136168 /**
@@ -150,15 +182,26 @@ public String[] getEscaped() {
150182 */
151183 public HTMLParser sanitizeHTML (String input ) {
152184 rawHTML = input ;
185+ if (!BeansUtils .getCoreConfig ().getForceHTMLSanitization ()) {
186+ return this ;
187+ }
153188 HtmlChangeListener <List <List <String >>> listener = new HtmlChangeListener <>() {
154189 @ Override
155190 public void discardedTag (@ Nullable List <List <String >> output , String tag ) {
191+ if (tag .equals ("a" )) {
192+ return ;
193+ }
156194 output .get (0 ).add (tag );
157195 }
158196
159197 @ Override
160198 public void discardedAttributes (@ Nullable List <List <String >> output , String tag , String ... attributes ) {
161- output .get (1 ).add (Arrays .toString (attributes ) + " in " + tag );
199+ // Remove 'href', because it gets computed in computeInvalidLink
200+ List <String > attrs = Arrays .stream (attributes ).filter (attr -> !attr .equals ("href" )).toList ();
201+ if (attrs .isEmpty ()) {
202+ return ;
203+ }
204+ output .get (1 ).add (tag + " " + attrs );
162205 }
163206 };
164207
@@ -167,12 +210,20 @@ public void discardedAttributes(@Nullable List<List<String>> output, String tag,
167210 output .add (new ArrayList <>());
168211 output .add (new ArrayList <>());
169212 String escapedOutput = policy .sanitize (input , listener , output );
213+
170214 escapedTags = output .get (0 );
171215 escapedAttributes = output .get (1 );
172216 computeEscapedStyles (input , escapedOutput );
217+ computeInvalidLink (input , escapedOutput );
218+
219+ // Remove whitespaces and filter empty strings
220+ escapedTags = escapedTags .stream ().map (String ::trim ).filter (s -> !s .isEmpty ()).toList ();
221+ escapedAttributes = escapedAttributes .stream ().map (String ::trim ).filter (s -> !s .isEmpty ()).toList ();
222+ escapedStyles = escapedStyles .stream ().map (String ::trim ).filter (s -> !s .isEmpty ()).toList ();
223+ escapedLinks = escapedLinks .stream ().map (String ::trim ).filter (s -> !s .isEmpty ()).toList ();
173224
174225 escapedHTML = escapedOutput ;
175- isInputValid = escapedTags .isEmpty () && escapedAttributes .isEmpty () && escapedStyles .isEmpty ();
226+ isInputValid = escapedTags .isEmpty () && escapedAttributes .isEmpty () && escapedStyles .isEmpty () && escapedLinks . isEmpty () ;
176227 return this ;
177228 }
178229
@@ -185,13 +236,14 @@ public void discardedAttributes(@Nullable List<List<String>> output, String tag,
185236 */
186237 public HTMLParser checkEscapedHTML (String escaped , String unescaped ) {
187238 if (escaped .equals (unescaped ) || isInputValid ){
188- escapedStrings = new String []{"" , "" , "" };
239+ escapedStrings = new String []{"" , "" , "" , "" };
189240 return this ;
190241 }
191242
192243 escapedStrings [0 ] = String .join (", " , escapedTags );
193244 escapedStrings [1 ] = String .join (", " , escapedAttributes );
194245 escapedStrings [2 ] = String .join (", " , escapedStyles );
246+ escapedStrings [3 ] = String .join (", " , escapedLinks );
195247 return this ;
196248 }
197249
@@ -208,14 +260,17 @@ public HTMLParser checkEscapedHTML() {
208260 public static String getMessage (String [] escaped ) {
209261 StringBuilder message = new StringBuilder ();
210262 if (escaped [0 ].length () > 0 ) {
211- message .append ("The following tags are not allowed : " ).append (escaped [0 ]).append (". " );
263+ message .append ("Following tags are invalid : " ).append (escaped [0 ]).append (". " );
212264 }
213265 if (escaped [1 ].length () > 0 ) {
214- message .append ("The following attributes are not allowed : " ).append (escaped [1 ]).append (". " );
266+ message .append ("Following attributes are invalid : " ).append (escaped [1 ]).append (". " );
215267 }
216268 if (escaped [2 ].length () > 0 ) {
217- message .append ("The following styles are not allowed : " ).append (escaped [2 ]).append (". " );
269+ message .append ("Following styles are invalid : " ).append (escaped [2 ]).append (". " );
218270 }
271+ if (escaped [3 ].length () > 0 ) {
272+ message .append ("'" ).append (escaped [3 ]).append ("' links contain invalid protocol (allowed: https, mailto)." );
273+ }
219274 return message .toString ();
220275 }
221276
0 commit comments