Skip to content

Commit 6e656c5

Browse files
committed
Move HTML validation from naucse.utils.views to naucse.validation
The validator wasn't even used in views!
1 parent d6c3f64 commit 6e656c5

File tree

7 files changed

+216
-209
lines changed

7 files changed

+216
-209
lines changed

naucse/models.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
import naucse.utils.views
1515
from naucse.utils.models import Model, YamlProperty, DataProperty, DirProperty, MultipleModelDirProperty, ForkProperty
1616
from naucse.utils.models import reify, arca
17-
from naucse.utils.views import AllowedElementsParser, absolute_urls_to_freeze
17+
from naucse.utils.views import absolute_urls_to_freeze
18+
from naucse.validation import AllowedElementsParser
1819
from naucse.templates import setup_jinja_env, vars_functions
1920
from naucse.utils.markdown import convert_markdown
2021
from naucse.utils.notebook import convert_notebook

naucse/utils/links.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from xml.dom import SyntaxErr
44

55
from naucse.models import Page
6-
from naucse.utils.views import DisallowedStyle
6+
from naucse.validation import DisallowedStyle
77

88

99
class InvalidInfo(Exception):

naucse/utils/views.py

Lines changed: 0 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,8 @@
33
import json
44
import os
55
from collections import deque, defaultdict
6-
from html.parser import HTMLParser
76
from pathlib import Path
8-
from xml.dom import SyntaxErr
97

10-
import cssutils
118
from arca.exceptions import PullError, BuildError, RequirementsMismatch
129
from arca.utils import get_hash_for_file
1310

@@ -104,131 +101,6 @@ def get_lesson_tree_hash(repo, lesson_slug):
104101
return commit
105102

106103

107-
class DisallowedElement(Exception):
108-
pass
109-
110-
111-
class InvalidHTML(DisallowedElement):
112-
pass
113-
114-
115-
class DisallowedAttribute(DisallowedElement):
116-
pass
117-
118-
119-
class DisallowedStyle(Exception):
120-
121-
_BASE = "Style element or page css are only allowed when they modify .dataframe elements."
122-
COULD_NOT_PARSE = _BASE + " Ccould not parse the styles and verify."
123-
OUT_OF_SCOPE = _BASE + " Rendered page contains a style that modifies something else."
124-
125-
126-
class AllowedElementsParser(HTMLParser):
127-
"""
128-
This parser is used on all HTML returned from forked repositories.
129-
130-
It raises exceptions in two cases:
131-
132-
* :class:`DisallowedElement` - if a element not defined in :attr:`allowed_elements` is used
133-
* :class:`DisallowedStyle` - if a <style> element contains unparsable css or if it modifies something
134-
different than ``.dataframe`` elements.
135-
"""
136-
137-
def __init__(self, **kwargs):
138-
super(AllowedElementsParser, self).__init__(**kwargs)
139-
self.css_parser = cssutils.CSSParser(raiseExceptions=True)
140-
141-
#: Set of allowed HTML elements
142-
#: It has been compiled out of elements currently used in canonical lessons
143-
self.allowed_elements = {
144-
# functional:
145-
'a', 'abbr', 'audio', 'img', 'source',
146-
147-
# styling:
148-
'big', 'blockquote', 'code', 'font', 'i', 'tt', 'kbd', 'u', 'var', 'small', 'em', 'strong', 'sub',
149-
150-
# formatting:
151-
'br', 'div', 'hr', 'p', 'pre', 'span',
152-
153-
# lists:
154-
'dd', 'dl', 'dt', 'li', 'ul', 'ol',
155-
156-
# headers:
157-
'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
158-
159-
# tables:
160-
'table', 'tbody', 'td', 'th', 'thead', 'tr',
161-
162-
# icons:
163-
'svg', 'circle', 'path',
164-
165-
# A special check is applied in :meth:`handle_data` method
166-
# (only ``.dataframe`` styles allowed, generated from notebook converter)
167-
'style',
168-
}
169-
170-
#: Set of allowed HTML attributes
171-
#: Compiled out of currently used in canonical lesson
172-
self.allowed_attributes = {
173-
'alt', 'aria-hidden', 'border', 'class', 'color', 'colspan', 'controls', 'cx', 'cy', 'd', 'halign', 'href',
174-
'id', 'r', 'rowspan', 'src', 'start', 'title', 'type', 'valign', 'viewbox',
175-
176-
# inline styles generated from notebook converter
177-
'style',
178-
}
179-
180-
self.attrs = set()
181-
182-
def error(self, message):
183-
raise InvalidHTML(message)
184-
185-
def check_attributes(self, attrs):
186-
attr_names = set([x[0] for x in attrs])
187-
188-
if len(attr_names - self.allowed_attributes):
189-
raise DisallowedAttribute("Attributes '{}' are not allowed".format(", ".join(attr_names)))
190-
191-
def handle_starttag(self, tag, attrs):
192-
if tag not in self.allowed_elements:
193-
raise DisallowedElement(f"Element {tag} is not allowed.")
194-
195-
self.check_attributes(attrs)
196-
197-
def handle_startendtag(self, tag, attrs):
198-
if tag not in self.allowed_elements:
199-
raise DisallowedElement(f"Element {tag} is not allowed.")
200-
201-
self.check_attributes(attrs)
202-
203-
def handle_data(self, data):
204-
if self.lasttag == "style":
205-
self.validate_css(data)
206-
207-
def reset_and_feed(self, data):
208-
self.reset()
209-
self.feed(data)
210-
211-
def allow_selector(self, selector: str):
212-
if not selector.startswith(".dataframe "):
213-
return False
214-
215-
return True
216-
217-
def validate_css(self, data):
218-
try:
219-
parsed_css = self.css_parser.parseString(data)
220-
except SyntaxErr:
221-
raise DisallowedStyle(DisallowedStyle.COULD_NOT_PARSE)
222-
else:
223-
if len(parsed_css.cssRules) == 0:
224-
return
225-
226-
if not all([self.allow_selector(selector.selectorText)
227-
for rule in parsed_css.cssRules
228-
for selector in rule.selectorList]):
229-
raise DisallowedStyle(DisallowedStyle.OUT_OF_SCOPE)
230-
231-
232104
def forks_enabled():
233105
""" Returns if forks are enabled. By default they're not (for the purposes of local development).
234106

naucse/validation.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
from xml.dom import SyntaxErr
2+
3+
from html.parser import HTMLParser
4+
5+
import cssutils
6+
7+
8+
class DisallowedElement(Exception):
9+
pass
10+
11+
12+
class InvalidHTML(DisallowedElement):
13+
pass
14+
15+
16+
class DisallowedAttribute(DisallowedElement):
17+
pass
18+
19+
20+
class DisallowedStyle(Exception):
21+
22+
_BASE = "Style element or page css are only allowed when they modify .dataframe elements."
23+
COULD_NOT_PARSE = _BASE + " Ccould not parse the styles and verify."
24+
OUT_OF_SCOPE = _BASE + " Rendered page contains a style that modifies something else."
25+
26+
27+
class AllowedElementsParser(HTMLParser):
28+
"""
29+
This parser is used on all HTML returned from forked repositories.
30+
31+
It raises exceptions in two cases:
32+
33+
* :class:`DisallowedElement` - if an element not defined in :attr:`allowed_elements` is used
34+
* :class:`DisallowedStyle` - if a <style> element contains unparsable css or if it modifies something
35+
different than ``.dataframe`` elements.
36+
"""
37+
38+
def __init__(self, **kwargs):
39+
super(AllowedElementsParser, self).__init__(**kwargs)
40+
self.css_parser = cssutils.CSSParser(raiseExceptions=True)
41+
42+
#: Set of allowed HTML elements
43+
#: It has been compiled out of elements currently used in canonical lessons
44+
self.allowed_elements = {
45+
# functional:
46+
'a', 'abbr', 'audio', 'img', 'source',
47+
48+
# styling:
49+
'big', 'blockquote', 'code', 'font', 'i', 'tt', 'kbd', 'u', 'var', 'small', 'em', 'strong', 'sub',
50+
51+
# formatting:
52+
'br', 'div', 'hr', 'p', 'pre', 'span',
53+
54+
# lists:
55+
'dd', 'dl', 'dt', 'li', 'ul', 'ol',
56+
57+
# headers:
58+
'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
59+
60+
# tables:
61+
'table', 'tbody', 'td', 'th', 'thead', 'tr',
62+
63+
# icons:
64+
'svg', 'circle', 'path',
65+
66+
# A special check is applied in :meth:`handle_data` method
67+
# (only ``.dataframe`` styles allowed, generated from notebook converter)
68+
'style',
69+
}
70+
71+
#: Set of allowed HTML attributes
72+
#: It has been compiled out of attributes currently used in canonical lessons
73+
self.allowed_attributes = {
74+
'alt', 'aria-hidden', 'border', 'class', 'color', 'colspan', 'controls', 'cx', 'cy', 'd', 'halign', 'href',
75+
'id', 'r', 'rowspan', 'src', 'start', 'title', 'type', 'valign', 'viewbox',
76+
77+
# inline styles generated from notebook converter
78+
'style',
79+
}
80+
81+
self.attrs = set()
82+
83+
def error(self, message):
84+
raise InvalidHTML(message)
85+
86+
def check_attributes(self, attrs):
87+
attr_names = set([x[0] for x in attrs])
88+
89+
if len(attr_names - self.allowed_attributes):
90+
raise DisallowedAttribute("Attributes '{}' are not allowed".format(", ".join(attr_names)))
91+
92+
def handle_starttag(self, tag, attrs):
93+
if tag not in self.allowed_elements:
94+
raise DisallowedElement(f"Element {tag} is not allowed.")
95+
96+
self.check_attributes(attrs)
97+
98+
def handle_startendtag(self, tag, attrs):
99+
if tag not in self.allowed_elements:
100+
raise DisallowedElement(f"Element {tag} is not allowed.")
101+
102+
self.check_attributes(attrs)
103+
104+
def handle_data(self, data):
105+
if self.lasttag == "style":
106+
self.validate_css(data)
107+
108+
def reset_and_feed(self, data):
109+
self.reset()
110+
self.feed(data)
111+
112+
def allow_selector(self, selector: str):
113+
if not selector.startswith(".dataframe "):
114+
return False
115+
116+
return True
117+
118+
def validate_css(self, data):
119+
try:
120+
parsed_css = self.css_parser.parseString(data)
121+
except SyntaxErr:
122+
raise DisallowedStyle(DisallowedStyle.COULD_NOT_PARSE)
123+
else:
124+
if len(parsed_css.cssRules) == 0:
125+
return
126+
127+
if not all([self.allow_selector(selector.selectorText)
128+
for rule in parsed_css.cssRules
129+
for selector in rule.selectorList]):
130+
raise DisallowedStyle(DisallowedStyle.OUT_OF_SCOPE)

naucse/views.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@
2424
from naucse.utils.links import (process_course_data, process_session_data, process_page_data, process_footer_data,
2525
InvalidInfo)
2626
from naucse.utils.models import arca
27-
from naucse.utils.views import (get_recent_runs, list_months, DisallowedStyle,
28-
DisallowedElement, does_course_return_info, absolute_urls_to_freeze,
29-
raise_errors_from_forks, page_content_cache_key, InvalidHTML, get_edit_info)
27+
from naucse.utils.views import get_recent_runs, list_months
28+
from naucse.utils.views import does_course_return_info
29+
from naucse.utils.views import absolute_urls_to_freeze, raise_errors_from_forks
30+
from naucse.utils.views import page_content_cache_key, get_edit_info
31+
from naucse.validation import DisallowedStyle, DisallowedElement, InvalidHTML
3032

3133
# so it can be mocked
3234
import naucse.utils.views

test_naucse/test_validation.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import pytest
2+
3+
import naucse.validation
4+
5+
6+
def test_allowed_elements():
7+
allowed_elements = naucse.validation.AllowedElementsParser()
8+
9+
allowed_elements.reset_and_feed(
10+
"<div><strong><u><a>Test</a></u></div>"
11+
)
12+
13+
with pytest.raises(naucse.validation.DisallowedElement):
14+
allowed_elements.reset_and_feed(
15+
"<div><script>alert('XSS')</script></div>"
16+
)
17+
18+
19+
def test_allowed_styles():
20+
allowed_elements = naucse.validation.AllowedElementsParser()
21+
22+
allowed_elements.reset_and_feed(
23+
"""
24+
<style>
25+
.dataframe thead tr:only-child th {
26+
text-align: right;
27+
}
28+
29+
</style>
30+
"""
31+
)
32+
33+
# valid styles, but wrong elements
34+
with pytest.raises(naucse.validation.DisallowedStyle):
35+
allowed_elements.reset_and_feed(
36+
"""
37+
<style>
38+
.green {
39+
color: green;
40+
}
41+
</style>
42+
"""
43+
)
44+
45+
# can't parse
46+
with pytest.raises(naucse.validation.DisallowedStyle):
47+
allowed_elements.reset_and_feed(
48+
"""
49+
<style>
50+
.green {
51+
color: green
52+
</style>
53+
"""
54+
)
55+
56+
# multiple selectors in one rule
57+
# valid:
58+
allowed_elements.reset_and_feed(
59+
"""
60+
<style>
61+
.dataframe .green, .dataframe .also-green {
62+
color: green;
63+
}
64+
</style>
65+
"""
66+
)
67+
68+
# invalid:
69+
with pytest.raises(naucse.validation.DisallowedStyle):
70+
allowed_elements.reset_and_feed(
71+
"""
72+
<style>
73+
.dataframe .green, .also-green {
74+
color: green;
75+
}
76+
</style>
77+
"""
78+
)

0 commit comments

Comments
 (0)