Skip to content

Commit 9965fc7

Browse files
[WEB-4806] chore: improved the html validation (#7676)
* chore: improved the html validation
* chore: removed the models changes
* chore: removed extra filters
* chore: changed the protocols
1 parent cb1e93f commit 9965fc7

File tree

1 file changed

+159
-2
lines changed

1 file changed

+159
-2
lines changed

apps/api/plane/utils/content_validator.py

Lines changed: 159 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
import base64
33
import nh3
44
from plane.utils.exception_logger import log_exception
5+
from bs4 import BeautifulSoup
6+
from collections import defaultdict
7+
58

69
# Maximum allowed size for binary data (10MB)
710
MAX_SIZE = 10 * 1024 * 1024
@@ -19,7 +22,8 @@
1922

2023
def validate_binary_data(data):
2124
"""
22-
Validate that binary data appears to be valid document format and doesn't contain malicious content.
25+
Validate that binary data appears to be a valid document format
26+
and doesn't contain malicious content.
2327
2428
Args:
2529
data (bytes or str): The binary data to validate, or base64-encoded string
@@ -60,6 +64,141 @@ def validate_binary_data(data):
6064
return True, None
6165

6266

67+
# Editor-specific tag names accepted in addition to nh3's default tags.
# NOTE(review): HTML parsers normally lower-case tag names, so the camelCase
# entries below may never match parsed input -- confirm against nh3 behaviour.
CUSTOM_TAGS = {
    # editor node/tag names
    "imageComponent", "image",
    "mention", "link",
    "customColor", "emoji",
    "tableHeader", "tableCell", "tableRow",
    "codeBlock", "code",
    "horizontalRule",
    "calloutComponent",
    # component-style tag used by editor embeds
    "image-component",
}
86+
# Full tag allow-list: nh3's default tags plus the editor-specific CUSTOM_TAGS.
ALLOWED_TAGS = nh3.ALLOWED_TAGS | CUSTOM_TAGS
87+
88+
# Attribute allow-list keyed by tag name; the "*" entry applies to every tag.
# NOTE(review): this literal does not itself merge nh3.ALLOWED_ATTRIBUTES, so
# when it is passed as `attributes=` any nh3 default not repeated here is
# presumably dropped -- confirm that is intended.
ATTRIBUTES = {
    "*": {
        # generic presentation / accessibility attributes
        "class", "id", "title", "role",
        "aria-label", "aria-hidden",
        "style",
        # Known editor data-* attributes seen in stored HTML. They are listed
        # one by one because a "data-*" wildcard key is not used here.
        # NOTE(review): no code in view adds further data-* keys dynamically;
        # nh3's `generic_attribute_prefixes` option may be an alternative --
        # confirm.
        "data-tight",
        "data-node-type", "data-type", "data-block-type",
        "data-checked",
        "data-background", "data-background-color", "data-text-color",
        "data-icon-name", "data-icon-color",
        "data-emoji-unicode", "data-emoji-url",
        "data-logo-in-use",
        "data-name",
        "data-entity-id", "data-entity-group-id",
    },
    "a": {"href", "target"},
    # editor node/tag attributes
    "imageComponent": {
        "id",
        "width",
        "height",
        "aspectRatio",
        "src",
        "alignment",
    },
    "image": {
        "width",
        "height",
        "aspectRatio",
        "alignment",
        "src",
        "alt",
        "title",
    },
    "mention": {"id", "entity_identifier", "entity_name"},
    "link": {"href", "target"},
    "customColor": {"color", "backgroundColor"},
    "emoji": {"name"},
    "tableHeader": {
        "colspan",
        "rowspan",
        "colwidth",
        "background",
        "hideContent",
    },
    "tableCell": {
        "colspan", "rowspan", "colwidth",
        "background", "textColor", "hideContent",
    },
    "tableRow": {"background", "textColor"},
    "codeBlock": {"language"},
    "calloutComponent": {
        "data-icon-color", "data-icon-name",
        "data-emoji-unicode", "data-emoji-url",
        "data-logo-in-use",
        "data-background", "data-block-type",
    },
    # image-component (from editor extension and seeds)
    "image-component": {
        "src",
        "id",
        "width",
        "height",
        "aspectratio",
        "alignment",
    },
}
149+
150+
# URL schemes that are allowed to remain in sanitized HTML.
SAFE_PROTOCOLS = {
    "http",
    "https",
    "mailto",
    "tel",
}
151+
152+
153+
def _compute_html_sanitization_diff(before_html: str, after_html: str):
154+
"""
155+
Compute a coarse diff between original and sanitized HTML.
156+
157+
Returns a dict with:
158+
- removed_tags: mapping[tag] -> removed_count
159+
- removed_attributes: mapping[tag] -> sorted list of attribute names removed
160+
"""
161+
try:
162+
163+
def collect(soup):
164+
tag_counts = defaultdict(int)
165+
attrs_by_tag = defaultdict(set)
166+
for el in soup.find_all(True):
167+
tag_name = (el.name or "").lower()
168+
if not tag_name:
169+
continue
170+
tag_counts[tag_name] += 1
171+
for attr_name in list(el.attrs.keys()):
172+
if isinstance(attr_name, str) and attr_name:
173+
attrs_by_tag[tag_name].add(attr_name.lower())
174+
return tag_counts, attrs_by_tag
175+
176+
soup_before = BeautifulSoup(before_html or "", "html.parser")
177+
soup_after = BeautifulSoup(after_html or "", "html.parser")
178+
179+
counts_before, attrs_before = collect(soup_before)
180+
counts_after, attrs_after = collect(soup_after)
181+
182+
removed_tags = {}
183+
for tag, cnt_before in counts_before.items():
184+
cnt_after = counts_after.get(tag, 0)
185+
if cnt_after < cnt_before:
186+
removed = cnt_before - cnt_after
187+
removed_tags[tag] = removed
188+
189+
removed_attributes = {}
190+
for tag, before_set in attrs_before.items():
191+
after_set = attrs_after.get(tag, set())
192+
removed = before_set - after_set
193+
if removed:
194+
removed_attributes[tag] = sorted(list(removed))
195+
196+
return {"removed_tags": removed_tags, "removed_attributes": removed_attributes}
197+
except Exception:
198+
# Best-effort only; if diffing fails we don't block the request
199+
return {"removed_tags": {}, "removed_attributes": {}}
200+
201+
63202
def validate_html_content(html_content: str):
64203
"""
65204
Sanitize HTML content using nh3.
@@ -73,7 +212,25 @@ def validate_html_content(html_content: str):
73212
return False, "HTML content exceeds maximum size limit (10MB)", None
74213

75214
try:
76-
clean_html = nh3.clean(html_content)
215+
clean_html = nh3.clean(
216+
html_content,
217+
tags=ALLOWED_TAGS,
218+
attributes=ATTRIBUTES,
219+
url_schemes=SAFE_PROTOCOLS,
220+
)
221+
# Report removals to logger (Sentry) if anything was stripped
222+
diff = _compute_html_sanitization_diff(html_content, clean_html)
223+
if diff.get("removed_tags") or diff.get("removed_attributes"):
224+
try:
225+
import json
226+
227+
summary = json.dumps(diff)
228+
except Exception:
229+
summary = str(diff)
230+
log_exception(
231+
f"HTML sanitization removals: {summary}",
232+
warning=True,
233+
)
77234
return True, None, clean_html
78235
except Exception as e:
79236
log_exception(e)

0 commit comments

Comments
 (0)