Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 159 additions & 2 deletions apps/api/plane/utils/content_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
import base64
import nh3
from plane.utils.exception_logger import log_exception
from bs4 import BeautifulSoup
from collections import defaultdict


# Maximum allowed size for binary document data (10 MB); larger payloads are rejected.
MAX_SIZE = 10 * 1024 * 1024
Expand All @@ -19,7 +22,8 @@

def validate_binary_data(data):
"""
Validate that binary data appears to be valid document format and doesn't contain malicious content.
Validate that binary data appears to be a valid document format
and doesn't contain malicious content.

Args:
data (bytes or str): The binary data to validate, or base64-encoded string
Expand Down Expand Up @@ -60,6 +64,141 @@ def validate_binary_data(data):
return True, None


# Combine custom components and editor-specific nodes into a single set of tags.
# These are merged with nh3's default allow-list below so the sanitizer keeps
# the rich-text editor's custom elements instead of stripping them.
CUSTOM_TAGS = {
    # editor node/tag names
    "imageComponent",
    "image",
    "mention",
    "link",
    "customColor",
    "emoji",
    "tableHeader",
    "tableCell",
    "tableRow",
    "codeBlock",
    "code",
    "horizontalRule",
    "calloutComponent",
    # component-style tag used by editor embeds
    "image-component",
}
ALLOWED_TAGS = nh3.ALLOWED_TAGS | CUSTOM_TAGS

# Merge nh3 defaults with all attributes used across our custom components.
# Keys are tag names ("*" applies to every tag); values are the attribute
# names nh3 will preserve on that tag. Anything not listed is stripped.
ATTRIBUTES = {
    "*": {
        "class",
        "id",
        "title",
        "role",
        "aria-label",
        "aria-hidden",
        "style",
        # common editor data-* attributes seen in stored HTML
        # (wildcard patterns like data-* are NOT supported by nh3, so every
        # known data-* key must be listed here explicitly; unlisted data-*
        # attributes will be removed by sanitization)
        "data-tight",
        "data-node-type",
        "data-type",
        "data-checked",
        "data-background",
        "data-text-color",
        "data-icon-name",
        "data-icon-color",
        "data-background-color",
        "data-emoji-unicode",
        "data-emoji-url",
        "data-logo-in-use",
        "data-block-type",
        "data-name",
        "data-entity-id",
        "data-entity-group-id",
    },
    "a": {"href", "target"},
    # editor node/tag attributes
    "imageComponent": {"id", "width", "height", "aspectRatio", "src", "alignment"},
    "image": {"width", "height", "aspectRatio", "alignment", "src", "alt", "title"},
    "mention": {"id", "entity_identifier", "entity_name"},
    "link": {"href", "target"},
    "customColor": {"color", "backgroundColor"},
    "emoji": {"name"},
    "tableHeader": {"colspan", "rowspan", "colwidth", "background", "hideContent"},
    "tableCell": {
        "colspan",
        "rowspan",
        "colwidth",
        "background",
        "textColor",
        "hideContent",
    },
    "tableRow": {"background", "textColor"},
    "codeBlock": {"language"},
    "calloutComponent": {
        "data-icon-color",
        "data-icon-name",
        "data-emoji-unicode",
        "data-emoji-url",
        "data-logo-in-use",
        "data-background",
        "data-block-type",
    },
    # image-component (from editor extension and seeds)
    "image-component": {"src", "id", "width", "height", "aspectratio", "alignment"},
}

SAFE_PROTOCOLS = {"http", "https", "mailto", "tel"}


def _compute_html_sanitization_diff(before_html: str, after_html: str):
    """
    Compute a coarse diff between original and sanitized HTML.

    Returns a dict with:
      - removed_tags: mapping[tag] -> removed_count
      - removed_attributes: mapping[tag] -> sorted list of attribute names removed
    """
    try:

        def _summarize(markup):
            # Per-tag occurrence counts plus the set of lowercase attribute
            # names observed on each tag, gathered in a single pass.
            counts = defaultdict(int)
            attrs = defaultdict(set)
            for node in BeautifulSoup(markup or "", "html.parser").find_all(True):
                name = (node.name or "").lower()
                if not name:
                    continue
                counts[name] += 1
                for key in node.attrs:
                    if isinstance(key, str) and key:
                        attrs[name].add(key.lower())
            return counts, attrs

        counts_before, attrs_before = _summarize(before_html)
        counts_after, attrs_after = _summarize(after_html)

        # Tags whose occurrence count dropped after sanitization.
        removed_tags = {
            tag: count - counts_after.get(tag, 0)
            for tag, count in counts_before.items()
            if counts_after.get(tag, 0) < count
        }

        # Attribute names present before sanitization but absent afterwards,
        # grouped by tag and sorted for stable output.
        removed_attributes = {}
        for tag, names_before in attrs_before.items():
            lost = names_before - attrs_after.get(tag, set())
            if lost:
                removed_attributes[tag] = sorted(lost)

        return {"removed_tags": removed_tags, "removed_attributes": removed_attributes}
    except Exception:
        # Best-effort only; if diffing fails we don't block the request
        return {"removed_tags": {}, "removed_attributes": {}}


def validate_html_content(html_content: str):
"""
Sanitize HTML content using nh3.
Expand All @@ -73,7 +212,25 @@ def validate_html_content(html_content: str):
return False, "HTML content exceeds maximum size limit (10MB)", None

try:
clean_html = nh3.clean(html_content)
clean_html = nh3.clean(
html_content,
tags=ALLOWED_TAGS,
attributes=ATTRIBUTES,
url_schemes=SAFE_PROTOCOLS,
)
# Report removals to logger (Sentry) if anything was stripped
diff = _compute_html_sanitization_diff(html_content, clean_html)
if diff.get("removed_tags") or diff.get("removed_attributes"):
try:
import json

summary = json.dumps(diff)
except Exception:
summary = str(diff)
log_exception(
f"HTML sanitization removals: {summary}",
warning=True,
)
return True, None, clean_html
except Exception as e:
log_exception(e)
Expand Down
Loading