22import base64
33import nh3
44from plane .utils .exception_logger import log_exception
5+ from bs4 import BeautifulSoup
6+ from collections import defaultdict
7+
58
69# Maximum allowed size for binary data (10MB)
710MAX_SIZE = 10 * 1024 * 1024
1922
2023def validate_binary_data (data ):
2124 """
22- Validate that binary data appears to be valid document format and doesn't contain malicious content.
25+ Validate that binary data appears to be a valid document format
26+ and doesn't contain malicious content.
2327
2428 Args:
2529 data (bytes or str): The binary data to validate, or base64-encoded string
@@ -60,6 +64,141 @@ def validate_binary_data(data):
6064 return True , None
6165
6266
# Combine custom components and editor-specific nodes into a single set of tags.
#
# nh3 checks this allow-list against element names produced by an HTML5
# parser, and the HTML tokenizer ASCII-lowercases every tag name. A camelCase
# entry such as "imageComponent" can therefore never match the parsed element
# name "imagecomponent". The lowercase spelling of every entry is added below
# so the camelCase entries actually take effect; the original spellings are
# kept so anything else that inspects this set keeps working.
CUSTOM_TAGS = {
    # editor node/tag names
    "imageComponent",
    "image",
    "mention",
    "link",
    "customColor",
    "emoji",
    "tableHeader",
    "tableCell",
    "tableRow",
    "codeBlock",
    "code",
    "horizontalRule",
    "calloutComponent",
    # component-style tag used by editor embeds
    "image-component",
}
# The comprehension is fully evaluated before the in-place union runs, so the
# set is not mutated while it is being iterated.
CUSTOM_TAGS |= {tag.lower() for tag in CUSTOM_TAGS}

# Final tag allow-list handed to nh3.clean(): nh3's defaults plus editor tags.
ALLOWED_TAGS = nh3.ALLOWED_TAGS | CUSTOM_TAGS
87+
def _with_lowercase_names(attributes):
    """
    Return a copy of an nh3 attribute allow-list that also accepts lowercase names.

    The HTML tokenizer ASCII-lowercases tag and attribute names before the
    sanitizer compares them with the allow-list, so camelCase keys/values
    (e.g. "tableCell" / "textColor") would otherwise never match. Every
    original spelling is preserved; lowercase variants are only added.

    Args:
        attributes: mapping of tag name (or "*") -> set of attribute names.

    Returns:
        dict mapping each original tag name and its lowercase form to a set
        containing the original attribute names plus their lowercase forms.
    """
    normalized = {}
    for tag, names in attributes.items():
        expanded = set(names) | {name.lower() for name in names}
        # setdefault + update so that two spellings of the same tag that
        # lowercase to one key are merged instead of overwriting each other.
        normalized.setdefault(tag, set()).update(expanded)
        normalized.setdefault(tag.lower(), set()).update(expanded)
    return normalized


# Attributes allowed per tag; the "*" entry applies to every tag.
# NOTE(review): this mapping is passed to nh3.clean() as-is and does not
# include nh3.ALLOWED_ATTRIBUTES, so nh3's default per-tag attributes
# (e.g. for <img>) are presumably not allowed unless listed here - confirm
# that this is intended.
ATTRIBUTES = _with_lowercase_names(
    {
        "*": {
            "class",
            "id",
            "title",
            "role",
            "aria-label",
            "aria-hidden",
            "style",
            # Common editor data-* attributes seen in stored HTML. Attribute
            # names here are matched exactly (no "data-*" wildcard), so every
            # known key is listed explicitly.
            # NOTE(review): no dynamic collection of data-* attributes from
            # the input is visible in this module; nh3.clean() also appears to
            # accept generic_attribute_prefixes={"data-"} - confirm which
            # approach is wanted.
            "data-tight",
            "data-node-type",
            "data-type",
            "data-checked",
            "data-background",
            "data-text-color",
            "data-icon-name",
            "data-icon-color",
            "data-background-color",
            "data-emoji-unicode",
            "data-emoji-url",
            "data-logo-in-use",
            "data-block-type",
            "data-name",
            "data-entity-id",
            "data-entity-group-id",
        },
        "a": {"href", "target"},
        # editor node/tag attributes
        "imageComponent": {"id", "width", "height", "aspectRatio", "src", "alignment"},
        "image": {"width", "height", "aspectRatio", "alignment", "src", "alt", "title"},
        "mention": {"id", "entity_identifier", "entity_name"},
        "link": {"href", "target"},
        "customColor": {"color", "backgroundColor"},
        "emoji": {"name"},
        "tableHeader": {"colspan", "rowspan", "colwidth", "background", "hideContent"},
        "tableCell": {
            "colspan",
            "rowspan",
            "colwidth",
            "background",
            "textColor",
            "hideContent",
        },
        "tableRow": {"background", "textColor"},
        "codeBlock": {"language"},
        "calloutComponent": {
            "data-icon-color",
            "data-icon-name",
            "data-emoji-unicode",
            "data-emoji-url",
            "data-logo-in-use",
            "data-background",
            "data-block-type",
        },
        # image-component (from editor extension and seeds)
        "image-component": {"src", "id", "width", "height", "aspectratio", "alignment"},
    }
)

# URL schemes that survive sanitization (passed to nh3.clean as url_schemes).
SAFE_PROTOCOLS = {"http", "https", "mailto", "tel"}
151+
152+
def _compute_html_sanitization_diff(before_html: str, after_html: str):
    """
    Summarize what sanitization stripped from a piece of HTML.

    Both documents are parsed with BeautifulSoup's "html.parser" and compared
    per lowercased tag name. The comparison is coarse: attributes are
    aggregated over all elements sharing a tag name, so an attribute only
    counts as removed when no element of that tag carries it afterwards.

    Args:
        before_html: the original markup (a falsy value is treated as "").
        after_html: the sanitized markup (a falsy value is treated as "").

    Returns a dict with:
    - removed_tags: mapping[tag] -> how many fewer occurrences exist afterwards
    - removed_attributes: mapping[tag] -> sorted list of attribute names removed

    Any exception raised while diffing is swallowed and an empty result is
    returned instead.
    """
    try:

        def summarize(markup):
            # Per-tag occurrence counts and the union of attribute names
            # seen on elements of each tag.
            occurrences = defaultdict(int)
            attribute_names = defaultdict(set)
            document = BeautifulSoup(markup or "", "html.parser")
            for node in document.find_all(True):
                name = (node.name or "").lower()
                if name:
                    occurrences[name] += 1
                    for key in node.attrs:
                        if isinstance(key, str) and key:
                            attribute_names[name].add(key.lower())
            return occurrences, attribute_names

        tags_before, attrs_before = summarize(before_html)
        tags_after, attrs_after = summarize(after_html)

        # Only tags whose count dropped are reported, with the size of the drop.
        removed_tags = {
            name: seen - tags_after.get(name, 0)
            for name, seen in tags_before.items()
            if tags_after.get(name, 0) < seen
        }

        removed_attributes = {}
        for name, had in attrs_before.items():
            lost = had.difference(attrs_after.get(name, set()))
            if lost:
                removed_attributes[name] = sorted(lost)

        return {"removed_tags": removed_tags, "removed_attributes": removed_attributes}
    except Exception:
        # Best-effort only; if diffing fails we don't block the request
        return {"removed_tags": {}, "removed_attributes": {}}
200+
201+
63202def validate_html_content (html_content : str ):
64203 """
65204 Sanitize HTML content using nh3.
@@ -73,7 +212,25 @@ def validate_html_content(html_content: str):
73212 return False , "HTML content exceeds maximum size limit (10MB)" , None
74213
75214 try :
76- clean_html = nh3 .clean (html_content )
215+ clean_html = nh3 .clean (
216+ html_content ,
217+ tags = ALLOWED_TAGS ,
218+ attributes = ATTRIBUTES ,
219+ url_schemes = SAFE_PROTOCOLS ,
220+ )
221+ # Report removals to logger (Sentry) if anything was stripped
222+ diff = _compute_html_sanitization_diff (html_content , clean_html )
223+ if diff .get ("removed_tags" ) or diff .get ("removed_attributes" ):
224+ try :
225+ import json
226+
227+ summary = json .dumps (diff )
228+ except Exception :
229+ summary = str (diff )
230+ log_exception (
231+ f"HTML sanitization removals: { summary } " ,
232+ warning = True ,
233+ )
77234 return True , None , clean_html
78235 except Exception as e :
79236 log_exception (e )
0 commit comments