From 30bfd0bf1e7889f2df31390f60c12c93b53e76db Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sat, 11 Oct 2025 21:34:12 +0200 Subject: [PATCH] refactor(sanitizer): use a smarter approach wrt. SanitizeHTML The issue with the slices.Contains check is that tagStack keeps growing and never shrinks. An RSS entry could start with a bunch of random tags, and those would always be considered valid in the `case html.EndTagToken` branch. While we can't use a strict push/pop stack, as we might have mis-nested tags like `<b><i></b></i>` that are valid HTML, we can look in the slice for the last index of the tag that is being closed, and pop it along with everything after it. --- internal/reader/sanitizer/sanitizer.go | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/internal/reader/sanitizer/sanitizer.go b/internal/reader/sanitizer/sanitizer.go index d6f0e969ffd..93de850e729 100644 --- a/internal/reader/sanitizer/sanitizer.go +++ b/internal/reader/sanitizer/sanitizer.go @@ -202,6 +202,16 @@ type SanitizerOptions struct { OpenLinksInNewTab bool } +// lastIndex returns the index of the last matching element e present in s. +func lastIndex[T comparable](s []T, e T) int { + for i := len(s) - 1; i >= 0; i-- { + if s[i] == e { + return i + } + } + return -1 +} + func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string { var tagStack []string var parentTag string @@ -278,8 +288,13 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s } case html.EndTagToken: if len(blockedStack) == 0 { - if isValidTag(tagName) && slices.Contains(tagStack, tagName) { - buffer.WriteString("") + if isValidTag(tagName) { + // We can't use a strict push/pop queue, as we might have things like + // that are valid HTML. + if idx := lastIndex(tagStack, tagName); idx != -1 { + tagStack = tagStack[:idx] + buffer.WriteString("") + } } } else { if blockedStack[len(blockedStack)-1] == tagName {