From 2c26f9226fd78826d79b63ee0e0b5590ae86a0fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mislav=20Marohni=C4=87?=
Date: Sun, 15 Jun 2025 20:59:46 +0200
Subject: [PATCH] Preserve IDs & class names of unwrapped DIVs

Some documents use DIVs instead of P elements, or they wrap a single P
element. In both of these cases, readability will replace the DIV with
P, but in the process the ID of the original DIV would be discarded.
This would break the table of contents if it relied on that ID to link
to that section.

This change makes it so that the "id" and "class" attributes are
preserved when a DIV has been swapped out with a P.
---
 parser.go                             | 8 ++++++++
 test-pages/gmw/expected.html          | 2 +-
 test-pages/lazy-image-1/expected.html | 4 ++--
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/parser.go b/parser.go
index 6e9ad45..0e8ae98 100644
--- a/parser.go
+++ b/parser.go
@@ -890,8 +890,16 @@ func (ps *Parser) grabArticle() *html.Node {
 				// the scoring algorithm with DIVs with are, in
 				// practice, paragraphs.
 				if ps.hasSingleTagInsideElement(node, "p") && ps.getLinkDensity(node) < 0.25 {
+					divID := dom.ID(node)
+					divClassName := dom.ClassName(node)
 					newNode := dom.Children(node)[0]
 					node, _ = dom.ReplaceChild(node.Parent, newNode, node)
+					if divID != "" && dom.ID(node) == "" {
+						dom.SetAttribute(node, "id", divID)
+					}
+					if divClassName != "" && dom.ClassName(node) == "" {
+						dom.SetAttribute(node, "class", divClassName)
+					}
 					elementsToScore = append(elementsToScore, node)
 				} else if !ps.hasChildBlockElement(node) {
 					ps.setNodeTag(node, "p")
diff --git a/test-pages/gmw/expected.html b/test-pages/gmw/expected.html
index 6843c7d..79d1b75 100644
--- a/test-pages/gmw/expected.html
+++ b/test-pages/gmw/expected.html
@@ -51,7 +51,7 @@

-

[责任编辑:肖春芳]

+

[责任编辑:肖春芳]

\ No newline at end of file
diff --git a/test-pages/lazy-image-1/expected.html b/test-pages/lazy-image-1/expected.html
index a5bce5f..2e8e4ce 100644
--- a/test-pages/lazy-image-1/expected.html
+++ b/test-pages/lazy-image-1/expected.html
@@ -38,7 +38,7 @@

(for more details about V8 and its garbage collector you can read my previous article here)

-

+

Stay focused on the CPU!

@@ -61,7 +61,7 @@

CPU profiling: what’s the difference with CPU monitoring?

-

+

“Most commonly, profiling information serves to aid program optimization. Profiling is achieved by instrumenting either the program source code or its binary executable form using a tool called a profiler”