Skip to content

Commit 962f88a

Browse files
committed
summarize() handles more things better.
XML: Added Element.getHtmlText().
1 parent b229f28 commit 962f88a

File tree

2 files changed

+79
-8
lines changed

2 files changed

+79
-8
lines changed

src/app.lua2p

Lines changed: 52 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,10 @@ local function setup()
194194
if programArguments[i] then
195195
errorNoPos("[Arguments] Unknown argument '%s'.", programArguments[i])
196196
end
197+
198+
if not pathRel:find(".", 1, true) then
199+
pathRel = pathRel .. ".xml"
200+
end
197201
end
198202

199203
local contents = !(unindent[=[
@@ -235,7 +239,7 @@ local function setup()
235239
file:write(contents)
236240
file:close()
237241

238-
printf("Created page: %s", path)
242+
printf("Created feed: %s", path)
239243

240244
----------------
241245
elseif kind == "sitemap" then
@@ -816,34 +820,71 @@ local function setup()
816820
return t
817821
end,
818822

819-
-- html = summarize( html, maxCharacters [, keepAnchorsAndImages=false ] ) -- @Doc
823+
-- html = summarize( html, maxCharacters [, keepSomeElements=false ] ) -- @Doc
824+
-- keepSomeElements includes <a>, <code>, <img>, <math> and <svg>.
820825
summarize = function(html, maxChars, keepSomeElements)
821826
html = "<div>" .. html .. "</div>"
822827
local doc = assert(xmlLib.parseHtml(html))
823828
local protected
824829

830+
-- Remove (probably) unwanted elements.
831+
local elementsToRemove = {
832+
"dialog",
833+
"menu", -- (Non-standard.)
834+
"nav",
835+
"script",
836+
"style",
837+
"template",
838+
}
839+
if not keepSomeElements then
840+
table.insert(elementsToRemove, "math")
841+
table.insert(elementsToRemove, "svg")
842+
end
843+
844+
xmlLib.walk(doc, false, function(tag, el)
845+
for i, childNode in ipairsr(el) do
846+
if xmlLib.isElement(childNode) and isAny(childNode.tag, unpack(elementsToRemove)) then
847+
table.remove(el, i)
848+
end
849+
end
850+
end)
851+
825852
-- Protect certain things, like links.
853+
-- @Incomplete: Always protect things like <br>, <wbr>, <bdi> and ruby tags. (Also <sub>?)
854+
-- @Incomplete: Always protect attributes like 'dir'.
855+
-- @Incomplete: Protect things like <i> and <strong>.
856+
-- @Incomplete: Handle <a><img></a>.
826857
if keepSomeElements then
827858
protected = {--[[ element1, ... ]]}
828859

829860
xmlLib.walk(doc, false, function(tag, el)
830861
for i, childNode in ipairs(el) do
862+
-- @Robustness: Make sure $$PLACEHOLDER#$$ strings don't exist in the original content.
831863
if xmlLib.isText(childNode) then
832864
-- void
833865

834866
elseif childNode.tag == "a" and childNode.attr.href and not childNode.attr.href:find"^javascript:" then
835-
table.insert(protected, xmlLib.element("a", {href=childNode.attr.href, childNode:getText()}))
836-
el[i] = "$$PLACEHOLDER" .. #protected .. "$$" -- @Robustness: Make sure the string doesn't exist in the original content.
867+
table.insert(protected, xmlLib.element("a", {href=childNode.attr.href, childNode:getHtmlText()}))
868+
el[i] = "$$PLACEHOLDER" .. #protected .. "$$"
869+
870+
elseif childNode.tag == "code" then
871+
table.insert(protected, xmlLib.element("code", childNode:getHtmlText()))
872+
el[i] = "$$PLACEHOLDER" .. #protected .. "$$"
837873

838874
elseif childNode.tag == "img" and childNode.attr.src then
839875
table.insert(protected, xmlLib.element("img", {src=childNode.attr.src, alt=(childNode.attr.alt or "")}))
840-
el[i] = "$$PLACEHOLDER" .. #protected .. "$$" -- @Robustness: Make sure the string doesn't exist in the original content.
876+
el[i] = "$$PLACEHOLDER" .. #protected .. "$$"
877+
878+
elseif childNode.tag == "math" or childNode.tag == "svg" then
879+
table.insert(protected, childNode)
880+
el[i] = "$$PLACEHOLDER" .. #protected .. "$$"
841881
end
842882
end
843883
end)
844884
end
845885

846886
-- Make the content into a list of paragraphs or similar.
887+
-- @Incomplete: Handle tables better?
847888
for i, childNode in ipairsr(doc) do
848889
if xmlLib.isText(childNode) then
849890
table.remove(doc, i)
@@ -853,12 +894,16 @@ local function setup()
853894
doc[i] = xmlLib.newElement(childNode.tag)
854895

855896
for li in childNode:eachChildElement() do
856-
local text = trim(li:getText():gsub("%s+", " "))
897+
local text = trim(li:getHtmlText():gsub("%s+", " "))
857898
table.insert(doc[i], xmlLib.element("li", text))
858899
end
859900

901+
elseif childNode.tag == "pre" then
902+
local text = trim(childNode:getHtmlText():gsub("%s+", " "))
903+
doc[i] = xmlLib.element("pre", text)
904+
860905
else
861-
local text = trim(childNode:getText():gsub("%s+", " "))
906+
local text = trim(childNode:getHtmlText():gsub("%s+", " "))
862907
doc[i] = xmlLib.element("p", text)
863908
end
864909

src/xml.lua2p

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
Element.getAttributes, Element.setAttribute, Element.updateAttributes
3939
Element.getChildByName, Element.findAllElementsByName
4040
Element.getFirstElement
41-
Element.getText, Element.getTextOfDirectChildren
41+
Element.getText, Element.getTextOfDirectChildren, Element.getHtmlText
4242
Element.mapElements
4343
Element.match, Element.substitute
4444
Element.removeWhitespaceNodes
@@ -707,6 +707,32 @@ function Element.getTextOfDirectChildren(el)
707707
end
708708
Element.get_text = Element.getTextOfDirectChildren -- :PenlightCompatibility
709709

710+
do
711+
-- Recursive helper for Element.getHtmlText().
-- Appends the text found inside 'el' (in document order) to the array 'buffer'.
-- Nothing is returned; the result is accumulated in 'buffer'.
local function _getHtmlText(buffer, el)
712+
-- Elements whose tag is listed in HTML_EMPTY_ELEMENTS are never descended into.
if HTML_EMPTY_ELEMENTS[el.tag] then
713+
-- An <img> contributes its 'alt' attribute as its text.
if el.tag == "img" then
714+
table.insert(buffer, el.attr.alt) -- May be nil, in which case nothing is appended.
715+
end
716+
return
717+
end
718+
719+
-- Visit the child nodes in order: text is appended as-is, anything else is recursed into.
for _, childNode in ipairs(el) do
720+
-- IS_TEXT is a preprocessor macro defined elsewhere in the file; presumably it tests
-- whether the node is a text (string) node - TODO confirm.
if !!(IS_TEXT`childNode`) then
721+
table.insert(buffer, childNode)
722+
else
723+
_getHtmlText(buffer, childNode)
724+
end
725+
end
726+
end
727+
728+
-- text = Element:getHtmlText( ) -- @Doc
-- Returns the concatenated text of all text nodes inside the element (at any depth).
-- <img> elements contribute their 'alt' attribute (if set); other elements listed in
-- HTML_EMPTY_ELEMENTS contribute nothing. No separators are inserted between pieces.
function Element.getHtmlText(el)
729+
local buffer = {}
730+
_getHtmlText(buffer, el)
731+
return table.concat(buffer)
732+
end
733+
end
735+
710736

711737

712738
do

0 commit comments

Comments
 (0)