Skip to content

Commit 3a62de8

Browse files
committed
HTML/Markdown: Attempting to scramble e-mail addresses.
1 parent 2d38461 commit 3a62de8

File tree

1 file changed

+74
-25
lines changed

1 file changed

+74
-25
lines changed

src/xml.lua2p

Lines changed: 74 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,8 @@ local HTML_END_TAG_PATTERNS_FOR_TAGS_WITH_UNENCODED_CONTENTS = {
9191

9292

9393
local xml = {
94-
htmlAllowNoAttributeValue = true, -- @Doc (Maybe this should be an argument for toHtml()?)
94+
htmlAllowNoAttributeValue = true, -- @Doc (Maybe this should be an argument for toHtml()?)
95+
htmlScrambleEmailAddresses = true, -- @Doc (Maybe this should be an argument for toHtml()?)
9596
}
9697
xml.__index = xml
9798

@@ -128,7 +129,7 @@ end
128129
local function insertNode(el, node)
129130
if node == "" then
130131
-- void
131-
elseif !!(IS_TEXT`node`) and !!(IS_TEXT`el[#el]`) then
132+
elseif @@IS_TEXT(node) and @@IS_TEXT(el[#el]) then
132133
el[#el] = el[#el] .. node -- @Speed
133134
else
134135
table.insert(el, node)
@@ -697,7 +698,7 @@ xml.set_attribs = xml.updateAttributes -- :PenlightCompatibility
697698
do
698699
local function _getText(buffer, el)
699700
for _, childNode in ipairs(el) do
700-
if !!(IS_TEXT`childNode`) then
701+
if @@IS_TEXT(childNode) then
701702
table.insert(buffer, childNode)
702703
else
703704
_getText(buffer, childNode)
@@ -716,7 +717,7 @@ end
716717
function xml.getTextOfDirectChildren(el)
717718
local buffer = {}
718719
for _, childNode in ipairs(el) do
719-
if !!(IS_TEXT`childNode`) then table.insert(buffer, childNode) end
720+
if @@IS_TEXT(childNode) then table.insert(buffer, childNode) end
720721
end
721722
return table.concat(buffer)
722723
end
@@ -732,7 +733,7 @@ do
732733
end
733734

734735
for _, childNode in ipairs(el) do
735-
if !!(IS_TEXT`childNode`) then
736+
if @@IS_TEXT(childNode) then
736737
table.insert(buffer, childNode)
737738
else
738739
_getHtmlText(buffer, childNode)
@@ -800,8 +801,8 @@ do
800801
if node == nil then node = "" end
801802

802803
-- Attribute string matching is straight equality, except if the pattern is a $ capture, which always succeeds.
803-
if !!(IS_TEXT`node`) then
804-
if not !!(IS_TEXT`patEl`) then return false end
804+
if @@IS_TEXT(node) then
805+
if not @@IS_TEXT(patEl) then return false end
805806

806807
-- print(node, patEl) -- DEBUG
807808

@@ -854,7 +855,7 @@ do
854855

855856
local function advanceElement()
856857
elChildI = elChildI + 1 -- Next child element of data.
857-
while el[elChildI] and !!(IS_TEXT`el[elChildI]`) do
858+
while el[elChildI] and @@IS_TEXT(el[elChildI]) do
858859
elChildI = elChildI + 1
859860
end
860861
return elChildI <= #el
@@ -917,7 +918,7 @@ do
917918

918919
xml.walk(patEl, false, function(tagName, currentEl)
919920
if
920-
!!(IS_TEXT`currentEl[1]`) and xml.isElement(currentEl[2]) and !!(IS_TEXT`currentEl[3]`)
921+
@@IS_TEXT(currentEl[1]) and xml.isElement(currentEl[2]) and @@IS_TEXT(currentEl[3])
921922
and currentEl[1]:find"%s*{{" and currentEl[3]:find"}}%s*"
922923
then
923924
table.remove(currentEl, 3)
@@ -1087,7 +1088,7 @@ local function shouldEncodeAsCdata(s)
10871088
end
10881089

10891090
local function nodeToXml(buffer, node)
1090-
if !!(IS_TEXT`node`) then
1091+
if @@IS_TEXT(node) then
10911092
if shouldEncodeAsCdata(node) then
10921093
table.insert(buffer, "<![CDATA[")
10931094
table.insert(buffer, node)
@@ -1140,7 +1141,7 @@ end
11401141

11411142
-- xmlString = xml.contentsToXml( node )
11421143
function xml.contentsToXml(node)
1143-
if !!(IS_TEXT`node`) then return "" end
1144+
if @@IS_TEXT(node) then return "" end
11441145

11451146
local buffer = {}
11461147
for _, childNode in ipairs(node) do
@@ -1152,7 +1153,7 @@ end
11521153

11531154

11541155
local function nodeToXmlPretty(buffer, node, initialIndent,indent,attrIndent, indentTags)
1155-
if !!(IS_TEXT`node`) then
1156+
if @@IS_TEXT(node) then
11561157
if not node:find"%S" then
11571158
-- void
11581159
elseif shouldEncodeAsCdata(node) then
@@ -1245,9 +1246,52 @@ xml.__tostring = xml.toPrettyXml -- :PenlightCompatibility
12451246

12461247

12471248

1248-
local function nodeToHtml(buffer, node)
1249-
if !!(IS_TEXT`node`) then
1250-
table.insert(buffer, xml.encodeMoreEntities(node))
1249+
-- encodeEmailValue( buffer, s )
-- Append an obfuscated copy of the string 's' to the array 'buffer', one
-- array item per byte of 's'.
--
-- Encode characters as a mix of decimal and hexadecimal entities to
-- increase the chance of fooling address-harvesting bots.
--
-- We use a deterministic encoding to make unit testing possible.
-- We encode roughly 40% hex, 40% dec, 20% plain.
--
-- This algorithm comes from Markdown.pl by John Gruber and was based on a
-- filter that Matthew Wickline wrote on some mailing list in the ancient
-- times. Isn't programming fun?
--
-- Only ASCII bytes are scrambled. Bytes >= 128 are parts of multi-byte UTF-8
-- sequences, and a numeric character reference names a code point, not a
-- byte, so encoding them one by one would produce the wrong characters.
-- Those bytes are emitted verbatim and do not take part in the rotation.
local function encodeEmailValue(buffer, s)
	local encoderHex   = {count=1, rate=.40, encode=function(c) return string.format("&#x%x;", c:byte()) end}
	local encoderDec   = {count=0, rate=.40, encode=function(c) return string.format("&#%d;", c:byte()) end}
	local encoderPlain = {count=0, rate=.20, encode=function(c) return c end}
	local encoders     = {encoderHex, encoderDec, encoderPlain}

	-- Characters that must never be emitted as-is: "@" to make the address
	-- less visible, the rest because they have special meaning in HTML text
	-- and attribute values.
	local alwaysEncode = "@&<>\"'"

	for pos = 1, #s do
		local c = s:sub(pos, pos)

		if c:byte() >= 128 then
			-- Part of a multi-byte UTF-8 sequence. Keep the raw byte.
			table.insert(buffer, c)

		else
			for _, encoder in ipairs(encoders) do
				encoder.count = encoder.count + encoder.rate
			end

			-- Three compare-and-swap steps sort the three encoders by descending count.
			if encoders[2].count > encoders[1].count then  encoders[1], encoders[2] = encoders[2], encoders[1]  end
			if encoders[3].count > encoders[2].count then  encoders[2], encoders[3] = encoders[3], encoders[2]  end
			if encoders[2].count > encoders[1].count then  encoders[1], encoders[2] = encoders[2], encoders[1]  end

			local encoder = encoders[1]

			-- If the plain encoder won, fall back to the runner-up for characters
			-- that must be encoded. (The runner-up is always hex or dec.)
			if encoder == encoderPlain and alwaysEncode:find(c, 1, true) then
				encoder = encoders[2]
			end

			table.insert(buffer, encoder.encode(c))
			encoder.count = encoder.count - 1 -- The encoder that was actually used pays for the character.
		end
	end
end
1285+
1286+
local function nodeToHtml(buffer, node, encodeTextAsEmail)
1287+
if @@IS_TEXT(node) then
1288+
if encodeTextAsEmail and node:find("@", 1, true) then
1289+
-- We just assume the text is an e-mail address. It's possible
1290+
-- it's not. It's also possible the node is just "@".
1291+
encodeEmailValue(buffer, node)
1292+
else
1293+
table.insert(buffer, xml.encodeMoreEntities(node))
1294+
end
12511295
return
12521296
end
12531297

@@ -1270,7 +1314,12 @@ local function nodeToHtml(buffer, node)
12701314

12711315
if not (attrValue == "" and allowNoAttrValue) then
12721316
table.insert(buffer, '="')
1273-
table.insert(buffer, xml.encodeMoreEntities(attrValue))
1317+
if attrName == "href" and el.tag == "a" and attrValue:find"^[Mm][Aa][Ii][Ll][Tt][Oo]:" then
1318+
encodeEmailValue(buffer, attrValue)
1319+
encodeTextAsEmail = xml.htmlScrambleEmailAddresses
1320+
else
1321+
table.insert(buffer, xml.encodeMoreEntities(attrValue))
1322+
end
12741323
table.insert(buffer, '"')
12751324
end
12761325
end
@@ -1288,7 +1337,7 @@ local function nodeToHtml(buffer, node)
12881337
end
12891338
else
12901339
for _, childNode in ipairs(el) do
1291-
nodeToHtml(buffer, childNode)
1340+
nodeToHtml(buffer, childNode, encodeTextAsEmail)
12921341
end
12931342
end
12941343
end
@@ -1309,17 +1358,17 @@ function xml.toHtml(node, preface)
13091358
if preface then
13101359
buffer[1] = (type(preface) == "string") and preface or !(HTML_STANDARD_PREFACE.."\n")
13111360
end
1312-
nodeToHtml(buffer, node)
1361+
nodeToHtml(buffer, node, false)
13131362
return table.concat(buffer, "")
13141363
end
13151364

13161365
-- htmlString = xml.contentsToHtml( node )
13171366
function xml.contentsToHtml(node)
1318-
if !!(IS_TEXT`node`) then return "" end
1367+
if @@IS_TEXT(node) then return "" end
13191368

13201369
local buffer = {}
13211370
for _, childNode in ipairs(node) do
1322-
nodeToHtml(buffer, childNode)
1371+
nodeToHtml(buffer, childNode, false)
13231372
end
13241373
return table.concat(buffer, "")
13251374
end
@@ -1345,7 +1394,7 @@ function xml.parseXml(s, pathForError)
13451394
end
13461395

13471396
for _, node in ipairs(docWrapper) do
1348-
if not !!(IS_TEXT`node`) then
1397+
if not @@IS_TEXT(node) then
13491398
if doc then
13501399
fileError(pathForError, s, #s+1, "There are multiple root elements.")
13511400
end
@@ -1384,7 +1433,7 @@ function xml.parseHtml(s, pathForError)
13841433
end
13851434

13861435
for _, node in ipairs(docWrapper) do
1387-
if not !!(IS_TEXT`node`) then
1436+
if not @@IS_TEXT(node) then
13881437
if doc then
13891438
fileError(pathForError, s, #s, "There are multiple root elements.")
13901439
end
@@ -1527,7 +1576,7 @@ function xml.element(tagName, prototype)
15271576
if not prototype then
15281577
-- void
15291578

1530-
elseif !!(IS_TEXT`prototype`) or xml.isElement(prototype) then
1579+
elseif @@IS_TEXT(prototype) or xml.isElement(prototype) then
15311580
el[1] = prototype
15321581

15331582
else
@@ -1548,7 +1597,7 @@ xml.elem = xml.element -- :PenlightCompatibility
15481597

15491598
do
15501599
local function _clone(node, textSubstCallback, parentEl)
1551-
if !!(IS_TEXT`node`) then
1600+
if @@IS_TEXT(node) then
15521601
if textSubstCallback then
15531602
node = textSubstCallback(node, "*TEXT", parentEl) or errorf("No value returned from text substitution callback for '*TEXT'.")
15541603
end
@@ -1595,7 +1644,7 @@ end
15951644
-- nodesLookEqual = xml.compare( value1, value2 )
15961645
-- Returns false if any value is not a node.
15971646
function xml.compare(v1, v2)
1598-
if !!(IS_TEXT`v1`) and !!(IS_TEXT`v2`) then
1647+
if @@IS_TEXT(v1) and @@IS_TEXT(v2) then
15991648
return v1 == v2
16001649
end
16011650
if not (xml.isElement(v1) and xml.isElement(v2)) then

0 commit comments

Comments
 (0)