Skip to content

Commit 5f3a284

Browse files
Add character count (w/ and w/o spaces) to wordcount filter. (#98)
1 parent 477e4f6 commit 5f3a284

File tree

3 files changed

+22
-4
lines changed

3 files changed

+22
-4
lines changed

wordcount/README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
# wordcount
22

3-
This filter counts the words in the body of a document (omitting
3+
This filter counts the words and characters in the body of a document (omitting
44
metadata like titles and abstracts), including words in code.
5-
It should be more accurate than `wc -w` run directly on a
6-
Markdown document, since the latter will count markup
5+
It should be more accurate than `wc -w` or `wc -m` run directly on a
6+
Markdown document, since `wc` will also count markup
77
characters, like the `#` in front of an ATX header, or
8-
tags in HTML documents, as words.
8+
tags in HTML documents.
99

1010
To run it, `pandoc --lua-filter wordcount.lua myfile.md`.
1111
The word and character counts will be printed to stdout.

wordcount/expected.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
11
15 words in body
2+
68 characters in body
3+
79 characters in body (including spaces)

wordcount/wordcount.lua

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,45 @@
11
-- counts words and characters in a document
22

33
words = 0
4+
characters = 0
5+
characters_and_spaces = 0
46

57
-- Adds the statistics for a verbatim text (inline code or code block) to
-- the running totals: one word per whitespace-delimited token, one
-- character per non-whitespace code point, and one "character including
-- spaces" per code point of the raw text.
local function count_verbatim(text)
  -- gsub's second return value is the number of matches, i.e. the number
  -- of whitespace-delimited tokens.
  local _, tokens = text:gsub("%S+", "")
  words = words + tokens
  -- Assign to a local first: gsub returns two values, and passing both
  -- straight to utf8.len would make the match count its start index.
  local text_nospace = text:gsub("%s", "")
  characters = characters + utf8.len(text_nospace)
  characters_and_spaces = characters_and_spaces + utf8.len(text)
end

-- Element handlers that accumulate into the global counters `words`,
-- `characters` and `characters_and_spaces`.
wordcount = {
  Str = function(el)
    -- we don't count a word if it's entirely punctuation:
    if el.text:match("%P") then
      words = words + 1
    end
    -- utf8.len counts code points, so multi-byte characters count once.
    local len = utf8.len(el.text)
    characters = characters + len
    characters_and_spaces = characters_and_spaces + len
  end,

  -- An inter-word space contributes only to the "including spaces" total.
  Space = function(el)
    characters_and_spaces = characters_and_spaces + 1
  end,

  Code = function(el)
    count_verbatim(el.text)
  end,

  CodeBlock = function(el)
    count_verbatim(el.text)
  end
}
2337

2438
-- Document-level handler: walks the document body with the `wordcount`
-- handlers, prints the three totals, then exits.
function Pandoc(el)
  -- Only el.blocks is walked, so metadata is never counted. The blocks are
  -- wrapped in a Div so that one walk_block call covers the whole body.
  local body = pandoc.Div(el.blocks)
  pandoc.walk_block(body, wordcount)

  print(words .. " words in body")
  print(characters .. " characters in body")
  print(characters_and_spaces .. " characters in body (including spaces)")

  -- Exit with status 0 right after reporting; presumably this keeps pandoc
  -- from emitting the converted document -- confirm against pandoc docs.
  os.exit(0)
end

0 commit comments

Comments
 (0)