Skip to content

Commit 59018e0

Browse files
committed
feat: major paragraph spacing improvements and context-aware processing
This release introduces significant architectural improvements to address paragraph spacing issues and enhances the overall quality of Markdown output.

Changes:
- Fixed paragraph spacing within containers (resolves issue #6)
- Added context-aware processing with block vs inline element detection
- Introduced Html2Markdown.ElementTypes module for element classification
- Enhanced list formatting with proper line breaks between items
- Improved code block whitespace preservation using Base64 encoding workaround
- Better language detection for syntax-highlighted code blocks
- Streamlined processing logic for cleaner, more readable output

Breaking changes:
- Paragraph spacing now uses standard double newlines (\n\n) instead of single newline + space (\n \n) for better Markdown compliance
- More comprehensive content preservation (includes navigation elements)
1 parent be2e9e8 commit 59018e0

File tree

13 files changed

+1260
-380
lines changed

13 files changed

+1260
-380
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ Add `html2markdown` to your list of dependencies in `mix.exs`:
1313
```elixir
1414
def deps do
1515
[
16-
{:html2markdown, "~> 0.2.1"}
16+
{:html2markdown, "~> 0.3.0"}
1717
]
1818
end
1919
```

lib/html2markdown/converter.ex

Lines changed: 96 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ defmodule Html2Markdown.Converter do
2323
- Preserves whitespace in code blocks while normalizing elsewhere
2424
"""
2525

26-
alias Html2Markdown.{TableConverter, Options}
26+
alias Html2Markdown.{TableConverter, Options, ElementTypes}
2727

2828
@spec convert_to_markdown(list(Floki.html_node()), Options.t()) :: String.t()
2929
def convert_to_markdown(document, opts) do
@@ -226,10 +226,10 @@ defmodule Html2Markdown.Converter do
226226
defp process_node_to_iolist({"hr", _, _}, _opts), do: "\n\n---\n\n"
227227

228228
defp process_node_to_iolist({"section", _, children}, opts),
229-
do: ["\n", process_children_to_iolist(children, opts), "\n"]
229+
do: process_children_with_context(children, opts, :block)
230230

231231
defp process_node_to_iolist({"article", _, children}, opts),
232-
do: ["\n", process_children_to_iolist(children, opts), "\n"]
232+
do: process_children_with_context(children, opts, :block)
233233

234234
defp process_node_to_iolist({"picture", _, children}, opts) do
235235
case Enum.find(children, fn
@@ -250,7 +250,20 @@ defmodule Html2Markdown.Converter do
250250
end
251251

252252
defp process_node_to_iolist({"div", _, children}, opts),
253-
do: [process_children_to_iolist(children, opts), "\n"]
253+
do: process_children_with_context(children, opts, :block)
254+
255+
# Handle spans with preserved whitespace
256+
defp process_node_to_iolist({"span", attrs, children}, opts) do
257+
case List.keyfind(attrs, "data-ws", 0) do
258+
{"data-ws", encoded} ->
259+
# Decode preserved whitespace
260+
Base.decode64!(encoded)
261+
262+
_ ->
263+
# Normal span processing
264+
process_children_to_iolist(children, opts)
265+
end
266+
end
254267

255268
defp process_node_to_iolist({_, _, children}, opts),
256269
do: process_children_to_iolist(children, opts)
@@ -261,6 +274,7 @@ defmodule Html2Markdown.Converter do
261274
|> String.trim()
262275
|> normalize_whitespace()
263276
else
277+
# When not normalizing whitespace (e.g., in code blocks), preserve text exactly as-is
264278
text
265279
end
266280
end
@@ -296,9 +310,18 @@ defmodule Html2Markdown.Converter do
296310
end
297311

298312
defp detect_language(classes) do
299-
case Regex.run(~r/language-(\w+)/, classes) do
300-
[_, lang] -> lang
301-
_ -> ""
313+
cond do
314+
# First check for standard language- prefix
315+
match = Regex.run(~r/language-(\w+)/, classes) ->
316+
elem(List.to_tuple(match), 1)
317+
318+
# Check for makeup syntax highlighting classes
319+
match = Regex.run(~r/makeup (\w+)/, classes) ->
320+
elem(List.to_tuple(match), 1)
321+
322+
# Default to empty string
323+
true ->
324+
""
302325
end
303326
end
304327

@@ -403,16 +426,79 @@ defmodule Html2Markdown.Converter do
403426
defp process_ordered_list_item_to_iolist(other, _index, opts),
404427
do: process_node_to_iolist(other, opts)
405428

406-
defp process_children_to_iolist(children, opts) do
429+
# Context-aware processing for better spacing control
430+
defp process_children_with_context(children, opts, context) do
431+
final_context = determine_context(children, context)
432+
433+
case final_context do
434+
:block -> process_block_children(children, opts)
435+
:inline -> process_inline_children(children, opts)
436+
end
437+
end
438+
439+
# Determine processing context based on children content
440+
defp determine_context(children, :auto) do
441+
has_block_elements =
442+
Enum.any?(children, fn
443+
{tag, _, _} when is_binary(tag) -> ElementTypes.block_element?(tag)
444+
_ -> false
445+
end)
446+
447+
if has_block_elements, do: :block, else: :inline
448+
end
449+
450+
defp determine_context(_children, context), do: context
451+
452+
# Process block children with proper spacing between block elements
453+
defp process_block_children(children, opts) do
454+
children
455+
|> Enum.filter(&ElementTypes.content_node?/1)
456+
|> Enum.map(&process_node_to_iolist(&1, opts))
457+
|> Enum.reject(&ElementTypes.empty_content?/1)
458+
|> join_with_block_spacing()
459+
end
460+
461+
# Process inline children with smart spacing (existing logic)
462+
defp process_inline_children(children, opts) do
407463
iolist =
408464
children
409465
|> Enum.map(&process_node_to_iolist(&1, opts))
466+
467+
# Only apply smart spacing and trim when normalizing whitespace
468+
if opts.normalize_whitespace do
469+
iolist
410470
|> join_with_smart_spacing()
471+
|> IO.iodata_to_binary()
472+
|> String.trim()
473+
else
474+
# When not normalizing (e.g., in code blocks), just return the iolist as-is
475+
iolist
476+
end
477+
end
478+
479+
# Join block elements with proper spacing (double newlines)
480+
defp join_with_block_spacing([]), do: []
481+
482+
defp join_with_block_spacing([first | rest]) do
483+
Enum.reduce(rest, [first], fn item, acc ->
484+
[acc, "\n\n", item]
485+
end)
486+
end
487+
488+
# Legacy function maintained for backward compatibility
489+
defp process_children_to_iolist(children, opts) do
490+
iolist =
491+
children
492+
|> Enum.map(&process_node_to_iolist(&1, opts))
411493

412-
# Only trim if we're normalizing whitespace
494+
# Only apply smart spacing and trim when normalizing whitespace
413495
if opts.normalize_whitespace do
414-
iolist |> IO.iodata_to_binary() |> String.trim()
496+
iolist
497+
|> join_with_smart_spacing()
498+
|> IO.iodata_to_binary()
499+
|> String.trim()
415500
else
501+
# When not normalizing (e.g., in code blocks), just return the iolist as-is
416502
iolist
417503
end
418504
end

lib/html2markdown/element_types.ex

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
defmodule Html2Markdown.ElementTypes do
2+
@moduledoc """
3+
Provides element type classification for HTML to Markdown conversion.
4+
5+
This module categorizes HTML elements into block, inline, and other types
6+
to enable proper spacing and formatting decisions during conversion.
7+
"""
8+
9+
@block_elements ~w[
10+
p div h1 h2 h3 h4 h5 h6 ul ol li blockquote pre
11+
article section aside header footer main nav
12+
table tr td th thead tbody tfoot
13+
dl dt dd figure figcaption
14+
details summary
15+
]
16+
17+
@inline_elements ~w[
18+
a em strong code span i b u del sup sub
19+
img br hr abbr cite q time mark
20+
]
21+
22+
@list_elements ~w[ul ol li]
23+
24+
@heading_elements ~w[h1 h2 h3 h4 h5 h6]
25+
26+
@doc """
27+
Determines if an HTML element is a block-level element.
28+
29+
Block elements typically start on a new line and take up the full width
30+
available. They should be separated by blank lines in markdown.
31+
32+
## Examples
33+
34+
iex> Html2Markdown.ElementTypes.block_element?("p")
35+
true
36+
37+
iex> Html2Markdown.ElementTypes.block_element?("span")
38+
false
39+
"""
40+
@spec block_element?(String.t()) :: boolean()
41+
def block_element?(tag) when tag in @block_elements, do: true
42+
def block_element?(_), do: false
43+
44+
@doc """
45+
Determines if an HTML element is an inline element.
46+
47+
Inline elements flow within text and don't create line breaks.
48+
They should not have extra spacing added around them.
49+
50+
## Examples
51+
52+
iex> Html2Markdown.ElementTypes.inline_element?("em")
53+
true
54+
55+
iex> Html2Markdown.ElementTypes.inline_element?("p")
56+
false
57+
"""
58+
@spec inline_element?(String.t()) :: boolean()
59+
def inline_element?(tag) when tag in @inline_elements, do: true
60+
def inline_element?(_), do: false
61+
62+
@doc """
63+
Determines if an HTML element is a list element.
64+
65+
## Examples
66+
67+
iex> Html2Markdown.ElementTypes.list_element?("ul")
68+
true
69+
70+
iex> Html2Markdown.ElementTypes.list_element?("li")
71+
true
72+
73+
iex> Html2Markdown.ElementTypes.list_element?("p")
74+
false
75+
"""
76+
@spec list_element?(String.t()) :: boolean()
77+
def list_element?(tag) when tag in @list_elements, do: true
78+
def list_element?(_), do: false
79+
80+
@doc """
81+
Determines if an HTML element is a heading element.
82+
83+
## Examples
84+
85+
iex> Html2Markdown.ElementTypes.heading_element?("h1")
86+
true
87+
88+
iex> Html2Markdown.ElementTypes.heading_element?("h7")
89+
false
90+
"""
91+
@spec heading_element?(String.t()) :: boolean()
92+
def heading_element?(tag) when tag in @heading_elements, do: true
93+
def heading_element?(_), do: false
94+
95+
@doc """
96+
Determines if an element should be treated as content.
97+
98+
This is used to filter out empty text nodes, comments, etc.
99+
"""
100+
@spec content_node?(Floki.html_node()) :: boolean()
101+
def content_node?({:comment, _}), do: false
102+
def content_node?({tag, _, _}) when is_binary(tag), do: true
103+
104+
def content_node?(text) when is_binary(text) do
105+
String.trim(text) != ""
106+
end
107+
108+
def content_node?(_), do: false
109+
110+
@doc """
111+
Determines if processed content is empty.
112+
113+
Used to filter out elements that produce no markdown output.
114+
"""
115+
@spec empty_content?(iodata()) :: boolean()
116+
def empty_content?([]), do: true
117+
def empty_content?(""), do: true
118+
119+
def empty_content?(content) when is_binary(content) do
120+
String.trim(content) == ""
121+
end
122+
123+
def empty_content?(content) when is_list(content) do
124+
content
125+
|> IO.iodata_to_binary()
126+
|> String.trim()
127+
|> then(&(&1 == ""))
128+
end
129+
130+
def empty_content?(_), do: false
131+
end

lib/html2markdown/parser.ex

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,40 @@ defmodule Html2Markdown.Parser do
3838
end
3939

4040
defp prep_document(content) do
41-
if is_html_document?(content), do: content, else: wrap_fragment(content)
41+
content = if is_html_document?(content), do: content, else: wrap_fragment(content)
42+
43+
# Preserve whitespace in code blocks by replacing whitespace spans
44+
# with placeholders that won't be stripped by Floki
45+
preserve_code_whitespace(content)
46+
end
47+
48+
# Replace whitespace-only spans in code blocks with placeholders
49+
defp preserve_code_whitespace(content) do
50+
# Match <pre> or <code> blocks and preserve whitespace within them
51+
content
52+
|> String.replace(~r/<(pre|code)([^>]*)>(.*?)<\/\1>/s, fn full_match ->
53+
case Regex.run(~r/<(pre|code)([^>]*)>(.*?)<\/\1>/s, full_match) do
54+
[_, tag, attrs, inner] ->
55+
preserved_inner =
56+
inner
57+
|> String.replace(~r/<span([^>]*class="w"[^>]*)>([^<]*)<\/span>/, fn span_match ->
58+
case Regex.run(~r/<span([^>]*class="w"[^>]*)>([^<]*)<\/span>/, span_match) do
59+
[_, span_attrs, ws_content] ->
60+
# Encode whitespace content to preserve it
61+
encoded = Base.encode64(ws_content)
62+
~s(<span#{span_attrs} data-ws="#{encoded}"></span>)
63+
64+
_ ->
65+
span_match
66+
end
67+
end)
68+
69+
"<#{tag}#{attrs}>#{preserved_inner}</#{tag}>"
70+
71+
_ ->
72+
full_match
73+
end
74+
end)
4275
end
4376

4477
defp is_html_document?(content) do

mix.exs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ defmodule Html2Markdown.MixProject do
44
def project do
55
[
66
app: :html2markdown,
7-
version: "0.2.1",
7+
version: "0.3.0",
88
elixir: "~> 1.15",
99
start_permanent: Mix.env() == :prod,
1010
description: description(),

0 commit comments

Comments
 (0)