Skip to content

Commit 691ad52

Browse files
authored
Merge pull request #10 from agoodway/fix-paragraph-spacing
Fix paragraph spacing
2 parents 1663f9b + 59018e0 commit 691ad52

File tree

15 files changed

+1552
-442
lines changed

15 files changed

+1552
-442
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Add `html2markdown` to your list of dependencies in `mix.exs`:
1414
```elixir
1515
def deps do
1616
[
17-
{:html2markdown, "~> 0.2.1"}
17+
{:html2markdown, "~> 0.3.0"}
1818
]
1919
end
2020
```

lib/html2markdown.ex

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ defmodule Html2Markdown do
1010
## Basic Usage
1111
1212
iex> Html2Markdown.convert("<h1>Hello</h1><p>World</p>")
13-
"\\n# Hello\\n\\n\\n\\nWorld\\n"
13+
"# Hello\\n\\nWorld"
1414
1515
## Configuration
1616
@@ -154,7 +154,7 @@ defmodule Html2Markdown do
154154
## Examples
155155
156156
iex> Html2Markdown.convert("<p>Hello</p>", %{navigation_classes: ["custom-nav"]})
157-
"\\nHello\\n"
157+
"Hello"
158158
159159
"""
160160
@spec convert(html_content(), conversion_options()) :: markdown_content()

lib/html2markdown/converter.ex

Lines changed: 155 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,15 @@ defmodule Html2Markdown.Converter do
2323
- Preserves whitespace in code blocks while normalizing elsewhere
2424
"""
2525

26-
alias Html2Markdown.{TableConverter, Options}
26+
alias Html2Markdown.{TableConverter, Options, ElementTypes}
2727

2828
@doc """
Converts a parsed HTML document (a list of Floki nodes) into a Markdown string.

The node tree is rendered to iodata, flattened once, every `<br>` placeholder
is replaced by the line-break text, and the result is trimmed of leading and
trailing whitespace.
"""
@spec convert_to_markdown(list(Floki.html_node()), Options.t()) :: String.t()
def convert_to_markdown(document, opts) do
  rendered = IO.iodata_to_binary(build_markdown_iolist(document, opts))

  # "{{BR}}{{/BR}}" is the literal substring swapped for the line-break text.
  # NOTE(review): Markdown hard breaks usually need two trailing spaces;
  # confirm the replacement string is what is intended.
  rendered
  |> String.replace("{{BR}}{{/BR}}", " \n")
  |> String.trim()
end
3436

3537
# Optimized: Build iolist instead of string concatenation
@@ -55,25 +57,25 @@ defmodule Html2Markdown.Converter do
5557

5658
# Process nodes to iolist for better performance
5759
# Headings: "h1".."h6" render as one to six "#" characters plus a space,
# followed by the rendered children. No surrounding newlines are added here.
defp process_node_to_iolist({<<"h", level>>, _, children}, opts) when level in ?1..?6 do
  # `level` is the ASCII digit byte; subtracting ?0 yields the heading depth.
  [String.duplicate("#", level - ?0) <> " ", process_children_to_iolist(children, opts)]
end

# Paragraphs contribute only their rendered children.
defp process_node_to_iolist({"p", _, children}, opts) do
  process_children_to_iolist(children, opts)
end

# Unordered lists are handled by the dedicated list renderer.
defp process_node_to_iolist({"ul", _, children}, opts) do
  process_ul_list_to_iolist(children, opts)
end
@@ -220,14 +222,14 @@ defmodule Html2Markdown.Converter do
220222
end
221223
end
222224

223-
# <br> is emitted as a placeholder token rather than a newline.
defp process_node_to_iolist({"br", _, _}, _opts) do
  "{{BR}}{{/BR}}"
end

# <hr> becomes a thematic break surrounded by blank lines.
defp process_node_to_iolist({"hr", _, _}, _opts) do
  "\n\n---\n\n"
end

# <section> and <article> render their children in block context.
defp process_node_to_iolist({tag, _, children}, opts) when tag in ["section", "article"] do
  process_children_with_context(children, opts, :block)
end
231233

232234
defp process_node_to_iolist({"picture", _, children}, opts) do
233235
case Enum.find(children, fn
@@ -248,7 +250,20 @@ defmodule Html2Markdown.Converter do
248250
end
249251

250252
# <div> renders its children in block context.
defp process_node_to_iolist({"div", _, children}, opts) do
  process_children_with_context(children, opts, :block)
end
254+
255+
# Spans carrying a `data-ws` attribute hold Base64-encoded whitespace that must
# be emitted verbatim. Any other span is rendered through its children.
#
# The attribute value comes from the HTML being converted, so it may not be
# valid Base64. A malformed value falls back to normal span rendering instead
# of raising and aborting the whole conversion.
defp process_node_to_iolist({"span", attrs, children}, opts) do
  with {"data-ws", encoded} <- List.keyfind(attrs, "data-ws", 0),
       {:ok, preserved_whitespace} <- Base.decode64(encoded) do
    preserved_whitespace
  else
    # No `data-ws` attribute, or its value could not be decoded.
    _ -> process_children_to_iolist(children, opts)
  end
end

# Fallback for any other element: render only its children.
defp process_node_to_iolist({_, _, children}, opts) do
  process_children_to_iolist(children, opts)
end
@@ -259,6 +274,7 @@ defmodule Html2Markdown.Converter do
259274
|> String.trim()
260275
|> normalize_whitespace()
261276
else
277+
# When not normalizing whitespace (e.g., in code blocks), preserve text exactly as-is
262278
text
263279
end
264280
end
@@ -283,20 +299,29 @@ defmodule Html2Markdown.Converter do
283299
# Disable whitespace normalization for code blocks
284300
code_opts = Map.put(opts, :normalize_whitespace, false)
285301
content = process_children_to_iolist(children, code_opts)
286-
["\n```\n", content, "\n```\n"]
302+
["```\n", content, "\n```"]
287303
end
288304

289305
# Renders a fenced code block whose info string is derived from the element's
# class attribute. Children are rendered with whitespace normalization turned
# off so the code text is kept as written.
defp process_code_block_to_iolist(classes, children, opts) do
  fence_info = detect_language(classes)
  verbatim_opts = Map.put(opts, :normalize_whitespace, false)
  body = process_children_to_iolist(children, verbatim_opts)

  ["```", fence_info, "\n", body, "\n```"]
end
295311

296312
# Extracts a language name from a class string for use as a code-fence info
# string. Checks the `language-<name>` convention first, then the
# `makeup <name>` form, and returns "" when neither is present.
defp detect_language(classes) do
  language_from(classes, ~r/language-(\w+)/) ||
    language_from(classes, ~r/makeup (\w+)/) ||
    ""
end

# Returns the first capture group of `pattern` in `classes`, or nil when the
# pattern does not match.
defp language_from(classes, pattern) do
  case Regex.run(pattern, classes) do
    [_whole, language | _] -> language
    _ -> nil
  end
end
302327

@@ -375,24 +400,18 @@ defmodule Html2Markdown.Converter do
375400
end
376401

377402
# Renders each child of an unordered list and separates the rendered items with
# a single newline. No leading or trailing newline is added.
defp process_ul_list_to_iolist(children, opts) when is_list(children) do
  rendered_items = for child <- children, do: process_list_item_to_iolist(child, opts)

  Enum.intersperse(rendered_items, "\n")
end
385407

386408
# Renders each child of an ordered list with its 1-based position and separates
# the rendered items with a single newline. No leading or trailing newline is
# added.
defp process_ol_list_to_iolist(children, opts) when is_list(children) do
  numbered_items =
    for {child, position} <- Enum.with_index(children, 1) do
      process_ordered_list_item_to_iolist(child, position, opts)
    end

  Enum.intersperse(numbered_items, "\n")
end
397416

398417
defp process_list_item_to_iolist({"li", _, children}, opts),
@@ -407,20 +426,121 @@ defmodule Html2Markdown.Converter do
407426
defp process_ordered_list_item_to_iolist(other, _index, opts),
408427
do: process_node_to_iolist(other, opts)
409428

429+
# Renders `children` with the strategy selected by `context`. The requested
# context is first resolved via determine_context/2, then the block or inline
# renderer is applied.
defp process_children_with_context(children, opts, context) do
  renderer =
    case determine_context(children, context) do
      :block -> &process_block_children/2
      :inline -> &process_inline_children/2
    end

  renderer.(children, opts)
end
438+
439+
# Resolves the rendering context. `:auto` inspects the children: if any element
# child is a block element the result is `:block`, otherwise `:inline`.
# Any other context value is returned unchanged.
defp determine_context(children, :auto) do
  if Enum.any?(children, &block_child?/1), do: :block, else: :inline
end

defp determine_context(_children, context), do: context

# True only for element tuples whose tag is classified as a block element;
# non-element children are never block children.
defp block_child?({tag, _, _}) when is_binary(tag), do: ElementTypes.block_element?(tag)
defp block_child?(_), do: false
451+
452+
# Renders children in block context: nodes that are not content nodes are
# dropped, the rest are rendered, renderings classified as empty are discarded,
# and the survivors are joined with blank-line separators.
defp process_block_children(children, opts) do
  rendered_blocks =
    Enum.flat_map(children, fn child ->
      if ElementTypes.content_node?(child) do
        rendered = process_node_to_iolist(child, opts)
        if ElementTypes.empty_content?(rendered), do: [], else: [rendered]
      else
        []
      end
    end)

  join_with_block_spacing(rendered_blocks)
end
460+
461+
# Renders children in inline context. Inline rendering is exactly what
# process_children_to_iolist/2 does (render each child, then apply smart
# spacing and trimming when whitespace normalization is on), so this delegates
# to it rather than keeping a second copy of the same logic that could drift.
defp process_inline_children(children, opts),
  do: process_children_to_iolist(children, opts)
478+
479+
# Joins rendered blocks with a blank line ("\n\n") between consecutive blocks.
# An empty list stays empty. The result is iodata nested to the left.
defp join_with_block_spacing([]), do: []

defp join_with_block_spacing([head | tail]) do
  for block <- tail, reduce: [head] do
    joined -> [joined, "\n\n", block]
  end
end
487+
488+
# Legacy entry point kept for backward compatibility. Renders every child and,
# when `opts.normalize_whitespace` is truthy, joins the pieces with smart
# spacing and returns a trimmed binary. Otherwise (e.g. inside code blocks) the
# rendered pieces are returned untouched.
defp process_children_to_iolist(children, opts) do
  rendered = for child <- children, do: process_node_to_iolist(child, opts)

  if opts.normalize_whitespace do
    String.trim(IO.iodata_to_binary(join_with_smart_spacing(rendered)))
  else
    rendered
  end
end
423505

506+
# Joins rendered nodes with single spaces, except that no space is inserted:
#   * before a node that starts with one of . : ; ! ? ) ,
#   * directly after or directly before the "{{BR}}{{/BR}}" placeholder
#   * for a node that renders to the empty string (it is dropped)
#
# Whether the text joined so far ends with the placeholder is carried through
# the reduction as a boolean, so the accumulator is never re-flattened. Each
# node is converted to a binary exactly once.
defp join_with_smart_spacing([]), do: []

defp join_with_smart_spacing([first | rest]) do
  initial = {[first], br_terminated?(IO.iodata_to_binary(first))}

  {joined, _after_br?} =
    Enum.reduce(rest, initial, fn node, {acc, after_br?} ->
      binary_node = IO.iodata_to_binary(node)

      cond do
        # Don't add space before punctuation
        leading_punctuation?(binary_node) ->
          {[acc, node], br_terminated?(binary_node)}

        # Don't add space right after a BR placeholder. An empty node leaves
        # the joined text unchanged, so it still ends with the placeholder.
        after_br? ->
          {[acc, node], binary_node == "" or br_terminated?(binary_node)}

        # Don't add space before a BR placeholder
        String.starts_with?(binary_node, "{{BR}}{{/BR}}") ->
          {[acc, node], br_terminated?(binary_node)}

        # Empty nodes contribute nothing
        binary_node == "" ->
          {acc, after_br?}

        # Add space in other cases
        true ->
          {[acc, " ", node], br_terminated?(binary_node)}
      end
    end)

  joined
end

# True when the text starts with punctuation that must hug the preceding word.
defp leading_punctuation?(<<char, _::binary>>) when char in [?., ?:, ?;, ?!, ??, ?), ?,],
  do: true

defp leading_punctuation?(_text), do: false

# True when the text ends with the <br> placeholder token.
defp br_terminated?(text), do: String.ends_with?(text, "{{BR}}{{/BR}}")
543+
424544
defp normalize_whitespace(text) do
425545
text
426546
|> String.split("\n", trim: false)

0 commit comments

Comments
 (0)