@@ -23,13 +23,15 @@ defmodule Html2Markdown.Converter do
2323 - Preserves whitespace in code blocks while normalizing elsewhere
2424 """
2525
26- alias Html2Markdown . { TableConverter , Options }
26+ alias Html2Markdown . { TableConverter , Options , ElementTypes }
2727
# Builds the markdown iolist for `document`, flattens it to a binary, resolves
# the `<br>` placeholders and finally trims the result.
2828 @ spec convert_to_markdown ( list ( Floki . html_node ( ) ) , Options . t ( ) ) :: String . t ( )
2929 def convert_to_markdown ( document , opts ) do
3030 document
3131 |> build_markdown_iolist ( opts )
3232 |> IO . iodata_to_binary ( )
# `br` nodes are rendered as the "{{BR}}{{/BR}}" placeholder by process_node_to_iolist;
# they are only turned into their final line-break string here, after flattening.
33+ |> String . replace ( "{{BR}}{{/BR}}" , " \n " )
# Trim once at the very end so no node needs to emit leading/trailing padding itself.
34+ |> String . trim ( )
3335 end
3436
3537 # Optimized: Build iolist instead of string concatenation
@@ -55,25 +57,25 @@ defmodule Html2Markdown.Converter do
5557
5658 # Process nodes to iolist for better performance
# Headings h1..h6: the digit in the tag name is the number of "#" characters
# in the marker. The guard limits the match to exactly "h1".."h6", so other
# two-character tags starting with "h" (e.g. "hr") fall through to later clauses.
defp process_node_to_iolist({<<?h, digit>>, _, children}, opts) when digit in ?1..?6 do
  marker = String.duplicate("#", digit - ?0) <> " "
  [marker, process_children_to_iolist(children, opts)]
end
7476
# Paragraph: only the rendered children are returned; no newlines are added here.
defp process_node_to_iolist({"p", _, children}, opts) do
  process_children_to_iolist(children, opts)
end

# Unordered list: rendering is handled by process_ul_list_to_iolist/2.
defp process_node_to_iolist({"ul", _, children}, opts) do
  process_ul_list_to_iolist(children, opts)
end
@@ -220,14 +222,14 @@ defmodule Html2Markdown.Converter do
220222 end
221223 end
222224
# `br` yields a textual placeholder instead of a literal newline. The placeholder
# is replaced in convert_to_markdown/2 after the iolist has been flattened, and
# join_with_smart_spacing/1 avoids inserting a space on either side of it.
# `hr` yields a fixed horizontal-rule string.
223- defp process_node_to_iolist ( { "br" , _ , _ } , _opts ) , do: "\n \n "
225+ defp process_node_to_iolist ( { "br" , _ , _ } , _opts ) , do: "{{BR}}{{/BR}} "
224226 defp process_node_to_iolist ( { "hr" , _ , _ } , _opts ) , do: "\n \n ---\n \n "
225227
# Sectioning containers (<section>, <article>): children are always rendered
# in :block context.
defp process_node_to_iolist({tag, _, children}, opts) when tag in ["section", "article"] do
  process_children_with_context(children, opts, :block)
end
231233
232234 defp process_node_to_iolist ( { "picture" , _ , children } , opts ) do
233235 case Enum . find ( children , fn
@@ -248,7 +250,20 @@ defmodule Html2Markdown.Converter do
248250 end
249251
# Generic <div> container: children are always rendered in :block context.
defp process_node_to_iolist({"div", _, children}, opts) do
  process_children_with_context(children, opts, :block)
end
254+
# Spans with preserved whitespace.
#
# A span carrying a `data-ws` attribute holds Base64-encoded text that is
# returned verbatim once decoded. Any other span is rendered through its
# children like a normal inline element.
#
# The attribute value comes from the parsed HTML, so it may be malformed.
# Invalid Base64 no longer raises; the span is rendered through its children
# instead, so one bad attribute cannot abort the whole conversion.
defp process_node_to_iolist({"span", attrs, children}, opts) do
  with {"data-ws", encoded} <- List.keyfind(attrs, "data-ws", 0),
       {:ok, preserved} <- Base.decode64(encoded) do
    preserved
  else
    # Either there is no `data-ws` attribute or its value is not valid Base64.
    _ -> process_children_to_iolist(children, opts)
  end
end
252267
# Fallback for any other three-element node: the tag and attributes are ignored
# and only the rendered children are returned.
defp process_node_to_iolist({_tag, _attrs, children}, opts) do
  process_children_to_iolist(children, opts)
end
@@ -259,6 +274,7 @@ defmodule Html2Markdown.Converter do
259274 |> String . trim ( )
260275 |> normalize_whitespace ( )
261276 else
277+ # When not normalizing whitespace (e.g., in code blocks), preserve text exactly as-is
262278 text
263279 end
264280 end
@@ -283,20 +299,29 @@ defmodule Html2Markdown.Converter do
283299 # Disable whitespace normalization for code blocks
284300 code_opts = Map . put ( opts , :normalize_whitespace , false )
285301 content = process_children_to_iolist ( children , code_opts )
286- [ "\n ```\n " , content , "\n ```\n " ]
302+ [ "```\n " , content , "\n ```" ]
287303 end
288304
# Fenced code block whose info string is derived from the element's class string.
defp process_code_block_to_iolist(classes, children, opts) do
  # Code content keeps its whitespace verbatim, so normalization is switched off
  # for everything rendered inside the fence.
  verbatim_opts = Map.put(opts, :normalize_whitespace, false)
  lang_tag = detect_language(classes)
  body = process_children_to_iolist(children, verbatim_opts)

  ["```", lang_tag, "\n", body, "\n```"]
end
295311
# Extracts a language name from a class string.
#
# Two conventions are recognised, in priority order:
#   1. the standard `language-<name>` prefix
#   2. Makeup syntax-highlighting classes of the form `makeup <name>`
#
# Returns the captured name, or "" when neither pattern matches.
defp detect_language(classes) do
  # Regex.run/2 returns nil on no match, so `||` falls through to the second pattern.
  case Regex.run(~r/language-(\w+)/, classes) || Regex.run(~r/makeup (\w+)/, classes) do
    [_full_match, language] -> language
    _ -> ""
  end
end
302327
@@ -375,24 +400,18 @@ defmodule Html2Markdown.Converter do
375400 end
376401
# Renders every child of an unordered list and separates consecutive items
# with a newline. No leading or trailing newline is added.
defp process_ul_list_to_iolist(children, opts) when is_list(children) do
  rendered_items = for child <- children, do: process_list_item_to_iolist(child, opts)

  Enum.intersperse(rendered_items, "\n")
end
385407
# Renders every child of an ordered list with its 1-based position and separates
# consecutive items with a newline. No leading or trailing newline is added.
defp process_ol_list_to_iolist(children, opts) when is_list(children) do
  numbered_items =
    for {child, position} <- Enum.with_index(children, 1) do
      process_ordered_list_item_to_iolist(child, position, opts)
    end

  Enum.intersperse(numbered_items, "\n")
end
397416
398417 defp process_list_item_to_iolist ( { "li" , _ , children } , opts ) ,
@@ -407,20 +426,121 @@ defmodule Html2Markdown.Converter do
407426 defp process_ordered_list_item_to_iolist ( other , _index , opts ) ,
408427 do: process_node_to_iolist ( other , opts )
409428
# Context-aware child rendering.
#
# `context` is resolved through determine_context/2 and the children are then
# rendered either as block content or as inline content.
defp process_children_with_context(children, opts, context) do
  case determine_context(children, context) do
    :block -> process_block_children(children, opts)
    :inline -> process_inline_children(children, opts)
  end
end
438+
# Resolves the rendering context.
#
# For :auto the children are inspected: if any element node (a 3-tuple with a
# binary tag) is a block element according to ElementTypes, the context is
# :block, otherwise :inline. Any other context value is returned unchanged.
defp determine_context(children, :auto) do
  contains_block? =
    Enum.any?(children, fn child ->
      match?({tag, _, _} when is_binary(tag), child) and
        ElementTypes.block_element?(elem(child, 0))
    end)

  case contains_block? do
    true -> :block
    false -> :inline
  end
end

defp determine_context(_children, context), do: context
451+
# Renders children as block content: non-content nodes are dropped, each
# remaining node is rendered, empty results are discarded, and the surviving
# fragments are joined with block spacing.
defp process_block_children(children, opts) do
  content_nodes = Enum.filter(children, &ElementTypes.content_node?/1)
  fragments = Enum.map(content_nodes, fn node -> process_node_to_iolist(node, opts) end)
  non_empty = Enum.reject(fragments, &ElementTypes.empty_content?/1)

  join_with_block_spacing(non_empty)
end
460+
# Renders children as inline content.
#
# Inline rendering is exactly what process_children_to_iolist/2 does (render
# each child, then apply smart spacing and trimming only when
# `opts.normalize_whitespace` is set), so this delegates to it instead of
# keeping a second copy of the same body that could drift out of sync.
defp process_inline_children(children, opts),
  do: process_children_to_iolist(children, opts)
478+
# Joins rendered block fragments, placing a blank line (two newlines) between
# consecutive fragments. An empty list yields an empty iolist.
defp join_with_block_spacing([]), do: []

defp join_with_block_spacing([head | tail]) do
  # Left fold keeps the same nesting as appending one fragment at a time.
  List.foldl(tail, [head], fn fragment, joined -> [joined, "\n\n", fragment] end)
end
487+
# Legacy entry point kept for backward compatibility.
#
# Renders every child node. With `opts.normalize_whitespace` set, the rendered
# nodes are joined with smart spacing, flattened to a binary and trimmed.
# Otherwise (e.g. inside code blocks) the list of rendered nodes is returned
# untouched so whitespace is preserved.
defp process_children_to_iolist(children, opts) do
  rendered = for child <- children, do: process_node_to_iolist(child, opts)

  if opts.normalize_whitespace do
    spaced = join_with_smart_spacing(rendered)
    String.trim(IO.iodata_to_binary(spaced))
  else
    rendered
  end
end
423505
# Joins rendered nodes with single spaces, with these exceptions:
#   * no space before a node that starts with . : ; ! ? ) or ,
#   * no space on either side of the "{{BR}}{{/BR}}" placeholder
#   * empty nodes are skipped (they add neither text nor a space)
#
# Returns an iolist with the same nesting the naive left fold would produce.
#
# Only the last few bytes of the text joined so far are needed to answer "does
# the output end with the placeholder?", so a bounded tail is carried through
# the fold. This keeps the join linear in the output size instead of
# re-flattening the whole accumulator for every node.
defp join_with_smart_spacing([]), do: []
defp join_with_smart_spacing([only]), do: [only]

defp join_with_smart_spacing([first | rest]) do
  initial_tail = smart_spacing_tail(IO.iodata_to_binary(first))

  {joined, _tail} =
    Enum.reduce(rest, {[first], initial_tail}, fn node, {acc, tail} ->
      text = IO.iodata_to_binary(node)

      cond do
        # Don't add space before punctuation
        match?(<<char, _::binary>> when char in [?., ?:, ?;, ?!, ??, ?), ?,], text) ->
          {[acc, node], smart_spacing_tail(tail <> text)}

        # Don't add space right after a BR placeholder
        tail == "{{BR}}{{/BR}}" ->
          {[acc, node], smart_spacing_tail(tail <> text)}

        # Don't add space before a BR placeholder
        String.starts_with?(text, "{{BR}}{{/BR}}") ->
          {[acc, node], smart_spacing_tail(tail <> text)}

        # Empty nodes contribute nothing, not even a separator
        text == "" ->
          {acc, tail}

        # Add space in other cases
        true ->
          {[acc, " ", node], smart_spacing_tail(tail <> " " <> text)}
      end
    end)

  joined
end

# Keeps at most the trailing bytes needed to recognise the BR placeholder.
defp smart_spacing_tail(binary) do
  window = byte_size("{{BR}}{{/BR}}")
  size = byte_size(binary)

  if size > window, do: binary_part(binary, size - window, window), else: binary
end
543+
424544 defp normalize_whitespace ( text ) do
425545 text
426546 |> String . split ( "\n " , trim: false )
0 commit comments