-
-
Notifications
You must be signed in to change notification settings - Fork 3.7k
Expand file tree
/
Copy pathhtml2markdown.ts
More file actions
105 lines (93 loc) · 2.89 KB
/
html2markdown.ts
File metadata and controls
105 lines (93 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/**
* @license Copyright (c) 2003-2025, CKSource Holding sp. z o.o. All rights reserved.
* For licensing, see LICENSE.md or https://ckeditor.com/legal/ckeditor-licensing-options
*/
/**
* @module markdown-gfm/html2markdown/html2markdown
*/
import { unified, type Plugin } from 'unified';
import rehypeParse from 'rehype-dom-parse';
import rehypeRemark from 'rehype-remark';
import remarkBreaks from 'remark-breaks';
import remarkGfm from 'remark-gfm';
import remarkStringify from 'remark-stringify';
import { visit } from 'unist-util-visit';
import { toHtml } from 'hast-util-to-html';
import type { Handle, State } from 'hast-util-to-mdast';
import type { Element, Node, Root, RootContent } from 'hast';
/**
 * Converts an HTML string to GitHub Flavored Markdown.
 *
 * The conversion is done by a `unified` pipeline that is (re)built by `_buildProcessor()`:
 * HTML is parsed, `<label>` wrappers are removed from TODO list items, the HTML tree is turned
 * into a Markdown tree and finally serialized to a string.
 */
export class MarkdownGfmHtmlToMd {
	/**
	 * The `unified` processor assembled by `_buildProcessor()`.
	 */
	// eslint-disable-next-line @typescript-eslint/no-explicit-any
	private _processor: any;

	/**
	 * Lower-cased names of the HTML tags that are passed through to the output as raw HTML.
	 */
	private _keepRawTags: Array<string> = [];

	constructor() {
		this._buildProcessor();
	}

	/**
	 * Registers an HTML tag that should be kept as raw HTML in the Markdown output.
	 *
	 * The name is lower-cased before it is stored. Registering a tag that is already kept is a no-op,
	 * so the processor is rebuilt only when the set of kept tags actually changes.
	 *
	 * @param tagName Name of the tag to keep, in any letter case.
	 */
	public keep( tagName: string ): void {
		const normalizedName = tagName.toLowerCase();

		if ( this._keepRawTags.includes( normalizedName ) ) {
			return;
		}

		this._keepRawTags.push( normalizedName );

		// Handlers are captured when the processor is built, so it has to be rebuilt to pick up the new tag.
		this._buildProcessor();
	}

	/**
	 * Converts the given HTML to Markdown.
	 *
	 * @param html The HTML string to convert.
	 * @returns The Markdown output with leading and trailing whitespace trimmed.
	 */
	public parse( html: string ): string {
		return this._processor
			.processSync( html )
			.toString()
			.trim();
	}

	/**
	 * Returns handlers for raw HTML tags that should be kept in the Markdown output.
	 *
	 * Each handler serializes the matched element back to HTML and returns it as an `html` node.
	 */
	private _getRawTagsHandlers(): Record<string, Handle> {
		const handlers: Record<string, Handle> = {};

		for ( const tagName of this._keepRawTags ) {
			handlers[ tagName ] = ( state: State, node: RootContent ) => {
				const rawHtmlNode = {
					type: 'html' as const,
					value: toHtml( node, { allowDangerousHtml: true } )
				};

				state.patch( node, rawHtmlNode );

				return rawHtmlNode;
			};
		}

		return handlers;
	}

	/**
	 * Builds the conversion pipeline and stores it in `_processor`.
	 *
	 * It is called from the constructor and again whenever a new tag is registered with `keep()`.
	 */
	private _buildProcessor(): void {
		this._processor = unified()
			// Parse HTML to an abstract syntax tree (AST).
			.use( rehypeParse )
			// Removes `<label>` element from TODO lists.
			.use( removeLabelFromCheckboxes )
			// Turns HTML syntax tree into Markdown syntax tree.
			.use( rehypeRemark, {
				// Keeps allowed HTML tags.
				handlers: this._getRawTagsHandlers()
			} )
			// Adds support for GitHub Flavored Markdown (GFM).
			.use( remarkGfm, {
				singleTilde: true
			} )
			// Treats line endings inside text as hard line breaks (`break` nodes).
			.use( remarkBreaks )
			// Serializes the Markdown syntax tree to a string.
			.use( remarkStringify, {
				resourceLink: true,
				emphasis: '_',
				rule: '-',
				handlers: {
					// Emit a plain newline for every `break` node.
					break: () => '\n'
				},
				// Registers `<` as an unsafe character for the serializer.
				unsafe: [
					{ character: '<' }
				]
			} );
	}
}
/**
 * Removes `<label>` element from TODO lists, so that `<input>` and `text` are direct children of `<li>`.
 *
 * Every `<label>` that is a direct child of an `<li>` is replaced in place by its own children.
 * The order of the children is preserved and the siblings of the `<label>` are left untouched.
 * A `<label>` without children is simply removed.
 *
 * @returns A transformer that mutates the given HTML syntax tree.
 */
function removeLabelFromCheckboxes(): ReturnType<Plugin> {
	return function( tree: Node ): void {
		visit( tree, 'element', ( node: Element, index: number, parent: Root | Element ) => {
			// The visitor may be called without a position (e.g. for the tree root); there is nothing to replace then.
			if ( typeof index !== 'number' || !parent ) {
				return;
			}

			if ( node.tagName === 'label' && parent.type === 'element' && parent.tagName === 'li' ) {
				// Swap exactly one node (the `<label>`) for all of its children.
				parent.children.splice( index, 1, ...node.children );
			}
		} );
	};
}