-
Notifications
You must be signed in to change notification settings - Fork 4.4k
Expand file tree
/
Copy pathhtml.ts
More file actions
166 lines (130 loc) · 5.31 KB
/
html.ts
File metadata and controls
166 lines (130 loc) · 5.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import { readFile } from 'node:fs/promises';
import { Cheerio, CheerioAPI, load } from 'cheerio';
import { AnyNode, CDATA, Comment, Element, Node, NodeWithChildren, Text } from 'domhandler';
/**
 * Returns the closest following sibling of `node` that is an `Element`.
 *
 * Non-element siblings (text, comments, ...) are stepped over.
 *
 * @param node Node to start from; may be omitted.
 * @returns The next sibling element, or `null` when `node` is missing or no
 *          element follows it.
 */
export function nextElement(node?: Node): Element | null {
    let cursor: Node | undefined | null = node?.nextSibling;

    // Advance until we either run out of siblings or land on an element.
    while (cursor && !(cursor instanceof Element)) {
        cursor = cursor.nextSibling;
    }

    return cursor instanceof Element ? cursor : null;
}
/**
 * Walks the siblings of `node` in one direction and returns the first
 * `Element` accepted by `test`.
 *
 * @param direction Which way to walk: `'previous'` follows `previousSibling`,
 *                  `'next'` follows `nextSibling`.
 * @param node Starting node; it is never tested itself.
 * @param test Predicate invoked for each sibling that is an `Element`.
 * @returns The first matching sibling element, or `null` if none matches.
 */
function findElementWith(direction: 'previous' | 'next', node: Node, test: (el: Element) => boolean): Element | null {
    const siblingKey = direction === 'previous' ? 'previousSibling' : 'nextSibling';

    for (
        let cursor: Node | null | undefined = node?.[siblingKey];
        cursor;
        cursor = cursor[siblingKey]
    ) {
        if (cursor instanceof Element && test(cursor)) return cursor;
    }

    return null;
}
/**
 * Finds the nearest preceding sibling element of `node` accepted by `test`.
 *
 * @param node Starting node; it is never tested itself.
 * @param test Predicate invoked for each preceding sibling element.
 * @returns The first matching element walking backwards, or `null`.
 */
export function findPrevElementWith(node: Node, test: (el: Element) => boolean): Element | null {
    const match = findElementWith('previous', node, test);
    return match;
}
/**
 * Finds the nearest following sibling element of `node` accepted by `test`.
 *
 * @param node Starting node; it is never tested itself.
 * @param test Predicate invoked for each following sibling element.
 * @returns The first matching element walking forwards, or `null`.
 */
export function findNextElementWith(node: Node, test: (el: Element) => boolean): Element | null {
    const match = findElementWith('next', node, test);
    return match;
}
/**
 * Parses markup with cheerio, passing `xml: false` as the parser option.
 *
 * @param text Anything accepted as the first argument of cheerio's `load`.
 * @returns A promise resolving to the loaded cheerio document.
 */
export async function loadText(text: Parameters<typeof load>[0]) {
    const parserOptions = { xml: false };
    return load(text, parserOptions);
}
/**
 * Reads a file from disk and parses its contents via `loadText`.
 *
 * @param file Path of the file to read.
 * @returns A promise resolving to the loaded cheerio document.
 */
export async function loadFile(file: string) {
    const contents = await readFile(file);
    return await loadText(contents);
}
/**
 * Serializes the attributes of `node` as a space-separated `key="value"` list
 * suitable for pasting into an opening tag.
 *
 * Values are emitted inside double quotes, so `&` and `"` are escaped to keep
 * the generated markup well-formed (an unescaped `"` would terminate the
 * attribute early). Attributes whose value is not a string are emitted as a
 * bare name.
 *
 * @param node Element whose `attribs` map is serialized; a missing map yields ''.
 * @returns The attribute string, or '' when there are no attributes.
 */
function cloneAttrsString(node: Element) {
    return Object.entries(node.attribs || {}).map(([key, value]) => {
        if (typeof value !== 'string') return key;

        // `&` must be escaped first so the `&quot;` produced below is not re-escaped.
        const escaped = value.replace(/&/g, '&amp;').replace(/"/g, '&quot;');
        return `${key}="${escaped}"`;
    }).join(' ');
}
/**
 * Replaces every descendant of `article` matching `selector` with the markup
 * produced by `cb`.
 *
 * @param article Cheerio selection to search within.
 * @param selector Selector passed to `article.find`.
 * @param cb Called per match with the wrapped node, its serialized attributes
 *           and its inner HTML ('' when there is none). A falsy return value
 *           leaves the node untouched.
 */
export function replaceNode(
    article: Cheerio<AnyNode>,
    selector: string,
    cb: ($node: Cheerio<Element>, attrs: string, content: string) => string
) {
    const matches = article.find(selector);
    const total = matches.length;

    for (let index = 0; index < total; index += 1) {
        const $current = matches.eq(index);
        const attrs = cloneAttrsString(matches[index]);
        const content = $current.html() || '';
        const replacement = cb($current, attrs, content);

        if (!replacement) continue;
        $current.replaceWith(replacement);
    }
}
/**
 * Turns every line feed into a single space and trims the result.
 *
 * @param text Raw text.
 * @returns `text` on one line, without leading or trailing whitespace.
 */
export function cleanText(text: string) {
    const singleLine = text.split('\n').join(' ');
    return singleLine.trim();
}
// An entry handled by `htmlToText`: either a DOM node or a literal string.
type FakeNode = Node | string;
// A DOM node paired with a numeric nesting level.
type FakeNodeWithLevel = [Node, number];
/**
 * Flattens a DOM fragment into a single plain-text string.
 *
 * Traversal is depth-first over an explicit stack: every visited node
 * schedules its `nextSibling` (same level) and, when it has children, its
 * `firstChild` (level + 1). String literals placed on the stack are emitted
 * verbatim.
 *
 * Conversion rules:
 * - comments and `<script>`, `<style>`, `<th>` elements are omitted
 *   (siblings that follow them are still visited);
 * - a `<code>` element whose trimmed text has no line break is wrapped in backticks;
 * - `<li>` is prefixed with a bullet on a new line;
 * - `<strong>` / `<b>` content is wrapped in `*`;
 * - `<img>` becomes `<see ALT-OR-TITLE>`, or `<image>` when it has neither.
 *
 * @param $ Cheerio instance used to read the text of nodes.
 * @param list Start nodes. Entries are popped from the end, so the last entry
 *             is visited first.
 * @param isFinalNode Optional predicate called for every node or literal taken
 *                    from the stack; once it returns `true`, traversal stops
 *                    and the text collected so far is returned.
 * @returns The collected text after whitespace and punctuation clean-up.
 */
export async function htmlToText(
    $: CheerioAPI,
    list: Node[],
    isFinalNode?: (node: FakeNode, level: number) => boolean
): Promise<string> {
    const nodes: [FakeNode, number][] = list.map(item => ([item, 0]));
    const contentNodes: string[] = [];

    while (nodes.length) {
        const [node, level] = nodes.pop();

        // Exhausted sibling chains (`null`) and empty literals carry no content.
        if (!node) continue;

        if (isFinalNode && isFinalNode(node, level)) break;

        if (node instanceof Comment) {
            // Omit the comment itself, but keep walking the siblings after it.
            nodes.push([node.nextSibling, level]);
            continue;
        }

        if (!(node instanceof Node)) {
            contentNodes.push(node);
            continue;
        }

        let result: (FakeNode | FakeNodeWithLevel)[] = [node];

        if (node instanceof Element) {
            const tag = node.tagName.toLowerCase();

            if (tag === 'script' || tag === 'style' || tag === 'th') {
                // Omit the element and its subtree, but not the siblings after it.
                nodes.push([node.nextSibling, level]);
                continue;
            }

            if (tag === 'code') {
                const text = $(node).text().trim();

                // Single-line code is emitted as one backtick-quoted token.
                if (!text.includes('\n')) {
                    contentNodes.push('`' + cleanText(text) + '`');
                    nodes.push([node.nextSibling, level]);
                    continue;
                }
            }

            if (tag === 'li')
                result = ['\n • ', ...result];

            // if (tag === 'a' && $(node).text().trim())
            //    result = ['[', ...result, ']'];

            if (tag === 'strong' || tag === 'b')
                result = ['*', ...result, '*'];

            if (tag === 'img') {
                const text = $(node).attr('alt') || $(node).attr('title');
                result = [text ? `<see ${text}>` : '<image>'];
            }
        }

        // A node with children is replaced by its first child, one level deeper.
        if (node instanceof NodeWithChildren && node.firstChild) result = result.map((item) => {
            if (item === node) return [node.firstChild, level + 1];
            return item;
        });

        // The stack is LIFO: the sibling goes in first so it is handled last,
        // and `result` is reversed so its entries come out in document order.
        nodes.push(
            [node.nextSibling, level],
            ...result.reverse().map(function(item): [FakeNode, number] {
                if (Array.isArray(item)) return [item[0], item[1]];

                if (item instanceof Node) {
                    // Only the current, childless node can still be here.
                    if (node instanceof Element || node instanceof Text || node instanceof CDATA)
                        return [cleanText($(node).text()), level];

                    // Any other childless node kind has no text. Re-queuing the
                    // node itself would make the loop pop it again forever.
                    return ['', level];
                }

                return [item, level];
            }));
    }

    return contentNodes
        .join(' ')

        // drop sample markers
        .replace(/\/\/sampleStart/g, '')
        .replace(/\/\/sampleEnd/g, '')

        // collapse runs of whitespace other than line breaks into one space
        .replace(/[^\S\r\n]+/g, ' ')

        // drop unnecessary spaces.
        // TODO: if more problems show up with code-snippet context (like the ":" in `class Foo : Bar`),
        //   it's better to move this into `cleanText` and make `htmlToText` skip empty lines so that trim works correctly.
        .replace(/ ([;)])/g, '$1')
        .replace(/ ([,?!:]( |$))/g, '$1') // Space before a symbol that must be followed by a space. Excludes '?:', '!=' for ex
        .replace(/( \.{3} )|( (\.( |$)))/g, '$1$3') // Space before dot. Exclude ' ... ' or '.method' for ex
        .replace(/([(]) /g, '$1') // space after (
        .trim();
}