Skip to content

Commit f00a912

Browse files
committed
refactor: extract title before WeChat content extraction
- Extract title from original HTML to preserve meta tags
- Optimize processing order: title -> WeChat content -> lazy images
- Improve code clarity with step-by-step comments
1 parent 93e378d commit f00a912

File tree

2 files changed

+71
-13
lines changed

2 files changed

+71
-13
lines changed

src/html.ts

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,50 @@
11
/**
22
* HTML processing utilities
3-
* Title extraction, content escaping, and lazy-image preprocessing.
3+
* Title extraction, content escaping, lazy-image preprocessing, and WeChat article extraction.
44
*/
55

6+
/**
 * Check whether the HTML looks like a WeChat Official Account article.
 *
 * This is a plain-text heuristic, not an HTML parse. Both markers must be
 * present somewhere in the document:
 *  - an `id` attribute equal to `js_content` (single- or double-quoted), and
 *  - the substring `rich_media_content`.
 *
 * @param html - Raw HTML of the page to inspect.
 * @returns `true` when both markers are found, otherwise `false`.
 */
export function isWeChatArticle(html: string): boolean {
  // Accept either quote style so that detection agrees with the pattern
  // extractWeChatContent uses to locate the same container.
  const hasContentContainer = /id=["']js_content["']/.test(html);
  return hasContentContainer && html.includes('rich_media_content');
}
17+
18+
/**
 * Extract the article body from the `js_content` container of a WeChat page.
 *
 * The container's opening `<div>` is located first, then `<div>` / `</div>`
 * tags are balanced from that point so that nested `<div>` elements inside
 * the article are kept intact instead of ending the match early.
 *
 * Limitation: tags are counted textually, so `<div>` / `</div>` strings that
 * appear inside comments or inline scripts within the article are counted too.
 *
 * @param html - Raw HTML of the page.
 * @returns A minimal HTML document wrapping the extracted content, or the
 *   original `html` unchanged when the container is missing, is never
 *   closed, or is empty after trimming.
 */
export function extractWeChatContent(html: string): string {
  const openTag = /<div[^>]*id=["']js_content["'][^>]*>/i.exec(html);
  if (!openTag) {
    return html;
  }

  const contentStart = openTag.index + openTag[0].length;

  // Walk every <div ...> / </div> after the container's opening tag and
  // track nesting depth; the container closes when depth returns to zero.
  const divTag = /<(\/?)div\b[^>]*>/gi;
  divTag.lastIndex = contentStart;

  let depth = 1;
  let contentEnd = -1;
  let tag = divTag.exec(html);
  while (tag !== null) {
    depth += tag[1] ? -1 : 1;
    if (depth === 0) {
      contentEnd = tag.index;
      break;
    }
    tag = divTag.exec(html);
  }

  // Unbalanced markup: fall back to the untouched page rather than guess.
  if (contentEnd === -1) {
    return html;
  }

  const content = html.slice(contentStart, contentEnd).trim();
  if (!content) {
    return html;
  }

  // Build a minimal HTML structure with the extracted content.
  // This preserves the content while removing unnecessary page elements.
  return [
    '<!DOCTYPE html>',
    '<html>',
    '<head>',
    '<meta charset="utf-8">',
    '</head>',
    '<body>',
    '<div id="js_content">',
    content,
    '</div>',
    '</body>',
    '</html>',
  ].join('\n');
}
47+
648
/**
749
* Extract a meaningful page title from raw HTML.
850
*

src/index.ts

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
import { handlePreflight, jsonResponse, errorResponse, textResponse } from './cors';
1616
import { robustFetch } from './fetch';
17-
import { extractTitle, preprocessHtml } from './html';
17+
import { extractTitle, preprocessHtml, isWeChatArticle, extractWeChatContent } from './html';
1818
import { collectImageUrls, rewriteImageUrls, uploadImages } from './r2';
1919
import { fetchTimeout, fetchMaxAttempts } from './config';
2020

@@ -109,19 +109,26 @@ export default {
109109
contentType = directContentType || 'text/html';
110110
fileName = directFileName || 'content.html';
111111

112-
// Encode content to ArrayBuffer
113-
body = new TextEncoder().encode(directContent).buffer as ArrayBuffer;
114-
115-
// For HTML content: preprocess and extract title
112+
// For HTML content: extract title first, then process content
116113
if (isHtmlContent(contentType)) {
117-
const processed = preprocessHtml(directContent);
118-
body = new TextEncoder().encode(processed).buffer as ArrayBuffer;
119-
120-
// Extract title if no custom fileName was provided
114+
// Step 1: Extract title from original HTML (before any processing)
121115
if (!directFileName) {
122116
const title = extractTitle(directContent, 'content');
123117
fileName = `${title}.html`;
124118
}
119+
120+
// Step 2: Extract WeChat content if applicable
121+
let processedContent = directContent;
122+
if (isWeChatArticle(directContent)) {
123+
processedContent = extractWeChatContent(directContent);
124+
}
125+
126+
// Step 3: Preprocess lazy-loaded images
127+
processedContent = preprocessHtml(processedContent);
128+
body = new TextEncoder().encode(processedContent).buffer as ArrayBuffer;
129+
} else {
130+
// Non-HTML content: encode directly
131+
body = new TextEncoder().encode(directContent).buffer as ArrayBuffer;
125132
}
126133
}
127134
// Branch 2: Fetch from URL
@@ -139,14 +146,23 @@ export default {
139146
body = await response.arrayBuffer();
140147
fileName = getFileName(targetUrl);
141148

142-
// For HTML content: preprocess lazy images and extract a better title
149+
// For HTML content: extract title first, then process content
143150
if (isHtmlContent(contentType)) {
144151
const rawHtml = new TextDecoder().decode(body);
145-
const processed = preprocessHtml(rawHtml);
146-
body = new TextEncoder().encode(processed).buffer as ArrayBuffer;
147152

153+
// Step 1: Extract title from original HTML (before any processing)
148154
const title = extractTitle(rawHtml, fileName.replace(/\.html$/, ''));
149155
fileName = `${title}.html`;
156+
157+
// Step 2: Extract WeChat content if applicable
158+
let processedHtml = rawHtml;
159+
if (isWeChatArticle(rawHtml)) {
160+
processedHtml = extractWeChatContent(rawHtml);
161+
}
162+
163+
// Step 3: Preprocess lazy-loaded images
164+
processedHtml = preprocessHtml(processedHtml);
165+
body = new TextEncoder().encode(processedHtml).buffer as ArrayBuffer;
150166
}
151167
} else {
152168
return errorResponse(env, 'No URL or content provided.');

0 commit comments

Comments
 (0)