refactor: extract title before WeChat content extraction

yanglbme · yanglbme · commit f00a9123e8cf · 2026-02-08T19:14:12.000+08:00
- Extract the title from the original HTML, before WeChat content extraction discards the page's <head> (title and meta tags)
- Optimize processing order: title -> WeChat content -> lazy images
- Improve code clarity with step-by-step comments
diff --git a/src/html.ts b/src/html.ts
@@ -1,8 +1,50 @@
 /**
  * HTML processing utilities
- * Title extraction, content escaping, and lazy-image preprocessing.
+ * Title extraction, content escaping, lazy-image preprocessing, and WeChat article extraction.
  */
 
+/**
+ * Heuristically decide whether `html` is a WeChat Official Account article page.
+ * Returns true only when it contains both the literal `id="js_content"` and the substring `rich_media_content`.
+ */
+export function isWeChatArticle(html: string): boolean {
+	// Plain case-sensitive substring tests, no DOM parsing; a single-quoted id='js_content' is not detected here.
+	return (
+		html.includes('id="js_content"') &&
+		html.includes('rich_media_content')
+	);
+}
+
+/**
+ * Pull the inner HTML of a `<div>` whose id is `js_content` (either quote style) and wrap it in a minimal standalone document; returns `html` unchanged when nothing matches or the capture is the empty string.
+ * Limitation: the capture is lazy and stops at the first `</div>` followed by `</div>` or `<script`, so content that itself contains consecutive closing div tags is cut off at that point.
+ */
+export function extractWeChatContent(html: string): string {
+	// Group 1 lazily captures everything after the opening js_content <div> tag up to the closing-div boundary.
+	const contentMatch = html.match(/<div[^>]*id=["']js_content["'][^>]*>([\s\S]*?)<\/div>\s*(?:<\/div>|<script)/i);
+
+	if (contentMatch?.[1]) {
+		const content = contentMatch[1].trim();
+
+		// Re-wrap the trimmed fragment in a bare document: only a utf-8 charset meta is emitted in <head>,
+		// so the source page's <title>, other meta tags, scripts and surrounding markup are dropped.
+		return `<!DOCTYPE html>
+<html>
+<head>
+<meta charset="utf-8">
+</head>
+<body>
+<div id="js_content">
+${content}
+</div>
+</body>
+</html>`;
+	}
+
+	// No usable match: fall back to the untouched input.
+	return html;
+}
+
 /**
  * Extract a meaningful page title from raw HTML.
  *
diff --git a/src/index.ts b/src/index.ts
@@ -14,7 +14,7 @@
 
 import { handlePreflight, jsonResponse, errorResponse, textResponse } from './cors';
 import { robustFetch } from './fetch';
-import { extractTitle, preprocessHtml } from './html';
+import { extractTitle, preprocessHtml, isWeChatArticle, extractWeChatContent } from './html';
 import { collectImageUrls, rewriteImageUrls, uploadImages } from './r2';
 import { fetchTimeout, fetchMaxAttempts } from './config';
 
@@ -109,19 +109,26 @@ export default {
 				contentType = directContentType || 'text/html';
 				fileName = directFileName || 'content.html';
 
-				// Encode content to ArrayBuffer
-				body = new TextEncoder().encode(directContent).buffer as ArrayBuffer;
-
-				// For HTML content: preprocess and extract title
+				// For HTML content: extract title first, then process content
 				if (isHtmlContent(contentType)) {
-					const processed = preprocessHtml(directContent);
-					body = new TextEncoder().encode(processed).buffer as ArrayBuffer;
-
-					// Extract title if no custom fileName was provided
+					// Step 1: Extract title from original HTML (before any processing)
 					if (!directFileName) {
 						const title = extractTitle(directContent, 'content');
 						fileName = `${title}.html`;
 					}
+
+					// Step 2: Extract WeChat content if applicable
+					let processedContent = directContent;
+					if (isWeChatArticle(directContent)) {
+						processedContent = extractWeChatContent(directContent);
+					}
+
+					// Step 3: Preprocess lazy-loaded images
+					processedContent = preprocessHtml(processedContent);
+					body = new TextEncoder().encode(processedContent).buffer as ArrayBuffer;
+				} else {
+					// Non-HTML content: encode directly
+					body = new TextEncoder().encode(directContent).buffer as ArrayBuffer;
 				}
 			}
 			// Branch 2: Fetch from URL
@@ -139,14 +146,23 @@ export default {
 				body = await response.arrayBuffer();
 				fileName = getFileName(targetUrl);
 
-				// For HTML content: preprocess lazy images and extract a better title
+				// For HTML content: extract title first, then process content
 				if (isHtmlContent(contentType)) {
 					const rawHtml = new TextDecoder().decode(body);
-					const processed = preprocessHtml(rawHtml);
-					body = new TextEncoder().encode(processed).buffer as ArrayBuffer;
 
+					// Step 1: Extract title from original HTML (before any processing)
 					const title = extractTitle(rawHtml, fileName.replace(/\.html$/, ''));
 					fileName = `${title}.html`;
+
+					// Step 2: Extract WeChat content if applicable
+					let processedHtml = rawHtml;
+					if (isWeChatArticle(rawHtml)) {
+						processedHtml = extractWeChatContent(rawHtml);
+					}
+
+					// Step 3: Preprocess lazy-loaded images
+					processedHtml = preprocessHtml(processedHtml);
+					body = new TextEncoder().encode(processedHtml).buffer as ArrayBuffer;
 				}
 			} else {
 				return errorResponse(env, 'No URL or content provided.');