1414
1515import { handlePreflight , jsonResponse , errorResponse , textResponse } from './cors' ;
1616import { robustFetch } from './fetch' ;
17- import { extractTitle , preprocessHtml } from './html' ;
17+ import { extractTitle , preprocessHtml , isWeChatArticle , extractWeChatContent } from './html' ;
1818import { collectImageUrls , rewriteImageUrls , uploadImages } from './r2' ;
1919import { fetchTimeout , fetchMaxAttempts } from './config' ;
2020
@@ -109,19 +109,26 @@ export default {
109109 contentType = directContentType || 'text/html' ;
110110 fileName = directFileName || 'content.html' ;
111111
112- // Encode content to ArrayBuffer
113- body = new TextEncoder ( ) . encode ( directContent ) . buffer as ArrayBuffer ;
114-
115- // For HTML content: preprocess and extract title
112+ // For HTML content: extract title first, then process content
116113 if ( isHtmlContent ( contentType ) ) {
117- const processed = preprocessHtml ( directContent ) ;
118- body = new TextEncoder ( ) . encode ( processed ) . buffer as ArrayBuffer ;
119-
120- // Extract title if no custom fileName was provided
114+ // Step 1: Extract title from original HTML (before any processing)
121115 if ( ! directFileName ) {
122116 const title = extractTitle ( directContent , 'content' ) ;
123117 fileName = `${ title } .html` ;
124118 }
119+
120+ // Step 2: Extract WeChat content if applicable
121+ let processedContent = directContent ;
122+ if ( isWeChatArticle ( directContent ) ) {
123+ processedContent = extractWeChatContent ( directContent ) ;
124+ }
125+
126+ // Step 3: Preprocess lazy-loaded images
127+ processedContent = preprocessHtml ( processedContent ) ;
128+ body = new TextEncoder ( ) . encode ( processedContent ) . buffer as ArrayBuffer ;
129+ } else {
130+ // Non-HTML content: encode directly
131+ body = new TextEncoder ( ) . encode ( directContent ) . buffer as ArrayBuffer ;
125132 }
126133 }
127134 // Branch 2: Fetch from URL
@@ -139,14 +146,23 @@ export default {
139146 body = await response . arrayBuffer ( ) ;
140147 fileName = getFileName ( targetUrl ) ;
141148
142- // For HTML content: preprocess lazy images and extract a better title
149+ // For HTML content: extract title first, then process content
143150 if ( isHtmlContent ( contentType ) ) {
144151 const rawHtml = new TextDecoder ( ) . decode ( body ) ;
145- const processed = preprocessHtml ( rawHtml ) ;
146- body = new TextEncoder ( ) . encode ( processed ) . buffer as ArrayBuffer ;
147152
153+ // Step 1: Extract title from original HTML (before any processing)
148154 const title = extractTitle ( rawHtml , fileName . replace ( / \. h t m l $ / , '' ) ) ;
149155 fileName = `${ title } .html` ;
156+
157+ // Step 2: Extract WeChat content if applicable
158+ let processedHtml = rawHtml ;
159+ if ( isWeChatArticle ( rawHtml ) ) {
160+ processedHtml = extractWeChatContent ( rawHtml ) ;
161+ }
162+
163+ // Step 3: Preprocess lazy-loaded images
164+ processedHtml = preprocessHtml ( processedHtml ) ;
165+ body = new TextEncoder ( ) . encode ( processedHtml ) . buffer as ArrayBuffer ;
150166 }
151167 } else {
152168 return errorResponse ( env , 'No URL or content provided.' ) ;
0 commit comments