@@ -13,6 +13,20 @@ const ATTRIBUTE_NAME = /[^\t\n\f />"'=]/;
1313
// Matches a single whitespace character. The srcset handling further down tests
// individual characters against it and splits each candidate on it to isolate the URL.
1414const WHITESPACE = / [ \s \n \r ] / ;
1515
/**
 * Values of a `<meta>` tag's `name` attribute (or, when `name` is absent, its
 * `property` attribute) whose `content` is treated as a URL: when the tag is
 * META, `content` is non-empty and the attribute value is in this set, `crawl`
 * resolves `content` against the current base and adds it to the collected hrefs.
 * Membership is checked with `Set.prototype.has`, i.e. an exact string match.
 */
16+ const CRAWLABLE_META_NAME_ATTRS = new Set ( [
17+ 'og:url' ,
18+ 'og:image' ,
19+ 'og:image:url' ,
20+ 'og:image:secure_url' ,
21+ 'og:video' ,
22+ 'og:video:url' ,
23+ 'og:video:secure_url' ,
24+ 'og:audio' ,
25+ 'og:audio:url' ,
26+ 'og:audio:secure_url' ,
27+ 'twitter:image'
28+ ] ) ;
29+
1630/**
1731 * @param {string } html
1832 * @param {string } base
@@ -81,6 +95,9 @@ export function crawl(html, base) {
8195
8296 const tag = html . slice ( start , i ) . toUpperCase ( ) ;
8397
98+ /** @type {Record<string, string> } */
99+ const attributes = { } ;
100+
84101 if ( tag === 'SCRIPT' || tag === 'STYLE' ) {
85102 while ( i < html . length ) {
86103 if (
@@ -95,9 +112,6 @@ export function crawl(html, base) {
95112 }
96113 }
97114
98- let href = '' ;
99- let rel = '' ;
100-
101115 while ( i < html . length ) {
102116 const start = i ;
103117
@@ -159,44 +173,7 @@ export function crawl(html, base) {
159173 }
160174
161175 value = decode ( value ) ;
162-
163- if ( name === 'href' ) {
164- if ( tag === 'BASE' ) {
165- base = resolve ( base , value ) ;
166- } else {
167- href = resolve ( base , value ) ;
168- }
169- } else if ( name === 'id' ) {
170- ids . push ( value ) ;
171- } else if ( name === 'name' ) {
172- if ( tag === 'A' ) ids . push ( value ) ;
173- } else if ( name === 'rel' ) {
174- rel = value ;
175- } else if ( name === 'src' ) {
176- if ( value ) hrefs . push ( resolve ( base , value ) ) ;
177- } else if ( name === 'srcset' ) {
178- const candidates = [ ] ;
179- let insideURL = true ;
180- value = value . trim ( ) ;
181- for ( let i = 0 ; i < value . length ; i ++ ) {
182- if (
183- value [ i ] === ',' &&
184- ( ! insideURL || ( insideURL && WHITESPACE . test ( value [ i + 1 ] ) ) )
185- ) {
186- candidates . push ( value . slice ( 0 , i ) ) ;
187- value = value . substring ( i + 1 ) . trim ( ) ;
188- i = 0 ;
189- insideURL = true ;
190- } else if ( WHITESPACE . test ( value [ i ] ) ) {
191- insideURL = false ;
192- }
193- }
194- candidates . push ( value ) ;
195- for ( const candidate of candidates ) {
196- const src = candidate . split ( WHITESPACE ) [ 0 ] ;
197- if ( src ) hrefs . push ( resolve ( base , src ) ) ;
198- }
199- }
176+ attributes [ name ] = value ;
200177 } else {
201178 i -= 1 ;
202179 }
@@ -205,8 +182,56 @@ export function crawl(html, base) {
205182 i += 1 ;
206183 }
207184
208- if ( href && ! / \b e x t e r n a l \b / i. test ( rel ) ) {
209- hrefs . push ( resolve ( base , href ) ) ;
185+ const { href, id, name, property, rel, src, srcset, content } = attributes ;
186+
187+ if ( href ) {
188+ if ( tag === 'BASE' ) {
189+ base = resolve ( base , href ) ;
190+ } else if ( ! rel || ! / \b e x t e r n a l \b / i. test ( rel ) ) {
191+ hrefs . push ( resolve ( base , href ) ) ;
192+ }
193+ }
194+
195+ if ( id ) {
196+ ids . push ( id ) ;
197+ }
198+
199+ if ( name && tag === 'A' ) {
200+ ids . push ( name ) ;
201+ }
202+
203+ if ( src ) {
204+ hrefs . push ( resolve ( base , src ) ) ;
205+ }
206+
207+ if ( srcset ) {
208+ let value = srcset ;
209+ const candidates = [ ] ;
210+ let insideURL = true ;
211+ value = value . trim ( ) ;
212+ for ( let i = 0 ; i < value . length ; i ++ ) {
213+ if ( value [ i ] === ',' && ( ! insideURL || ( insideURL && WHITESPACE . test ( value [ i + 1 ] ) ) ) ) {
214+ candidates . push ( value . slice ( 0 , i ) ) ;
215+ value = value . substring ( i + 1 ) . trim ( ) ;
216+ i = 0 ;
217+ insideURL = true ;
218+ } else if ( WHITESPACE . test ( value [ i ] ) ) {
219+ insideURL = false ;
220+ }
221+ }
222+ candidates . push ( value ) ;
223+ for ( const candidate of candidates ) {
224+ const src = candidate . split ( WHITESPACE ) [ 0 ] ;
225+ if ( src ) hrefs . push ( resolve ( base , src ) ) ;
226+ }
227+ }
228+
229+ if ( tag === 'META' && content ) {
230+ const attr = name ?? property ;
231+
232+ if ( attr && CRAWLABLE_META_NAME_ATTRS . has ( attr ) ) {
233+ hrefs . push ( resolve ( base , content ) ) ;
234+ }
210235 }
211236 }
212237 }
0 commit comments