11import { parseHTML } from 'linkedom'
2- import { parse , walk } from 'css-tree '
2+ import { parse , walk } from '@projectwallace/ css-parser '
33import { resolve_url } from '../../../lib/resolve-url.js'
44
55export const USER_AGENT = 'Project Wallace CSS Scraper/1.1 (+https://www.projectwallace.com/docs/css-scraper)'
@@ -8,19 +8,22 @@ function is_wayback_url(url: string) {
88 return / ^ (?: (?: h t t p s : ) ? \/ \/ ) ? w e b \. a r c h i v e \. o r g \/ w e b \/ \d { 14 } \/ .+ / . test ( url )
99}
1010
/**
 * Strip one leading and/or one trailing quote character (`'` or `"`) from a string.
 *
 * A value such as `"theme.css"` or `'theme.css'` becomes `theme.css`.
 * Strings that do not start or end with a quote are returned unchanged,
 * and quotes in the middle of the string are never touched.
 *
 * @param str - The possibly quoted string.
 * @returns `str` without its leading and trailing quote character.
 */
function unquote(str: string): string {
	// The two anchored patterns must be alternatives. Written back to back they
	// only match a string made of exactly two quote characters, so real quoted
	// values would pass through untouched.
	return str.replace(/^['"]|['"]$/g, '')
}
14+
1115function get_import_urls ( css : string ) {
16+ let urls : string [ ] = [ ]
1217 let ast = parse ( css , {
13- parseAtrulePrelude : false ,
14- parseRulePrelude : false ,
15- parseValue : false ,
16- parseCustomProperty : false
18+ parse_selectors : false ,
19+ parse_values : false
1720 } )
18- let urls : string [ ] = [ ]
19-
20- walk ( ast , function ( node ) {
21- // Can not be a URL inside something else because otherwise this.atrule could never be an import
22- if ( node . type === 'Url' && this . atrule ?. name === 'import' ) {
23- urls . push ( node . value )
21+ walk ( ast , ( node ) => {
22+ if ( node . type_name === 'Atrule' && node . name === 'import' ) {
23+ let url = node . children . find ( ( child ) => child . type_name === 'Url' )
24+ if ( url ) {
25+ urls . push ( unquote ( url . value as string ) )
26+ }
2427 }
2528 } )
2629 return urls
@@ -31,17 +34,17 @@ async function get_css_file(url: string | URL, abort_signal: AbortSignal) {
3134 let response = await fetch ( url , {
3235 headers : {
3336 'User-Agent' : USER_AGENT ,
34- ' Accept' : 'text/css,*/*;q=0.1'
37+ Accept : 'text/css,*/*;q=0.1'
3538 } ,
3639 // If aborted early try to return an empty string so we can continue with just the content we have
37- signal : abort_signal ,
40+ signal : abort_signal
3841 } )
3942
4043 if ( ! response . ok ) {
4144 throw new Error ( response . statusText )
4245 }
4346 return response . text ( )
44- } catch ( error : unknown ) {
47+ } catch {
4548 return ''
4649 }
4750}
@@ -66,7 +69,7 @@ function get_styles(nodes: NodeListOf<Element>, base_url: string) {
6669 items . push ( {
6770 type : 'style' ,
6871 css,
69- url : base_url ,
72+ url : base_url
7073 } )
7174 } else if ( node . hasAttribute ( 'style' ) ) {
7275 let declarations = ( node . getAttribute ( 'style' ) || '' ) . trim ( )
@@ -84,15 +87,15 @@ function get_styles(nodes: NodeListOf<Element>, base_url: string) {
8487 class_name += '.'
8588 class_name += class_attr
8689 . split ( / \s + / g)
87- . filter ( s => {
90+ . filter ( ( s ) => {
8891 if ( s . length === 0 ) return false
8992 if ( s . length === 1 ) {
9093 let code = s . charCodeAt ( 0 )
9194 if ( code < 48 || code > 122 ) return false
9295 }
9396 return true
9497 } )
95- . map ( s => s . replaceAll ( / ( \[ | \] | : | \. | \/ ) / g, '\\$1' ) )
98+ . map ( ( s ) => s . replaceAll ( / ( \[ | \] | : | \. | \/ ) / g, '\\$1' ) )
9699 . join ( '.' )
97100 }
98101 let node_name = node . nodeName . toLocaleLowerCase ( )
@@ -115,9 +118,7 @@ function get_styles(nodes: NodeListOf<Element>, base_url: string) {
115118 return items
116119}
117120
118- export async function get_css ( url : string , {
119- timeout = 10000 ,
120- } = { } ) {
121+ export async function get_css ( url : string , { timeout = 10000 } = { } ) {
121122 let resolved_url = resolve_url ( url )
122123
123124 if ( resolved_url === undefined ) {
@@ -140,7 +141,7 @@ export async function get_css(url: string, {
140141 signal : abort_controller . signal ,
141142 headers : {
142143 'User-Agent' : USER_AGENT ,
143- ' Accept' : 'text/html,*/*;q=0.1'
144+ Accept : 'text/html,*/*;q=0.1'
144145 }
145146 } )
146147
@@ -160,23 +161,24 @@ export async function get_css(url: string, {
160161 error : {
161162 url,
162163 statusCode : 403 ,
163- message : "The origin server responded with a 403 Forbidden status code which means that scraping CSS is blocked. Is the URL publicly accessible?"
164+ message :
165+ 'The origin server responded with a 403 Forbidden status code which means that scraping CSS is blocked. Is the URL publicly accessible?'
164166 }
165167 }
166168 }
167169
168170 // Examples: localhost, sduhsdf.test
169171 if ( error . message === 'fetch failed' ) {
170- let message = " The origin server is refusing connections."
172+ let message = ' The origin server is refusing connections.'
171173 if ( url . includes ( 'localhost' ) || url . includes ( '192.168' ) || url . includes ( '127.0.0.1' ) ) {
172- message += " You are trying to scrape a local server. Make sure to use a public URL."
174+ message += ' You are trying to scrape a local server. Make sure to use a public URL.'
173175 }
174176
175177 return {
176178 error : {
177179 url,
178180 statusCode : 400 ,
179- message,
181+ message
180182 }
181183 }
182184 }
@@ -187,7 +189,7 @@ export async function get_css(url: string, {
187189 error : {
188190 url,
189191 statusCode : 404 ,
190- message : " The origin server responded with a 404 Not Found status code."
192+ message : ' The origin server responded with a 404 Not Found status code.'
191193 }
192194 }
193195 }
@@ -198,7 +200,7 @@ export async function get_css(url: string, {
198200 error : {
199201 url,
200202 statusCode : 500 ,
201- message : 'something went wrong' ,
203+ message : 'something went wrong'
202204 }
203205 }
204206 }
@@ -238,7 +240,8 @@ export async function get_css(url: string, {
238240
239241 let nodes = document . querySelectorAll ( 'link[rel*="stylesheet"][href], style, [style]' )
240242 let baseElement = document . querySelector ( 'base[href]' )
241- let baseUrl = ( baseElement !== null && baseElement . hasAttribute ( 'href' ) ) ? baseElement . getAttribute ( 'href' ) : resolved_url
243+ let baseUrl =
244+ baseElement !== null && baseElement . hasAttribute ( 'href' ) ? baseElement . getAttribute ( 'href' ) : resolved_url
242245 let items = get_styles ( nodes , baseUrl ?. toString ( ) || '' ) || [ ]
243246 let result = [ ]
244247
@@ -271,7 +274,9 @@ export async function get_css(url: string, {
271274 // And c'mon, don't @import inside your @import.
272275 let importUrls = get_import_urls ( item . css )
273276 if ( importUrls . length > 0 ) {
274- let cssRequests = importUrls . map ( ( importUrl ) => get_css_file ( resolve_url ( importUrl , url ) ! , abort_controller . signal ) )
277+ let cssRequests = importUrls . map ( ( importUrl ) =>
278+ get_css_file ( resolve_url ( importUrl , url ) ! , abort_controller . signal )
279+ )
275280 let importedFiles = await Promise . all ( cssRequests )
276281 importedFiles . forEach ( ( css , index ) => {
277282 result . push ( {
@@ -284,7 +289,6 @@ export async function get_css(url: string, {
284289 }
285290 }
286291
287-
288292 clearTimeout ( timeout_id )
289293
290294 return result
0 commit comments