@@ -7,11 +7,26 @@ import { Page } from "playwright";
7
7
8
8
let pageCounter = 0 ;
9
9
10
/**
 * Extracts the text of the first node matched by `selector` on the page.
 *
 * A selector that starts with `/` is evaluated as an XPath expression;
 * anything else is treated as a CSS selector.
 *
 * @param page - Playwright page whose document is queried.
 * @param selector - XPath expression (leading `/`) or CSS selector.
 * @returns The matched node's text, or an empty string when nothing matches.
 */
export function getPageHtml(page: Page, selector: string): Promise<string> {
  // The callback is handed to `page.evaluate`, so it only relies on its own
  // argument and on browser globals (`document`, `XPathResult`, `HTMLElement`).
  return page.evaluate((selector) => {
    if (selector.startsWith("/")) {
      // FIRST_ORDERED_NODE_TYPE returns the first match in document order,
      // mirroring `querySelector`. ANY_TYPE would produce an unordered
      // iterator whose first item is not guaranteed to be the first match.
      const match = document.evaluate(
        selector,
        document,
        null,
        XPathResult.FIRST_ORDERED_NODE_TYPE,
        null,
      ).singleNodeValue;
      if (match === null) {
        return "";
      }
      // Use `innerText` for HTML elements so XPath and CSS selectors yield the
      // same text for the same element; other node kinds (text, attribute,
      // SVG, ...) only expose `textContent`.
      const text =
        match instanceof HTMLElement ? match.innerText : match.textContent;
      return text ?? "";
    }

    // Handle as a CSS selector.
    const el = document.querySelector<HTMLElement>(selector);
    return el?.innerText ?? "";
  }, selector);
}
24
+
25
/**
 * Waits until the XPath expression matches at least one node in the page's
 * document.
 *
 * @param page - Playwright page to poll.
 * @param xpath - XPath expression to evaluate against `document`.
 * @param timeout - Forwarded as the `timeout` option of
 *   `page.waitForFunction`.
 */
export async function waitForXPath(
  page: Page,
  xpath: string,
  timeout: number,
): Promise<void> {
  await page.waitForFunction(
    // Only the first match is needed to prove existence, so ask the XPath
    // engine for a single node instead of building an iterator.
    (xpath) =>
      document.evaluate(
        xpath,
        document,
        null,
        XPathResult.FIRST_ORDERED_NODE_TYPE,
        null,
      ).singleNodeValue !== null,
    xpath,
    { timeout },
  );
}
16
31
17
32
if ( process . env . NO_CRAWL !== "true" ) {
@@ -35,11 +50,16 @@ if (process.env.NO_CRAWL !== "true") {
35
50
pageCounter ++ ;
36
51
log . info ( `Crawling: Page ${ pageCounter } / ${ config . maxPagesToCrawl } - URL: ${ request . loadedUrl } ...` ) ;
37
52
38
- await page . waitForSelector ( config . selector , {
39
- timeout : config . waitForSelectorTimeout ?? 1000 ,
40
- } ) ;
53
+ // Use custom handling for XPath selector
54
+ if ( config . selector . startsWith ( '/' ) ) {
55
+ await waitForXPath ( page , config . selector , config . waitForSelectorTimeout ?? 1000 ) ;
56
+ } else {
57
+ await page . waitForSelector ( config . selector , {
58
+ timeout : config . waitForSelectorTimeout ?? 1000 ,
59
+ } ) ;
60
+ }
41
61
42
- const html = await getPageHtml ( page ) ;
62
+ const html = await getPageHtml ( page , config . selector ) ;
43
63
44
64
// Save results as JSON to ./storage/datasets/default
45
65
await pushData ( { title, url : request . loadedUrl , html } ) ;
0 commit comments