Skip to content

Commit 9e8c329

Browse files
authored
Merge pull request #45 from LeonKohli/main
feat: use Xpath as selector
2 parents 2b5a2b8 + 0e8873d commit 9e8c329

File tree

1 file changed

+28
-8
lines changed

1 file changed

+28
-8
lines changed

src/main.ts

Lines changed: 28 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -7,11 +7,26 @@ import { Page } from "playwright";
7 7

8 8
let pageCounter = 0;
9 9

10-
export function getPageHtml(page: Page) {
10+
export function getPageHtml(page: Page, selector: string) {
11 11
return page.evaluate((selector) => {
12-
const el = document.querySelector(selector) as HTMLElement | null;
13-
return el?.innerText || "";
14-
}, config.selector);
12+
// Check if the selector is an XPath
13+
if (selector.startsWith('/')) {
14+
const elements = document.evaluate(selector, document, null, XPathResult.ANY_TYPE, null);
15+
let result = elements.iterateNext();
16+
return result ? result.textContent || "" : "";
17+
} else {
18+
// Handle as a CSS selector
19+
const el = document.querySelector(selector) as HTMLElement | null;
20+
return el?.innerText || "";
21+
}
22+
}, selector);
23+
}
24+
25+
export async function waitForXPath(page: Page, xpath: string, timeout: number) {
26+
await page.waitForFunction(xpath => {
27+
const elements = document.evaluate(xpath, document, null, XPathResult.ANY_TYPE, null);
28+
return elements.iterateNext() !== null;
29+
}, xpath, { timeout });
15 30
}
16 31

17 32
if (process.env.NO_CRAWL !== "true") {
@@ -35,11 +50,16 @@ if (process.env.NO_CRAWL !== "true") {
35 50
pageCounter++;
36 51
log.info(`Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`);
37 52

38-
await page.waitForSelector(config.selector, {
39-
timeout: config.waitForSelectorTimeout ?? 1000,
40-
});
53+
// Use custom handling for XPath selector
54+
if (config.selector.startsWith('/')) {
55+
await waitForXPath(page, config.selector, config.waitForSelectorTimeout ?? 1000);
56+
} else {
57+
await page.waitForSelector(config.selector, {
58+
timeout: config.waitForSelectorTimeout ?? 1000,
59+
});
60+
}
41 61

42-
const html = await getPageHtml(page);
62+
const html = await getPageHtml(page, config.selector);
43 63

44 64
// Save results as JSON to ./storage/datasets/default
45 65
await pushData({ title, url: request.loadedUrl, html });

0 commit comments

Comments
 (0)