
Commit 10a71ed

Merge pull request #26 from guillermoscript/sitemap-support
Refactor the getPageHtml function to handle the selector-not-found case by falling back to the body element. Add support for downloading URLs from sitemap.xml. Update comments to note that sitemaps are supported.
2 parents 5e7cf02 + 5221360 commit 10a71ed

File tree

4 files changed: +44 −6 lines


.DS_Store

−6 KB; binary file not shown.

README.md

Lines changed: 7 additions & 1 deletion
````diff
@@ -68,7 +68,7 @@ See [config.ts](src/config.ts) for all available options. Here is a sample of th
 
 ```ts
 type Config = {
-  /** URL to start the crawl */
+  /** URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */
   url: string;
   /** Pattern to match against for links on a page to subsequently crawl */
   match: string;
@@ -78,6 +78,12 @@ type Config = {
   maxPagesToCrawl: number;
   /** File name for the finished data */
   outputFileName: string;
+  /** Optional resources to exclude
+   *
+   * @example
+   * ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab']
+   */
+  resourceExclusions?: string[];
 };
 ```
````
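
For illustration, a hypothetical config exercising both new options is sketched below; the values and the short exclusion list are made up, and any other required fields from config.ts are omitted for brevity:

```ts
import { Config } from "./src/config";

export const config: Config = {
  // Ends in sitemap.xml, so the crawler downloads every URL listed in
  // the sitemap instead of starting here and following matching links.
  url: "https://www.builder.io/sitemap.xml",
  match: "https://www.builder.io/c/docs/**",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
  // Skip heavyweight static assets while crawling.
  resourceExclusions: ["png", "jpg", "jpeg", "gif", "svg", "css", "woff", "woff2"],
};
```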

src/config.ts

Lines changed: 8 additions & 1 deletion
```diff
@@ -6,8 +6,9 @@ const Page: z.ZodType<Page> = z.any();
 
 export const configSchema = z.object({
   /**
-   * URL to start the crawl
+   * URL to start the crawl, if url is a sitemap, it will crawl all pages in the sitemap
    * @example "https://www.builder.io/c/docs/developers"
+   * @example "https://www.builder.io/sitemap.xml"
    * @default ""
    */
   url: z.string(),
@@ -51,6 +52,12 @@ export const configSchema = z.object({
     .optional(),
   /** Optional timeout for waiting for a selector to appear */
   waitForSelectorTimeout: z.number().int().nonnegative().optional(),
+  /** Optional resources to exclude
+   *
+   * @example
+   * ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab']
+   */
+  resourceExclusions: z.array(z.string()).optional(),
 });
 
 export type Config = z.infer<typeof configSchema>;
```
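
Because the field is validated with zod, a malformed value fails at configSchema.parse before any crawling starts. A minimal standalone sketch of just this field's rule, mirroring the schema line above:

```ts
import { z } from "zod";

// Same shape as the resourceExclusions field added above.
const resourceExclusions = z.array(z.string()).optional();

resourceExclusions.parse(undefined);      // ok: the field is optional
resourceExclusions.parse(["png", "css"]); // ok: an array of extension strings
resourceExclusions.parse([42]);           // throws ZodError: 42 is not a string
```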

src/core.ts

Lines changed: 29 additions & 4 deletions
```diff
@@ -1,5 +1,5 @@
 // For more information, see https://crawlee.dev/
-import { PlaywrightCrawler } from "crawlee";
+import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import {Config, configSchema} from "./config.js";
@@ -45,7 +45,7 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) {
   );
 }
 
-export async function crawl(config: Config) {
+export async function crawl(config: Config) {
   configSchema.parse(config);
 
   if (process.env.NO_CRAWL !== "true") {
@@ -105,10 +105,35 @@ export async function crawl(config: Config) {
       maxRequestsPerCrawl: config.maxPagesToCrawl,
       // Uncomment this option to see the browser window.
       // headless: false,
+      preNavigationHooks: [
+        // Abort requests for certain resource types
+        async ({ page, log }) => {
+          // If there are no resource exclusions, return
+          const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? [];
+          if (RESOURCE_EXCLUSTIONS.length === 0) {
+            return;
+          }
+          await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, route => route.abort('aborted'));
+          log.info(`Aborting requests for as this is a resource excluded route`);
+        }
+      ],
     });
 
-    // Add first URL to the queue and start the crawl.
-    await crawler.run([config.url]);
+    const SITEMAP_SUFFIX = "sitemap.xml";
+    const isUrlASitemap = config.url.endsWith(SITEMAP_SUFFIX);
+
+    if (isUrlASitemap) {
+      const listOfUrls = await downloadListOfUrls({ url: config.url });
+
+      // Add the initial URL to the crawling queue.
+      await crawler.addRequests(listOfUrls);
+
+      // Run the crawler
+      await crawler.run();
+    } else {
+      // Add first URL to the queue and start the crawl.
+      await crawler.run([config.url]);
+    }
   }
 }
```
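
The hook's route pattern relies on glob brace expansion: RESOURCE_EXCLUSTIONS.join() yields a comma-separated list, so the pattern expands to something like `**/*.{png,jpg,css}` and matches any URL whose path ends in one of those extensions. A standalone Playwright sketch of the same idea outside Crawlee (the target URL is only an example):

```ts
import { chromium } from "playwright";

const exclusions = ["png", "jpg", "css"];

const browser = await chromium.launch();
const page = await browser.newPage();

// Abort any request whose URL matches an excluded extension;
// the HTML document itself still loads normally.
await page.route(`**/*.{${exclusions.join()}}`, (route) => route.abort("aborted"));

await page.goto("https://example.com");
await browser.close();
```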
