Skip to content

Commit 98a645a

Browse files
Merge branch 'main' into multiple-files
2 parents 69d895e + 0c53280 commit 98a645a

File tree

9 files changed

+74
-48
lines changed

9 files changed

+74
-48
lines changed

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@ jobs:
2020
- run: npm run build
2121
- uses: preactjs/compressed-size-action@v2
2222
with:
23-
pattern: ".dist/**/*.{js,ts,json}"
23+
pattern: ".dist/**/*.{js,ts,json}"

.github/workflows/release.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@ jobs:
1313
- uses: actions/checkout@v2
1414
- uses: actions/setup-node@v2
1515
with:
16-
cache: npm
17-
node-version: 18
16+
cache: npm
17+
node-version: 18
1818
- run: npm i
1919
- run: npm run build
2020
- run: npm run semantic-release
2121
env:
2222
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
23-
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
23+
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}

.github/workflows/test.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
name: Test workflow
2+
3+
on: [push, pull_request]
4+
5+
jobs:
6+
prettier_check:
7+
runs-on: ubuntu-latest
8+
9+
steps:
10+
- uses: actions/checkout@v3
11+
- name: Set up Node.js
12+
uses: actions/setup-node@v2
13+
with:
14+
node-version: "20"
15+
- name: Install Dependencies
16+
run: npm ci
17+
- name: Run prettier
18+
run: npm run prettier:check

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -78,16 +78,16 @@ type Config = {
7878
maxPagesToCrawl: number;
7979
/** File name for the finished data */
8080
outputFileName: string;
81-
/** Optional resources to exclude
82-
*
81+
/** Optional resources to exclude
82+
*
8383
* @example
8484
* ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab']
85-
*/
86-
resourceExclusions?: string[];
85+
*/
86+
resourceExclusions?: string[];
8787
/** Optional maximum file size in megabytes to include in the output file */
8888
maxFileSize?: number,
8989
/** Optional maximum number of tokens to include in the output file */
90-
maxTokens?: number().,
90+
maxTokens?: number,
9191
};
9292
```
9393

containerapp/data/config.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@ export const defaultConfig: Config = {
55
match: "https://www.builder.io/c/docs/**",
66
maxPagesToCrawl: 50,
77
outputFileName: "../data/output.json",
8-
};
8+
};

package-lock.json

Lines changed: 4 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@builder.io/gpt-crawler",
3-
"version": "1.0.0",
3+
"version": "1.1.0",
44
"type": "module",
55
"bin": {
66
"gpt-crawler": "./dist/src/cli.js"
@@ -14,7 +14,6 @@
1414
"gpt-tokenizer": "^2.1.2",
1515
"inquirer": "^9.2.12",
1616
"playwright": "*",
17-
"prettier": "^3.1.0",
1817
"zod": "^3.22.4"
1918
},
2019
"devDependencies": {
@@ -23,6 +22,7 @@
2322
"@semantic-release/git": "^10.0.1",
2423
"@types/inquirer": "^9.0.7",
2524
"@types/node": "^20.0.0",
25+
"prettier": "^3.1.0",
2626
"semantic-release": "^22.0.8",
2727
"ts-node": "^10.8.0",
2828
"typescript": "^5.0.0"
@@ -35,7 +35,8 @@
3535
"start:dev": "cross-env NODE_ENV=development npm run build && node dist/src/main.js",
3636
"start:prod": "node dist/src/main.js",
3737
"build": "tsc",
38-
"fmt": "prettier --write ."
38+
"fmt": "prettier --write .",
39+
"prettier:check": "prettier --check ."
3940
},
4041
"author": "It's not you it's me",
4142
"license": "ISC"

src/config.ts

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { z } from 'zod';
1+
import { z } from "zod";
22

33
import type { Page } from "playwright";
44

@@ -36,27 +36,30 @@ export const configSchema = z.object({
3636
*/
3737
outputFileName: z.string(),
3838
/** Optional cookie to be set. E.g. for Cookie Consent */
39-
cookie: z.object({
40-
name: z.string(),
41-
value: z.string(),
42-
}).optional(),
39+
cookie: z
40+
.object({
41+
name: z.string(),
42+
value: z.string(),
43+
})
44+
.optional(),
4345
/** Optional function to run for each page found */
44-
onVisitPage: z.function()
45-
.args(z.object({
46+
onVisitPage: z
47+
.function()
48+
.args(
49+
z.object({
4650
page: Page,
47-
pushData: z.function()
48-
.args(z.any())
49-
.returns(z.promise(z.void()))
50-
}))
51-
.returns(z.promise(z.void()))
52-
.optional(),
51+
pushData: z.function().args(z.any()).returns(z.promise(z.void())),
52+
}),
53+
)
54+
.returns(z.promise(z.void()))
55+
.optional(),
5356
/** Optional timeout for waiting for a selector to appear */
5457
waitForSelectorTimeout: z.number().int().nonnegative().optional(),
55-
/** Optional resources to exclude
56-
*
57-
* @example
58-
* ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab']
59-
*/
58+
/** Optional resources to exclude
59+
*
60+
* @example
61+
* ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab']
62+
*/
6063
resourceExclusions: z.array(z.string()).optional(),
6164

6265
/** Optional maximum file size in megabytes to include in the output file
@@ -70,4 +73,3 @@ export const configSchema = z.object({
7073
});
7174

7275
export type Config = z.infer<typeof configSchema>;
73-

src/core.ts

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
33
import { readFile, writeFile } from "fs/promises";
44
import { glob } from "glob";
5-
import {Config, configSchema} from "./config.js";
5+
import { Config, configSchema } from "./config.js";
66
import { Page } from "playwright";
77
import {
88
isWithinTokenLimit,
@@ -19,7 +19,7 @@ export function getPageHtml(page: Page, selector = "body") {
1919
document,
2020
null,
2121
XPathResult.ANY_TYPE,
22-
null
22+
null,
2323
);
2424
let result = elements.iterateNext();
2525
return result ? result.textContent || "" : "";
@@ -39,16 +39,16 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) {
3939
document,
4040
null,
4141
XPathResult.ANY_TYPE,
42-
null
42+
null,
4343
);
4444
return elements.iterateNext() !== null;
4545
},
4646
xpath,
47-
{ timeout }
47+
{ timeout },
4848
);
4949
}
5050

51-
export async function crawl(config: Config) {
51+
export async function crawl(config: Config) {
5252
configSchema.parse(config);
5353

5454
if (process.env.NO_CRAWL !== "true") {
@@ -70,7 +70,7 @@ export async function crawl(config: Config) {
7070
const title = await page.title();
7171
pageCounter++;
7272
log.info(
73-
`Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`
73+
`Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
7474
);
7575

7676
// Use custom handling for XPath selector
@@ -79,7 +79,7 @@ export async function crawl(config: Config) {
7979
await waitForXPath(
8080
page,
8181
config.selector,
82-
config.waitForSelectorTimeout ?? 1000
82+
config.waitForSelectorTimeout ?? 1000,
8383
);
8484
} else {
8585
await page.waitForSelector(config.selector, {
@@ -116,21 +116,25 @@ export async function crawl(config: Config) {
116116
if (RESOURCE_EXCLUSTIONS.length === 0) {
117117
return;
118118
}
119-
await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, route => route.abort('aborted'));
120-
log.info(`Aborting requests for as this is a resource excluded route`);
121-
}
119+
await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, (route) =>
120+
route.abort("aborted"),
121+
);
122+
log.info(
123+
`Aborting requests for as this is a resource excluded route`,
124+
);
125+
},
122126
],
123127
});
124128

125129
const SITEMAP_SUFFIX = "sitemap.xml";
126130
const isUrlASitemap = config.url.endsWith(SITEMAP_SUFFIX);
127-
131+
128132
if (isUrlASitemap) {
129133
const listOfUrls = await downloadListOfUrls({ url: config.url });
130-
134+
131135
// Add the initial URL to the crawling queue.
132136
await crawler.addRequests(listOfUrls);
133-
137+
134138
// Run the crawler
135139
await crawler.run();
136140
} else {

0 commit comments

Comments
 (0)