Commit 4ccf3b3

Merge pull request #52 from adityak74/api-server-crawler
feat: create crawler api server
2 parents c34bde5 + 7707146

File tree

9 files changed: +8931 −502 lines

.env.example

Lines changed: 4 additions & 0 deletions
```diff
@@ -0,0 +1,4 @@
+API_PORT=5000
+API_HOST=localhost
+MAX_PAGES_TO_CRAWL=45
+NODE_ENV=development
```
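These values are loaded into `process.env` by the `configDotenv()` calls added in `src/config.ts` and `src/server.ts` below. Note that the server has its own built-in fallbacks, so the example port here (5000) differs from the default used when no `.env` is present (3000). A minimal sketch of the resolution order, mirroring the logic in `src/server.ts`:

```typescript
import { configDotenv } from "dotenv";

configDotenv(); // reads .env into process.env, if the file exists

// .env value wins when set; otherwise the hardcoded fallback applies.
const port = Number(process.env.API_PORT) || 3000; // 5000 with this .env
const hostname = process.env.API_HOST || "localhost";
```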

.gitignore

Lines changed: 1 addition & 0 deletions
```diff
@@ -14,4 +14,5 @@ storage
 
 # any output from the crawler
 *.json
+.env
 pnpm-lock.yaml
```

package-lock.json

Lines changed: 8810 additions & 496 deletions
Some generated files are not rendered by default.

package.json

Lines changed: 15 additions & 2 deletions
```diff
@@ -8,29 +8,42 @@
   "description": "Crawl a site to generate knowledge files to create your own custom GPT",
   "dependencies": {
     "commander": "^11.1.0",
+    "cors": "^2.8.5",
     "crawlee": "^3.0.0",
+    "dotenv": "^16.3.1",
+    "express": "^4.18.2",
+    "express-fileupload": "^1.4.3",
     "cross-env": "^7.0.3",
     "glob": "^10.3.10",
     "gpt-tokenizer": "^2.1.2",
     "inquirer": "^9.2.12",
     "playwright": "*",
-    "zod": "^3.22.4"
+    "prettier": "^3.1.0",
+    "swagger-ui-express": "^5.0.0"
   },
   "devDependencies": {
     "@apify/tsconfig": "^0.1.0",
+    "@types/cors": "^2.8.17",
+    "@types/express": "^4.17.21",
+    "@types/express-fileupload": "^1.4.4",
     "@semantic-release/changelog": "^6.0.3",
     "@semantic-release/git": "^10.0.1",
     "@types/inquirer": "^9.0.7",
     "@types/node": "^20.0.0",
     "prettier": "^3.1.0",
     "semantic-release": "^22.0.8",
     "ts-node": "^10.8.0",
-    "typescript": "^5.0.0"
+    "typescript": "^5.0.0",
+    "@types/swagger-ui-express": "^4.1.6",
+    "swagger-autogen": "^2.23.7",
+    "zod": "^3.22.4"
   },
   "scripts": {
     "semantic-release": "semantic-release",
     "preinstall": "npx playwright install",
     "start": "npm run start:dev",
+    "start:server": "NODE_ENV=development npm run build && node dist/src/server.js",
+    "start:server:prod": "npm run build && node dist/src/server.js",
     "start:cli": "cross-env NODE_ENV=development npm run build && node dist/src/cli.js",
     "start:dev": "cross-env NODE_ENV=development npm run build && node dist/src/main.js",
     "start:prod": "node dist/src/main.js",
```

src/config.ts

Lines changed: 3 additions & 1 deletion
```diff
@@ -1,6 +1,8 @@
 import { z } from "zod";
-
 import type { Page } from "playwright";
+import { configDotenv } from "dotenv";
+
+configDotenv();
 
 const Page: z.ZodType<Page> = z.any();
 
```
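Because `configDotenv()` runs as a module-load side effect here, simply importing `./config.js` populates `process.env` from `.env` before any schema validation happens. `src/server.ts` also calls `configDotenv()` directly; that is harmless, since dotenv by default does not override variables that are already set.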

src/core.ts

Lines changed: 39 additions & 3 deletions
```diff
@@ -5,8 +5,10 @@ import { glob } from "glob";
 import { Config, configSchema } from "./config.js";
 import { Page } from "playwright";
 import { isWithinTokenLimit } from "gpt-tokenizer";
+import { PathLike } from "fs";
 
 let pageCounter = 0;
+let crawler: PlaywrightCrawler;
 
 export function getPageHtml(page: Page, selector = "body") {
   return page.evaluate((selector) => {
@@ -52,7 +54,7 @@ export async function crawl(config: Config) {
   if (process.env.NO_CRAWL !== "true") {
     // PlaywrightCrawler crawls the web using a headless
     // browser controlled by the Playwright library.
-    const crawler = new PlaywrightCrawler({
+    crawler = new PlaywrightCrawler({
       // Use the requestHandler to process each of the crawled pages.
       async requestHandler({ request, page, enqueueLinks, log, pushData }) {
         const title = await page.title();
@@ -145,6 +147,7 @@ export async function crawl(config: Config) {
 }
 
 export async function write(config: Config) {
+  let nextFileNameString: PathLike = "";
   const jsonFiles = await glob("storage/datasets/default/*.json", {
     absolute: true,
   });
@@ -165,8 +168,14 @@ export async function write(config: Config) {
     `${config.outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`;
 
   const writeBatchToFile = async (): Promise<void> => {
-    await writeFile(nextFileName(), JSON.stringify(currentResults, null, 2));
-    console.log(`Wrote ${currentResults.length} items to ${nextFileName()}`);
+    nextFileNameString = nextFileName();
+    await writeFile(
+      nextFileNameString,
+      JSON.stringify(currentResults, null, 2),
+    );
+    console.log(
+      `Wrote ${currentResults.length} items to ${nextFileNameString}`,
+    );
     currentResults = [];
     currentSize = 0;
     fileCounter++;
@@ -215,4 +224,31 @@ export async function write(config: Config) {
   if (currentResults.length > 0) {
     await writeBatchToFile();
   }
+
+  return nextFileNameString;
 }
+
+class GPTCrawlerCore {
+  config: Config;
+
+  constructor(config: Config) {
+    this.config = config;
+  }
+
+  async crawl() {
+    await crawl(this.config);
+  }
+
+  async write(): Promise<PathLike> {
+    // we need to wait for the file path as the path can change
+    return new Promise((resolve, reject) => {
+      write(this.config)
+        .then((outputFilePath) => {
+          resolve(outputFilePath);
+        })
+        .catch(reject);
+    });
+  }
+}
+
+export default GPTCrawlerCore;
```
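The new `GPTCrawlerCore` class is a thin wrapper over the existing `crawl`/`write` functions, used by `src/server.ts` below. A minimal usage sketch; the config fields other than `outputFileName` are illustrative assumptions and must satisfy whatever `configSchema` (not shown in this diff) actually requires:

```typescript
import GPTCrawlerCore from "./core.js";
import { configSchema } from "./config.js";

// Illustrative config: url/match are assumed field names; only
// outputFileName is visible in this commit's diff.
const config = configSchema.parse({
  url: "https://example.com/docs",
  match: "https://example.com/docs/**",
  outputFileName: "output.json",
});

const core = new GPTCrawlerCore(config);
await core.crawl(); // runs the Playwright crawl
const outputPath = await core.write(); // PathLike of the last file written
console.log(`Results written to ${outputPath}`);
```

Note that `write()` resolves with `nextFileNameString`, which holds only the most recently written batch file; if the results are split across several files, the earlier file names are not returned to the caller.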

src/server.ts

Lines changed: 44 additions & 0 deletions
```diff
@@ -0,0 +1,44 @@
+import express from "express";
+import cors from "cors";
+import { readFile } from "fs/promises";
+import { Config, configSchema } from "./config.js";
+import { configDotenv } from "dotenv";
+import swaggerUi from "swagger-ui-express";
+// @ts-ignore
+import swaggerDocument from "../swagger-output.json" assert { type: "json" };
+import GPTCrawlerCore from "./core.js";
+import { PathLike } from "fs";
+
+configDotenv();
+
+const app = express();
+const port = Number(process.env.API_PORT) || 3000;
+const hostname = process.env.API_HOST || "localhost";
+
+app.use(cors());
+app.use(express.json());
+app.use("/api-docs", swaggerUi.serve, swaggerUi.setup(swaggerDocument));
+
+// Define a POST route to accept config and run the crawler
+app.post("/crawl", async (req, res) => {
+  const config: Config = req.body;
+  try {
+    const validatedConfig = configSchema.parse(config);
+    const crawler = new GPTCrawlerCore(validatedConfig);
+    await crawler.crawl();
+    const outputFileName: PathLike = await crawler.write();
+    const outputFileContent = await readFile(outputFileName, "utf-8");
+    res.contentType("application/json");
+    return res.send(outputFileContent);
+  } catch (error) {
+    return res
+      .status(500)
+      .json({ message: "Error occurred during crawling", error });
+  }
+});
+
+app.listen(port, hostname, () => {
+  console.log(`API server listening at http://${hostname}:${port}`);
+});
+
+export default app;
```
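A minimal client sketch against the new route, assuming the server is running with the `.env.example` values; the config fields in the body are illustrative and must match `configSchema`:

```typescript
// Node 18+ provides a global fetch; no extra dependency needed.
const res = await fetch("http://localhost:5000/crawl", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    url: "https://example.com/docs", // assumed field names
    match: "https://example.com/docs/**",
    outputFileName: "output.json",
  }),
});

if (!res.ok) {
  // The route returns a 500 with { message, error } on any failure,
  // including config validation errors.
  console.error(await res.json());
} else {
  const pages = await res.json(); // contents of the crawler's output file
  console.log(pages);
}
```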

swagger.js

Lines changed: 14 additions & 0 deletions
```diff
@@ -0,0 +1,14 @@
+import swaggerAutogen from "swagger-autogen";
+
+const doc = {
+  info: {
+    title: "GPT Crawler API",
+    description: "GPT Crawler",
+  },
+  host: "localhost:5000",
+};
+
+const outputFile = "swagger-output.json";
+const routes = ["./src/server.ts"];
+
+swaggerAutogen()(outputFile, routes, doc);
```
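Running this generator (e.g. `node swagger.js`; no npm script for it appears in this diff) writes `swagger-output.json`, which `src/server.ts` imports via a JSON import assertion and serves at `/api-docs` through `swagger-ui-express`. Note that the hardcoded `host: "localhost:5000"` matches `.env.example` but will not track a changed `API_PORT`.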

tsconfig.json

Lines changed: 1 addition & 0 deletions
```diff
@@ -4,6 +4,7 @@
     "module": "ES2022",
     "target": "ES2022",
     "outDir": "dist",
+    "moduleResolution": "node",
     "resolveJsonModule": true,
     "noUnusedLocals": false,
     "skipLibCheck": true,
```
