Skip to content

Commit 7707146

Browse files
committed
fix: refactor
1 parent 8e15bb3 commit 7707146

File tree

3 files changed

+46
-10
lines changed

3 files changed

+46
-10
lines changed

src/core.ts

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@ import { glob } from "glob";
55
import { Config, configSchema } from "./config.js";
66
import { Page } from "playwright";
77
import { isWithinTokenLimit } from "gpt-tokenizer";
8+
import { PathLike } from "fs";
89

910
let pageCounter = 0;
11+
let crawler: PlaywrightCrawler;
1012

1113
export function getPageHtml(page: Page, selector = "body") {
1214
return page.evaluate((selector) => {
@@ -52,7 +54,7 @@ export async function crawl(config: Config) {
5254
if (process.env.NO_CRAWL !== "true") {
5355
// PlaywrightCrawler crawls the web using a headless
5456
// browser controlled by the Playwright library.
55-
const crawler = new PlaywrightCrawler({
57+
crawler = new PlaywrightCrawler({
5658
// Use the requestHandler to process each of the crawled pages.
5759
async requestHandler({ request, page, enqueueLinks, log, pushData }) {
5860
if (config.cookie) {
@@ -143,6 +145,7 @@ export async function crawl(config: Config) {
143145
}
144146

145147
export async function write(config: Config) {
148+
let nextFileNameString: PathLike = "";
146149
const jsonFiles = await glob("storage/datasets/default/*.json", {
147150
absolute: true,
148151
});
@@ -163,8 +166,14 @@ export async function write(config: Config) {
163166
`${config.outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`;
164167

165168
const writeBatchToFile = async (): Promise<void> => {
166-
await writeFile(nextFileName(), JSON.stringify(currentResults, null, 2));
167-
console.log(`Wrote ${currentResults.length} items to ${nextFileName()}`);
169+
nextFileNameString = nextFileName();
170+
await writeFile(
171+
nextFileNameString,
172+
JSON.stringify(currentResults, null, 2),
173+
);
174+
console.log(
175+
`Wrote ${currentResults.length} items to ${nextFileNameString}`,
176+
);
168177
currentResults = [];
169178
currentSize = 0;
170179
fileCounter++;
@@ -213,4 +222,31 @@ export async function write(config: Config) {
213222
if (currentResults.length > 0) {
214223
await writeBatchToFile();
215224
}
225+
226+
return nextFileNameString;
216227
}
228+
229+
/**
 * Class-based wrapper around the module-level `crawl` and `write` functions,
 * bound to a single configuration object.
 */
class GPTCrawlerCore {
  /** Configuration handed unchanged to `crawl` and `write`. */
  config: Config;

  /**
   * @param config - Configuration used by every call made through this instance.
   */
  constructor(config: Config) {
    this.config = config;
  }

  /**
   * Runs the module-level `crawl` with this instance's configuration.
   *
   * @returns A promise that settles once `crawl` has finished.
   */
  async crawl(): Promise<void> {
    await crawl(this.config);
  }

  /**
   * Runs the module-level `write` with this instance's configuration.
   *
   * Callers must use the resolved path rather than assuming a fixed output
   * name: `write` returns the name of the last batch file it wrote, and that
   * name is derived from a running file counter.
   *
   * @returns The path reported by `write`.
   */
  async write(): Promise<PathLike> {
    // `write` already returns a promise; returning it directly propagates
    // both the resolved path and any rejection without a manual wrapper.
    return write(this.config);
  }
}
251+
252+
export default GPTCrawlerCore;

src/server.ts

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
import express from "express";
22
import cors from "cors";
33
import { readFile } from "fs/promises";
4-
import { crawl, write } from "./core.js";
54
import { Config, configSchema } from "./config.js";
65
import { configDotenv } from "dotenv";
76
import swaggerUi from "swagger-ui-express";
87
// @ts-ignore
98
import swaggerDocument from "../swagger-output.json" assert { type: "json" };
9+
import GPTCrawlerCore from "./core.js";
10+
import { PathLike } from "fs";
1011

1112
configDotenv();
1213

@@ -23,12 +24,10 @@ app.post("/crawl", async (req, res) => {
2324
const config: Config = req.body;
2425
try {
2526
const validatedConfig = configSchema.parse(config);
26-
await crawl(validatedConfig);
27-
await write(validatedConfig);
28-
const outputFileContent = await readFile(
29-
validatedConfig.outputFileName,
30-
"utf-8",
31-
);
27+
const crawler = new GPTCrawlerCore(validatedConfig);
28+
await crawler.crawl();
29+
const outputFileName: PathLike = await crawler.write();
30+
const outputFileContent = await readFile(outputFileName, "utf-8");
3231
res.contentType("application/json");
3332
return res.send(outputFileContent);
3433
} catch (error) {

tsconfig.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"module": "ES2022",
55
"target": "ES2022",
66
"outDir": "dist",
7+
"moduleResolution": "node",
78
"resolveJsonModule": true,
89
"noUnusedLocals": false,
910
"skipLibCheck": true,

0 commit comments

Comments
 (0)