@@ -5,8 +5,10 @@ import { glob } from "glob";
5
5
import { Config , configSchema } from "./config.js" ;
6
6
import { Page } from "playwright" ;
7
7
import { isWithinTokenLimit } from "gpt-tokenizer" ;
8
+ import { PathLike } from "fs" ;
8
9
9
10
let pageCounter = 0 ;
11
+ let crawler : PlaywrightCrawler ;
10
12
11
13
export function getPageHtml ( page : Page , selector = "body" ) {
12
14
return page . evaluate ( ( selector ) => {
@@ -52,7 +54,7 @@ export async function crawl(config: Config) {
52
54
if ( process . env . NO_CRAWL !== "true" ) {
53
55
// PlaywrightCrawler crawls the web using a headless
54
56
// browser controlled by the Playwright library.
55
- const crawler = new PlaywrightCrawler ( {
57
+ crawler = new PlaywrightCrawler ( {
56
58
// Use the requestHandler to process each of the crawled pages.
57
59
async requestHandler ( { request, page, enqueueLinks, log, pushData } ) {
58
60
const title = await page . title ( ) ;
@@ -145,6 +147,7 @@ export async function crawl(config: Config) {
145
147
}
146
148
147
149
export async function write ( config : Config ) {
150
+ let nextFileNameString : PathLike = "" ;
148
151
const jsonFiles = await glob ( "storage/datasets/default/*.json" , {
149
152
absolute : true ,
150
153
} ) ;
@@ -165,8 +168,14 @@ export async function write(config: Config) {
165
168
`${ config . outputFileName . replace ( / \. j s o n $ / , "" ) } -${ fileCounter } .json` ;
166
169
167
170
const writeBatchToFile = async ( ) : Promise < void > => {
168
- await writeFile ( nextFileName ( ) , JSON . stringify ( currentResults , null , 2 ) ) ;
169
- console . log ( `Wrote ${ currentResults . length } items to ${ nextFileName ( ) } ` ) ;
171
+ nextFileNameString = nextFileName ( ) ;
172
+ await writeFile (
173
+ nextFileNameString ,
174
+ JSON . stringify ( currentResults , null , 2 ) ,
175
+ ) ;
176
+ console . log (
177
+ `Wrote ${ currentResults . length } items to ${ nextFileNameString } ` ,
178
+ ) ;
170
179
currentResults = [ ] ;
171
180
currentSize = 0 ;
172
181
fileCounter ++ ;
@@ -215,4 +224,31 @@ export async function write(config: Config) {
215
224
if ( currentResults . length > 0 ) {
216
225
await writeBatchToFile ( ) ;
217
226
}
227
+
228
+ return nextFileNameString ;
218
229
}
230
+
231
/**
 * Object-style entry point that binds a single {@link Config} to the
 * module-level `crawl` and `write` functions, so callers can run both steps
 * without passing the configuration around themselves.
 */
class GPTCrawlerCore {
  /** Configuration handed unchanged to both `crawl` and `write`. */
  config: Config;

  /**
   * @param config - Crawler configuration to use for every operation on this
   *   instance.
   */
  constructor(config: Config) {
    this.config = config;
  }

  /**
   * Runs the module-level `crawl` with this instance's configuration.
   *
   * @returns A promise that settles once `crawl` has settled; any rejection
   *   from `crawl` propagates to the caller.
   */
  async crawl(): Promise<void> {
    await crawl(this.config);
  }

  /**
   * Runs the module-level `write` with this instance's configuration.
   *
   * The output path is only known once writing has finished (the file name
   * can change while batches are written), so callers must await the result
   * rather than derive the path from the configuration.
   *
   * @returns The path value that `write` resolves with.
   */
  async write(): Promise<PathLike> {
    // `write` already returns a promise; returning it from an async method
    // forwards both fulfilment and rejection, so no manual `new Promise`
    // wrapper is needed.
    return write(this.config);
  }
}
253
+
254
+ export default GPTCrawlerCore ;
0 commit comments