@@ -5,8 +5,10 @@ import { glob } from "glob";
5
5
import { Config , configSchema } from "./config.js" ;
6
6
import { Page } from "playwright" ;
7
7
import { isWithinTokenLimit } from "gpt-tokenizer" ;
8
+ import { PathLike } from "fs" ;
8
9
9
10
let pageCounter = 0 ;
11
+ let crawler : PlaywrightCrawler ;
10
12
11
13
export function getPageHtml ( page : Page , selector = "body" ) {
12
14
return page . evaluate ( ( selector ) => {
@@ -52,7 +54,7 @@ export async function crawl(config: Config) {
52
54
if ( process . env . NO_CRAWL !== "true" ) {
53
55
// PlaywrightCrawler crawls the web using a headless
54
56
// browser controlled by the Playwright library.
55
- const crawler = new PlaywrightCrawler ( {
57
+ crawler = new PlaywrightCrawler ( {
56
58
// Use the requestHandler to process each of the crawled pages.
57
59
async requestHandler ( { request, page, enqueueLinks, log, pushData } ) {
58
60
if ( config . cookie ) {
@@ -143,6 +145,7 @@ export async function crawl(config: Config) {
143
145
}
144
146
145
147
export async function write ( config : Config ) {
148
+ let nextFileNameString : PathLike = "" ;
146
149
const jsonFiles = await glob ( "storage/datasets/default/*.json" , {
147
150
absolute : true ,
148
151
} ) ;
@@ -163,8 +166,14 @@ export async function write(config: Config) {
163
166
`${ config . outputFileName . replace ( / \. j s o n $ / , "" ) } -${ fileCounter } .json` ;
164
167
165
168
const writeBatchToFile = async ( ) : Promise < void > => {
166
- await writeFile ( nextFileName ( ) , JSON . stringify ( currentResults , null , 2 ) ) ;
167
- console . log ( `Wrote ${ currentResults . length } items to ${ nextFileName ( ) } ` ) ;
169
+ nextFileNameString = nextFileName ( ) ;
170
+ await writeFile (
171
+ nextFileNameString ,
172
+ JSON . stringify ( currentResults , null , 2 ) ,
173
+ ) ;
174
+ console . log (
175
+ `Wrote ${ currentResults . length } items to ${ nextFileNameString } ` ,
176
+ ) ;
168
177
currentResults = [ ] ;
169
178
currentSize = 0 ;
170
179
fileCounter ++ ;
@@ -213,4 +222,31 @@ export async function write(config: Config) {
213
222
if ( currentResults . length > 0 ) {
214
223
await writeBatchToFile ( ) ;
215
224
}
225
+
226
+ return nextFileNameString ;
216
227
}
228
+
229
/**
 * Thin object-oriented facade over the module-level `crawl` and `write`
 * functions, binding both of them to a single {@link Config}.
 */
class GPTCrawlerCore {
  /** Configuration handed unchanged to `crawl` and `write`. */
  config: Config;

  /**
   * @param config - Crawler configuration used by every method of this instance.
   */
  constructor(config: Config) {
    this.config = config;
  }

  /**
   * Runs the module-level `crawl` with this instance's configuration and
   * waits for it to finish.
   *
   * @returns A promise that settles when `crawl` settles; rejections propagate.
   */
  async crawl(): Promise<void> {
    await crawl(this.config);
  }

  /**
   * Runs the module-level `write` with this instance's configuration.
   *
   * The output path is only known once `write` has finished (the file name
   * can change while batches are written), so callers must await the result.
   *
   * @returns A promise for the path value returned by `write`; rejections
   *   from `write` propagate to the caller.
   */
  async write(): Promise<PathLike> {
    // `write` already returns a promise; returning it directly forwards both
    // the resolved path and any rejection, so no `new Promise` wrapper is needed.
    return write(this.config);
  }
}
251
+
252
+ export default GPTCrawlerCore ;
0 commit comments