1- import { existsSync , mkdirSync , rmSync , writeFileSync } from "fs"
1+ import {
2+ existsSync ,
3+ mkdirSync ,
4+ readFileSync ,
5+ readdirSync ,
6+ rmSync ,
7+ writeFileSync ,
8+ } from "fs"
29import { glob } from "glob"
10+ import path from "path"
11+
312import { DBNode } from "../../core/dbNode"
413import { IFileProcessor } from "./processor/file.types"
514import { SourceFile } from "./sourceFile"
615import { ISourceFile } from "./sourceFile.types"
716
817export class Codebase {
918 private _path : string
10- private _dataPath : string
19+ private _dataDirPath : string
20+ private _dataDirName : string
21+ private _embeddingsDirPath : string
22+ private _embeddingsDirName : string
23+
1124 private _batchSize : number
1225 private _fileProcessor : IFileProcessor
1326
@@ -19,7 +32,11 @@ export class Codebase {
1932 if ( path . endsWith ( "/" ) ) path = path . replace ( / \/ + $ / , "" )
2033
2134 this . _path = path
22- this . _dataPath = ""
35+ this . _dataDirName = ""
36+ this . _dataDirPath = ""
37+ this . _embeddingsDirName = ""
38+ this . _embeddingsDirPath = ""
39+
2340 this . _batchSize = batchSize
2441 this . _fileProcessor = fileProcessor
2542
@@ -28,32 +45,36 @@ export class Codebase {
2845 this . makeFilesBatches ( )
2946 }
3047
31- private makePath ( path : string ) : string {
32- return `${ this . _path } /${ path } `
33- }
34-
/**
 * Sets up the on-disk working directories for this codebase.
 *
 * Computes a timestamped data directory name (`data-<Date.now()>`) and an
 * `embeddings` sub-directory inside it, resolves both against the codebase
 * path, stores the names/paths on the instance and makes sure both
 * directories exist.
 *
 * @param removeExisting When true (default), an already existing data
 *   directory is deleted recursively before it is recreated. When false, an
 *   existing directory is kept and reused.
 */
private initializeDataDirectory(removeExisting = true): void {
  this._dataDirName = `data-${Date.now()}`
  this._dataDirPath = path.resolve(this._path, this._dataDirName)

  this._embeddingsDirName = `${this._dataDirName}/embeddings`
  this._embeddingsDirPath = path.resolve(this._path, this._embeddingsDirName)

  /* Handle data directory */
  if (removeExisting && existsSync(this._dataDirPath))
    rmSync(this._dataDirPath, { recursive: true })
  // Only create what is missing: an unconditional mkdirSync throws EEXIST
  // when removeExisting is false and the directory is already there.
  if (!existsSync(this._dataDirPath)) mkdirSync(this._dataDirPath)

  /* Handle embeddings directory */
  if (!existsSync(this._embeddingsDirPath)) mkdirSync(this._embeddingsDirPath)
}
4263
/**
 * Collects every ts/tsx/js/jsx file found under the codebase path.
 *
 * For each extension a recursive glob is run below `this._path`; every
 * matched path is wrapped in a `SourceFile` and appended to `this._files`.
 * Progress is reported on the console before and after the scan.
 */
private prepareFilesMetadata() {
  const extensions = ["ts", "tsx", "js", "jsx"]
  const extensionList = extensions.join(", *.")

  console.log(`🕒 Preparing metadata for files: *.${extensionList}`)

  // One glob pass per extension, in the order listed above.
  for (const extension of extensions) {
    const matchedPaths = glob.sync(`${this._path}/**/*.${extension}`)
    const sourceFiles = matchedPaths.map(
      (matchedPath) => new SourceFile(matchedPath)
    )
    this._files.push(...sourceFiles)
  }

  console.log(`✅ Prepared metadata for ${this._files.length} files\n`)
}
5879
5980 private makeFilesBatches ( ) {
@@ -74,7 +95,7 @@ export class Codebase {
7495 if ( entries . length === 0 ) return 0
7596 const batch = Object . fromEntries ( entries )
7697 writeFileSync (
77- this . makePath ( ` ${ this . _dataPath } / ${ fileName } ` ) ,
98+ path . resolve ( this . _dataDirPath , fileName ) ,
7899 JSON . stringify ( batch , null , 2 )
79100 )
80101
@@ -88,7 +109,7 @@ export class Codebase {
88109 ) : Promise < number > {
89110 let nNodesProcessed = 0
90111
91- // console.log(`🕒 Processing ${start}-${end} files`)
112+ console . log ( `🕒 Processing ${ start } -${ end } files` )
92113 {
93114 let nodes : Record < string , DBNode > = { }
94115
@@ -114,9 +135,9 @@ export class Codebase {
114135
115136 nNodesProcessed = Object . keys ( nodes ) . length
116137 }
117- // console.log(
118- // `✅ Processed ${start}-${end} files (${nNodesProcessed} nodes)\n`
119- // )
138+ console . log (
139+ `✅ Processed ${ start } -${ end } files (${ nNodesProcessed } nodes)\n`
140+ )
120141
121142 return nNodesProcessed
122143 }
@@ -133,14 +154,66 @@ export class Codebase {
133154 * @returns Promise<void>
134155 */
async process(): Promise<void> {
  console.log("🕒 Preparing Nodes\n")

  // Batches run strictly one after another: each batch is awaited before
  // the next one starts, and the per-batch node counts are summed up.
  let totalNodes = 0
  for (const [batchIndex, [start, end]] of this._batches.entries()) {
    const processedInBatch = await this.processFilesBatch(
      batchIndex,
      start,
      end
    )
    totalNodes += processedInBatch
  }

  console.log(`✅ Prepared ${totalNodes} nodes`)
}
167+
/**
 * Fills embeddings for all nodes stored in the data directory.
 *
 * The embeddings directory is wiped and recreated, then every `*.json` file
 * directly inside the data directory is read in groups. The nodes of one
 * group are merged into a single map, each node is passed through
 * `DBNode.fillEmbeddings`, and the resulting map is written to
 * `batch-<n>.json` inside the embeddings directory. Between groups the
 * method sleeps for 60 seconds to stay under the per-minute embedding limit.
 *
 * @returns A promise that resolves once every group has been written.
 */
async embed(): Promise<void> {
  console.log("🕒 Preparing Embeddings")

  // Always start from an empty embeddings directory.
  if (existsSync(this._embeddingsDirPath))
    rmSync(this._embeddingsDirPath, { recursive: true })
  mkdirSync(this._embeddingsDirPath)

  // Turn bare names like "batch-1.json" into paths inside the data directory.
  const files = readdirSync(this._dataDirPath)
    .filter((x) => x.endsWith(".json"))
    .map((x) => path.resolve(this._dataDirPath, x))

  const embeddingsPerNode = 2
  const maxAllowedEmbeddingsPerMinute = 2800 // openai limitation for embeddings
  // Clamp to at least one file per group: for a batch size above 1400 the
  // floor below is 0, and a zero step would keep the loop from ever advancing.
  const nFilesPerBatch = Math.max(
    1,
    Math.floor(
      maxAllowedEmbeddingsPerMinute / this._batchSize / embeddingsPerNode
    )
  )

  let batch = 0
  for (let i = 0; i < files.length; i += nFilesPerBatch) {
    const start = i
    const end = Math.min(i + nFilesPerBatch, files.length)

    console.log(`\n🕒 Embedding ${start}-${end} files`)

    // Merge the node maps of every file in this group into one object.
    const nodes: Record<string, DBNode> = {}
    for (const file of files.slice(start, end)) {
      // NOTE(review): the file content is trusted to be a map of node id to
      // node data as written by the processing step; it is not validated here.
      const data: Record<string, DBNode> = JSON.parse(
        readFileSync(file, "utf-8")
      )
      Object.assign(nodes, data)
    }

    // All nodes of the group are embedded concurrently.
    const jobs = Object.values(nodes).map(async (x) => {
      nodes[x.id] = await DBNode.fillEmbeddings(new DBNode(x))
    })
    await Promise.all(jobs)

    writeFileSync(
      path.resolve(this._embeddingsDirPath, `batch-${batch}.json`),
      JSON.stringify(nodes, null, 2)
    )
    batch += 1

    console.log(`✅ Embedded ${start}-${end} files\n`)

    // Pause only when another group is still to come.
    if (i + nFilesPerBatch < files.length) {
      console.log(`🕒 Waiting for 60 seconds`)
      await new Promise((resolve) => setTimeout(resolve, 60 * 1000))
    }
  }

  console.log(`✅ Prepared embeddings for nodes`)
}
146219}
0 commit comments