Skip to content

Commit 5f8975d

Browse files
committed
Added embedding of nodes with better handling of paths in Codebase
1 parent ec6111c commit 5f8975d

File tree

3 files changed

+96
-23
lines changed

3 files changed

+96
-23
lines changed

rocket-chatter-ingestion-server/src/core/llm.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ import { OPENAI_KEY } from "../constants"
33

44
const openai = new OpenAI({ apiKey: OPENAI_KEY })
55

6-
76
export namespace LLM {
87
export async function generateEmbeddings(data: string): Promise<number[]> {
98
try {

rocket-chatter-ingestion-server/src/process/prepare/codebase.ts

Lines changed: 94 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,26 @@
1-
import { existsSync, mkdirSync, rmSync, writeFileSync } from "fs"
1+
import {
2+
existsSync,
3+
mkdirSync,
4+
readFileSync,
5+
readdirSync,
6+
rmSync,
7+
writeFileSync,
8+
} from "fs"
29
import { glob } from "glob"
10+
import path from "path"
11+
312
import { DBNode } from "../../core/dbNode"
413
import { IFileProcessor } from "./processor/file.types"
514
import { SourceFile } from "./sourceFile"
615
import { ISourceFile } from "./sourceFile.types"
716

817
export class Codebase {
918
private _path: string
10-
private _dataPath: string
19+
private _dataDirPath: string
20+
private _dataDirName: string
21+
private _embeddingsDirPath: string
22+
private _embeddingsDirName: string
23+
1124
private _batchSize: number
1225
private _fileProcessor: IFileProcessor
1326

@@ -19,7 +32,11 @@ export class Codebase {
1932
if (path.endsWith("/")) path = path.replace(/\/+$/, "")
2033

2134
this._path = path
22-
this._dataPath = ""
35+
this._dataDirName = ""
36+
this._dataDirPath = ""
37+
this._embeddingsDirName = ""
38+
this._embeddingsDirPath = ""
39+
2340
this._batchSize = batchSize
2441
this._fileProcessor = fileProcessor
2542

@@ -28,32 +45,36 @@ export class Codebase {
2845
this.makeFilesBatches()
2946
}
3047

31-
private makePath(path: string): string {
32-
return `${this._path}/${path}`
33-
}
34-
3548
private initializeDataDirectory(removeExisting = true): void {
36-
this._dataPath = "data"
49+
this._dataDirName = `data-${Date.now()}`
50+
this._dataDirPath = path.resolve(this._path, this._dataDirName)
51+
52+
this._embeddingsDirName = `${this._dataDirName}/embeddings`
53+
this._embeddingsDirPath = path.resolve(this._path, this._embeddingsDirName)
54+
55+
/* Handle data directory */
56+
if (removeExisting && existsSync(this._dataDirPath))
57+
rmSync(this._dataDirPath, { recursive: true })
58+
mkdirSync(this._dataDirPath)
3759

38-
const path = this.makePath(this._dataPath)
39-
if (removeExisting && existsSync(path)) rmSync(path, { recursive: true })
40-
mkdirSync(path)
60+
/* Handle embeddings directory */
61+
mkdirSync(this._embeddingsDirPath)
4162
}
4263

4364
private prepareFilesMetadata() {
4465
const extensions = ["ts", "tsx", "js", "jsx"]
4566

46-
// console.log(`🕒 Preparing metadata for files: *.${extensions.join(", *.")}`)
67+
console.log(`🕒 Preparing metadata for files: *.${extensions.join(", *.")}`)
4768
{
4869
const globPatterns = extensions.map((x) => `**/*.${x}`)
4970
for (const pattern of globPatterns) {
5071
const files = glob
51-
.sync(this.makePath(pattern))
72+
.sync(`${this._path}/${pattern}`)
5273
.map((x) => new SourceFile(x))
5374
this._files.push(...files)
5475
}
5576
}
56-
// console.log(`✅ Prepared metadata for ${this._files.length} files\n`)
77+
console.log(`✅ Prepared metadata for ${this._files.length} files\n`)
5778
}
5879

5980
private makeFilesBatches() {
@@ -74,7 +95,7 @@ export class Codebase {
7495
if (entries.length === 0) return 0
7596
const batch = Object.fromEntries(entries)
7697
writeFileSync(
77-
this.makePath(`${this._dataPath}/${fileName}`),
98+
path.resolve(this._dataDirPath, fileName),
7899
JSON.stringify(batch, null, 2)
79100
)
80101

@@ -88,7 +109,7 @@ export class Codebase {
88109
): Promise<number> {
89110
let nNodesProcessed = 0
90111

91-
// console.log(`🕒 Processing ${start}-${end} files`)
112+
console.log(`🕒 Processing ${start}-${end} files`)
92113
{
93114
let nodes: Record<string, DBNode> = {}
94115

@@ -114,9 +135,9 @@ export class Codebase {
114135

115136
nNodesProcessed = Object.keys(nodes).length
116137
}
117-
// console.log(
118-
// `✅ Processed ${start}-${end} files (${nNodesProcessed} nodes)\n`
119-
// )
138+
console.log(
139+
`✅ Processed ${start}-${end} files (${nNodesProcessed} nodes)\n`
140+
)
120141

121142
return nNodesProcessed
122143
}
@@ -133,14 +154,66 @@ export class Codebase {
133154
* @returns Promise<void>
134155
*/
135156
async process(): Promise<void> {
136-
// console.log("🕒 Preparing Nodes\n")
157+
console.log("🕒 Preparing Nodes\n")
137158

138159
let nodesProcessed = 0
139160
for (const [index, batch] of this._batches.entries()) {
140161
const [start, end] = batch
141162
nodesProcessed += await this.processFilesBatch(index, start, end)
142163
}
143164

144-
// console.log(`✅ Prepared ${nodesProcessed} nodes`)
165+
console.log(`✅ Prepared ${nodesProcessed} nodes`)
166+
}
167+
168+
async embed(): Promise<void> {
169+
console.log("🕒 Preparing Embeddings")
170+
171+
if (existsSync(this._embeddingsDirPath))
172+
rmSync(this._embeddingsDirPath, { recursive: true })
173+
mkdirSync(this._embeddingsDirPath)
174+
175+
const files = readdirSync(this._dataDirPath)
176+
.filter((x) => x.endsWith(".json"))
177+
.map((x) => path.resolve(this._dataDirPath, x)) // convert a file name like "batch-1.json" to its absolute path inside the data directory
178+
179+
const embeddingsPerNode = 2
180+
const maxAllowedEmbeddingsPerMinute = 2800 // openai limitation for embeddings
181+
const nFilesPerBatch = Math.floor(
182+
maxAllowedEmbeddingsPerMinute / this._batchSize / embeddingsPerNode
183+
)
184+
185+
let batch = 0
186+
for (let i = 0; i < files.length; i += nFilesPerBatch) {
187+
const start = i
188+
const end = Math.min(i + nFilesPerBatch, files.length)
189+
190+
console.log(`\n🕒 Embedding ${start}-${end} files`)
191+
192+
let nodes: Record<string, DBNode> = {}
193+
for (const file of files.slice(start, end)) {
194+
// to convert file content from a plain string to js object
195+
const data = JSON.parse(readFileSync(file, "utf-8"))
196+
Object.assign(nodes, data)
197+
}
198+
199+
const jobs = Object.values(nodes).map(async (x) => {
200+
nodes[x.id] = await DBNode.fillEmbeddings(new DBNode(x))
201+
})
202+
await Promise.all(jobs)
203+
204+
writeFileSync(
205+
`${this._embeddingsDirPath}/batch-${batch++}.json`,
206+
JSON.stringify(nodes, null, 2)
207+
)
208+
209+
console.log(`✅ Embedded ${start}-${end} files\n`)
210+
211+
if (i + nFilesPerBatch < files.length) {
212+
console.log(`🕒 Waiting for 60 seconds`)
213+
await new Promise((resolve) => setTimeout(resolve, 60 * 1000))
214+
}
215+
}
216+
217+
console.log(`✅ Prepared embeddings for nodes`)
145218
}
146219
}

rocket-chatter-ingestion-server/src/use.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ import { FileProcessor } from "./process/prepare/processor/file"
33

44
async function main() {
55
const codebase = new Codebase("./project", new FileProcessor())
6-
codebase.process()
6+
await codebase.process()
7+
await codebase.embed()
78
}
89

910
main()

0 commit comments

Comments
 (0)