Skip to content

Commit 95c7aa3

Browse files
committed
Added a basic implementation for recursively fetching all of Rocket.Chat's documentation from https://developer.rocket.chat/docs
1 parent a88f059 commit 95c7aa3

File tree

9 files changed

+2025
-596
lines changed

9 files changed

+2025
-596
lines changed

ingestion/bun.lockb

57.5 KB
Binary file not shown.

ingestion/package-lock.json

Lines changed: 1782 additions & 583 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ingestion/package.json

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
{
22
"main": "src/main.ts",
33
"scripts": {
4-
"start": "ts-node ./src/main.ts",
5-
"dev": "ts-node ./src/test.ts"
4+
"start": "tsx ./src/main.ts",
5+
"dev": "clear && tsx ./src/test.ts"
66
},
77
"devDependencies": {
88
"@babel/parser": "^7.24.7",
@@ -14,6 +14,8 @@
1414
"@types/uuid": "^10.0.0",
1515
"nodemon": "^3.1.0",
1616
"ts-node": "^10.9.2",
17+
"tsx": "^4.19.0",
18+
"@types/jsdom": "^21.1.7",
1719
"typescript": "^5.4.2"
1820
},
1921
"dependencies": {
@@ -23,9 +25,11 @@
2325
"dotenv": "^16.4.5",
2426
"express": "^4.19.1",
2527
"glob": "^10.4.2",
28+
"jsdom": "^25.0.0",
2629
"nanoid": "^5.0.7",
2730
"neo4j-driver": "4.4.0",
2831
"openai": "^4.29.2",
32+
"puppeteer": "^23.3.0",
2933
"recast": "^0.23.9",
3034
"uuid": "^10.0.0"
3135
}

ingestion/src/constants.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { configDotenv } from "dotenv"
22
configDotenv()
33

4+
export const DOCUMENTATION_URL = "https://developer.rocket.chat/docs"
45
export const REPO_URI = "https://github.com/RocketChat/Rocket.Chat"
56
export const RC_APP_URI = process.env["RC_APP_URI"] ?? ""
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
/** Kinds of edge that may link one documentation node to another. */
export type DevDocDBNodeRelationType = "CONTAINS"

/** A typed edge pointing from the node that owns it to another node. */
export type DevDocDBNodeRelation = {
	/** `id` of the node this edge points to. */
	target: string
	/** Kind of the edge. */
	relation: DevDocDBNodeRelationType
}

/** One element extracted from a documentation page, in the shape written out by the ingestion step. */
export type DevDocDBNode = {
	/** Identifier of this node; relations refer to nodes by this value. */
	id: string
	/** Outgoing edges of this node. */
	relations: Array<DevDocDBNodeRelation>

	/** Name of the element this node was created from (the page parser stores a lower-cased HTML tag name). */
	element: string

	/** Text carried by the element. */
	content: string
	/** Presumably the embedding vector of `content`; the page parser leaves it empty - TODO confirm where it is filled in. */
	contentEmbeddings: Array<number>
}

ingestion/src/main.ts

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
11
import { exec } from "child_process"
22
import { v4 as uuid } from "uuid"
33

4-
import { REPO_URI } from "./constants"
5-
import { insertDataIntoDB } from "./process/ingest/ingest"
6-
import { Codebase } from "./process/prepare/codebase"
7-
import { FileProcessor } from "./process/prepare/processor/file"
4+
import { Documentation } from "./process/documentation/prepare"
85

96
namespace Algorithms {
107
export async function execCommand(command: string) {
@@ -27,13 +24,16 @@ namespace Algorithms {
2724
/**
 * Entry point of the ingestion script.
 *
 * Only the documentation step runs at the moment: it builds a
 * `Documentation` instance and awaits its `prepare()` call. The codebase
 * pipeline (clone, process, insert, clean up) is kept below as a comment.
 */
async function main() {
	// Still generated although nothing consumes it while the codebase pipeline is disabled.
	const sessionID = uuid()

	/*
	 * Disabled codebase pipeline:
	 *
	 * await Algorithms.execCommand(`git clone ${REPO_URI} ${sessionID}`)
	 * {
	 *    const codebase = new Codebase(sessionID, new FileProcessor(), 1)
	 *    await codebase.process()
	 *    await insertDataIntoDB(codebase.dataDirPath)
	 * }
	 * await Algorithms.execCommand(`rm -rf ${sessionID}`)
	 */

	await new Documentation().prepare()
}
3838

3939
// Handle a rejected run explicitly instead of leaving the promise floating:
// print the error and make the process exit with a failure code.
main().catch((error: unknown) => {
	console.error(error)
	process.exitCode = 1
})
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
import { JSDOM } from "jsdom"
2+
3+
import { customAlphabet } from "nanoid"
4+
import { DevDocDBNode } from "../../core/devDocsDBNode"
5+
import {
6+
DocumentPageElement_t,
7+
IDocumentationPage,
8+
} from "./documentationPage.types"
9+
10+
/**
 * A single documentation page identified by its URL.
 *
 * `fetchNodes()` downloads the page, collects the headings, paragraphs and
 * `pre` blocks found inside the element with id `doc_content_block`, nests
 * them by heading level, and flattens that tree into `DevDocDBNode`s whose
 * parent/child links are expressed as `CONTAINS` relations.
 */
export class DocumentationPage implements IDocumentationPage {
	/** Produces the 10-character hexadecimal ids given to extracted elements (built once, not per element). */
	private static readonly generateId = customAlphabet("1234567890abcdef", 10)

	private readonly _url: string
	/** URL this page is fetched from. */
	get url() {
		return this._url
	}

	/**
	 * @param url Address of the documentation page to fetch.
	 */
	constructor(url: string) {
		this._url = url
	}

	/**
	 * Returns the level (1-6) for a lower-cased heading tag name such as `h2`,
	 * or `null` for every other tag name.
	 */
	private static headingLevel(tagName: string): number | null {
		const match = /^h([1-6])$/.exec(tagName)
		return match ? Number(match[1]) : null
	}

	/**
	 * Builds a tree from the headings, paragraphs and `pre` blocks under `root`.
	 *
	 * Elements are visited in document order. A heading becomes a child of the
	 * nearest preceding heading with a smaller level; a paragraph or `pre`
	 * block becomes a child of the nearest preceding heading. Anything that
	 * has no such heading (including content that appears before the first
	 * heading) is kept as a top-level entry rather than being discarded.
	 *
	 * @param root Element whose descendants are scanned.
	 * @returns The top-level entries of the tree.
	 */
	private parseHtmlToHierarchy(root: Element): DocumentPageElement_t[] {
		const hierarchy: DocumentPageElement_t[] = []
		// Headings that are still "open" at the current position, outermost first.
		const openHeadings: { level: number; node: DocumentPageElement_t }[] = []

		const elements = Array.from(
			root.querySelectorAll("h1, h2, h3, h4, h5, h6, p, pre")
		)

		for (const el of elements) {
			const tagName = el.tagName.toLowerCase()
			const node: DocumentPageElement_t = {
				id: DocumentationPage.generateId(),
				element: tagName,
				content: el.textContent?.trim() ?? "",
				children: [],
			}
			const level = DocumentationPage.headingLevel(tagName)

			// A heading closes every open heading of the same or a deeper level.
			let enclosing = openHeadings[openHeadings.length - 1]
			while (level !== null && enclosing && enclosing.level >= level) {
				openHeadings.pop()
				enclosing = openHeadings[openHeadings.length - 1]
			}

			if (enclosing) {
				enclosing.node.children.push(node)
			} else {
				hierarchy.push(node)
			}

			if (level !== null) {
				openHeadings.push({ level, node })
			}
		}

		return hierarchy
	}

	/**
	 * Flattens the element tree into a list of database nodes.
	 *
	 * Every element yields one node; each parent receives a `CONTAINS`
	 * relation per direct child. Descendants are emitted before their parent.
	 *
	 * @param hierarchy Top-level entries produced by `parseHtmlToHierarchy`.
	 * @returns All nodes of the tree as a flat list.
	 */
	private convertHierarchyToDevDocsDBNodes(
		hierarchy: DocumentPageElement_t[]
	): DevDocDBNode[] {
		const devDocsDBNodes: DevDocDBNode[] = []

		const visit = (node: DocumentPageElement_t): void => {
			const devDocDBNode: DevDocDBNode = {
				id: node.id,
				element: node.element,
				relations: [],

				content: node.content,
				contentEmbeddings: [],
			}

			for (const child of node.children) {
				visit(child)
				devDocDBNode.relations.push({
					target: child.id,
					relation: "CONTAINS",
				})
			}

			devDocsDBNodes.push(devDocDBNode)
		}

		for (const node of hierarchy) {
			visit(node)
		}

		return devDocsDBNodes
	}

	/**
	 * Downloads the page and converts its content block into database nodes.
	 *
	 * @returns The nodes extracted from the page.
	 * @throws Error when the response status is not successful, or when the
	 * page has no element with id `doc_content_block`.
	 */
	async fetchNodes(): Promise<DevDocDBNode[]> {
		const res = await fetch(this.url)
		if (!res.ok) {
			// Do not try to parse an error page as documentation.
			throw new Error(
				`Failed to fetch ${this.url}: ${res.status} ${res.statusText}`
			)
		}

		const dom = new JSDOM(await res.text())
		try {
			const content = dom.window.document.getElementById("doc_content_block")
			if (!content) throw new Error("Content not found")

			const hierarchy = this.parseHtmlToHierarchy(content)
			return this.convertHierarchyToDevDocsDBNodes(hierarchy)
		} finally {
			// Only plain data is returned, so the jsdom window can be shut down
			// as soon as extraction is finished.
			dom.window.close()
		}
	}
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import { DevDocDBNode } from "../../core/devDocsDBNode"
2+
3+
/** One element taken from a documentation page, together with the elements nested beneath it. */
export type DocumentPageElement_t = {
	/** Identifier assigned to the element when it is extracted. */
	id: string
	/** Name of the element (the page parser stores a lower-cased HTML tag name). */
	element: string
	/** Text carried by the element. */
	content: string
	/** Elements nested directly under this one. */
	children: Array<DocumentPageElement_t>
}

/** Contract of a documentation page that can be turned into database nodes. */
export interface IDocumentationPage {
	/** Resolves with the database nodes extracted from the page. */
	fetchNodes(): Promise<Array<DevDocDBNode>>
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import { writeFileSync } from "fs"
2+
import puppeteer from "puppeteer"
3+
4+
import { DOCUMENTATION_URL } from "../../constants"
5+
import { DevDocDBNode } from "../../core/devDocsDBNode"
6+
import { DocumentationPage } from "./documentationPage"
7+
8+
/**
 * Collects the documentation pages linked from the sidebar of
 * `DOCUMENTATION_URL`, extracts the nodes of every page and writes them to
 * `nodes.json`.
 */
export class Documentation {
	/** Upper bound on how many pages are fetched at the same time. */
	private static readonly MAX_CONCURRENT_FETCHES = 8

	/**
	 * Opens the documentation landing page in a headless browser and reads
	 * the link targets of the sidebar navigation.
	 *
	 * @returns The distinct, non-empty `href` values of the sidebar links, in
	 * the order they first appear.
	 */
	private async gatherDocumentationLinks(): Promise<string[]> {
		const browser = await puppeteer.launch({
			headless: true, // Ensure it is headless for CI environments
			args: ["--no-sandbox", "--disable-setuid-sandbox"], // Required for CI environments like GitHub Actions
		})

		// Everything after the launch sits inside the `try`, so the browser is
		// closed even when opening the page or navigating fails.
		try {
			const page = await browser.newPage()
			await page.goto(DOCUMENTATION_URL)

			const links = await page.$$eval(".leftsidebarnav a", (links) =>
				links.map((x) => x.href).filter(Boolean)
			)

			// The same page may be linked more than once; fetch each only once.
			return Array.from(new Set(links))
		} finally {
			await browser.close()
		}
	}

	/**
	 * Fetches the nodes of every linked documentation page.
	 *
	 * Pages are processed in batches of `MAX_CONCURRENT_FETCHES`. A page that
	 * fails contributes no nodes; its URL and the failure reason are reported
	 * with `console.warn` once all pages have been attempted.
	 *
	 * @returns The nodes of all pages, concatenated in link order.
	 */
	private async prepareDevDocsNodes(): Promise<DevDocDBNode[]> {
		const urls = await this.gatherDocumentationLinks()

		const docPages = urls.map((x) => new DocumentationPage(x))

		const failed: { url: string; reason: string }[] = []
		const nodes: DevDocDBNode[] = []

		for (
			let start = 0;
			start < docPages.length;
			start += Documentation.MAX_CONCURRENT_FETCHES
		) {
			const batch = docPages.slice(
				start,
				start + Documentation.MAX_CONCURRENT_FETCHES
			)

			const batchNodes = await Promise.all(
				batch.map(async (x) => {
					try {
						return await x.fetchNodes()
					} catch (error: unknown) {
						failed.push({
							url: x.url,
							reason: error instanceof Error ? error.message : String(error),
						})
						return []
					}
				})
			)

			for (const pageNodes of batchNodes) {
				nodes.push(...pageNodes)
			}
		}

		if (failed.length > 0) {
			// Earlier console output is left in place so other diagnostics survive.
			console.warn(
				`Failed to fetch nodes for the following URLs: ${failed
					.map((x) => x.url)
					.join(", ")}`
			)
			for (const { url, reason } of failed) {
				console.warn(`  ${url}: ${reason}`)
			}
		}

		return nodes
	}

	/**
	 * Extracts the nodes of the whole documentation and writes them to
	 * `nodes.json` (relative to the current working directory) as JSON
	 * indented by three spaces.
	 */
	async prepare(): Promise<void> {
		const nodes = await this.prepareDevDocsNodes()
		writeFileSync("nodes.json", JSON.stringify(nodes, null, 3))
	}
}

0 commit comments

Comments
 (0)