Skip to content

Commit 24a972e

Browse files
committed
fix: various bugs
1 parent 30fe6df commit 24a972e

File tree

2 files changed

+78
-50
lines changed

2 files changed

+78
-50
lines changed

src/docLoader.ts

Lines changed: 43 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { HTMLReader } from "@llamaindex/readers/html";
2-
import { glob } from "glob";
2+
import { glob, GlobOptions } from "glob";
33
import path from "path";
44
import fs from "fs/promises";
55
import { Document } from "llamaindex";
@@ -15,51 +15,60 @@ export async function loadCrateDocs(
1515
): Promise<Document<Metadata>[]> {
1616
const docsPath = path.join(rootDocsPath, crateName);
1717
const pattern: string = "**/*.html";
18-
const options: import("glob").GlobOptions = {
18+
const options: GlobOptions = {
1919
cwd: docsPath,
2020
ignore: "**/index.html",
2121
withFileTypes: false,
2222
};
2323

24-
const htmlFiles: string[] = await glob(pattern, options) as string[];
24+
const htmlFiles: string[] = (await glob(pattern, options)) as string[];
2525
console.log("found # html files (initial):", htmlFiles.length);
2626

27-
// Group files by basename and count occurrences
28-
const basenameGroups: { [key: string]: string[] } = {};
29-
htmlFiles.forEach((file: string) => {
30-
const basename: string = path.basename(file);
31-
if (!basenameGroups[basename]) {
32-
basenameGroups[basename] = [];
33-
}
34-
basenameGroups[basename].push(file);
35-
});
27+
// Check the dangerous flag
28+
const includeAllDangerously = process.env.INCLUDE_ALL_DOCS_DANGEROUSLY === "true";
29+
let filesToProcess: string[] = [];
30+
31+
if (includeAllDangerously) {
32+
console.warn(
33+
`WARNING: INCLUDE_ALL_DOCS_DANGEROUSLY is enabled for ${crateName}. Processing all ${htmlFiles.length} HTML files. This may incur significant time or cost (e.g., embedding calls).`
34+
);
35+
filesToProcess = htmlFiles;
36+
} else {
37+
// Group files by basename and count occurrences
38+
const basenameGroups: { [key: string]: string[] } = {};
39+
htmlFiles.forEach((file: string) => {
40+
const basename: string = path.basename(file);
41+
if (!basenameGroups[basename]) {
42+
basenameGroups[basename] = [];
43+
}
44+
basenameGroups[basename].push(file);
45+
});
3646

37-
// Filter for duplicate basenames and pick largest file
38-
const uniqueDuplicateFiles: string[] = [];
39-
for (const [basename, files] of Object.entries(basenameGroups)) {
40-
if (files.length > 1) {
41-
const largestFile = await files.reduce(
42-
async (largestPromise, current) => {
43-
const largest = await largestPromise;
44-
const largestStats = await fs.stat(path.join(docsPath, largest));
45-
const currentStats = await fs.stat(path.join(docsPath, current));
46-
return largestStats.size > currentStats.size ? largest : current;
47-
},
48-
Promise.resolve(files[0])
49-
);
50-
uniqueDuplicateFiles.push(largestFile);
47+
// Filter for duplicate basenames and pick largest file
48+
for (const [basename, files] of Object.entries(basenameGroups)) {
49+
if (files.length > 1) {
50+
const largestFile = await files.reduce(
51+
async (largestPromise, current) => {
52+
const largest = await largestPromise;
53+
const largestStats = await fs.stat(path.join(docsPath, largest));
54+
const currentStats = await fs.stat(path.join(docsPath, current));
55+
return largestStats.size > currentStats.size ? largest : current;
56+
},
57+
Promise.resolve(files[0])
58+
);
59+
filesToProcess.push(largestFile);
60+
}
5161
}
62+
console.log(
63+
"found # unique duplicate html files for crate " + crateName + ":",
64+
filesToProcess.length
65+
);
5266
}
5367

54-
console.log(
55-
"found # unique duplicate html files for crate " + crateName + ":",
56-
uniqueDuplicateFiles.length
57-
);
58-
59-
// Process the unique duplicate files with HTMLReader
68+
// Process the selected files with HTMLReader
6069
const htmlReader: HTMLReader = new HTMLReader();
6170
const docs: Document<Metadata>[][] = await Promise.all(
62-
uniqueDuplicateFiles.map(async (filePath: string) => {
71+
filesToProcess.map(async (filePath: string) => {
6372
const fullFilePath: string = path.join(docsPath, filePath);
6473
const fileDocs: Document<Metadata>[] = await htmlReader.loadData(fullFilePath);
6574
console.log("Loaded:", filePath);
@@ -73,7 +82,7 @@ export async function loadCrateDocs(
7382
);
7483

7584
const foundDocs: Document<Metadata>[] = docs.flat();
76-
console.log("found # docs from unique duplicates:", foundDocs.length);
85+
console.log("found # docs from processed files:", foundDocs.length);
7786
return foundDocs;
7887
}
7988

src/index.ts

Lines changed: 35 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ class RustDocsServer {
2626
private persistDir: string;
2727
private docsPath: string;
2828
private crateName: string;
29+
private toolName: string;
2930

3031
constructor(docsPath: string, crateName: string) {
3132
this.server = new Server(
@@ -44,24 +45,39 @@ class RustDocsServer {
4445
this.persistDir = path.join(__dirname, "storage", this.crateName);
4546
this.docsPath = docsPath;
4647
this.setupToolHandlers();
48+
this.toolName = `query_rust_docs_${this.crateName}`;
4749
}
4850

4951
private async initializeIndex() {
5052
try {
51-
const indexExists = fs.existsSync(
52-
path.join(this.persistDir, "vector_store.json")
53-
);
54-
55-
const storageContext = await storageContextFromDefaults({
56-
persistDir: this.persistDir,
57-
});
53+
const indexExists =
54+
fs.existsSync(path.join(this.persistDir, "vector_store.json")) &&
55+
fs.existsSync(path.join(this.persistDir, "doc_store.json")) &&
56+
fs.existsSync(path.join(this.persistDir, "index_store.json"));
5857

5958
if (indexExists) {
6059
console.log("Loading existing vector store from", this.persistDir);
60+
const storageContext = await storageContextFromDefaults({
61+
persistDir: this.persistDir,
62+
});
6163
this.index = await VectorStoreIndex.init({ storageContext });
6264
return;
65+
} else {
66+
// Clear only the files inside the persist directory, not the directory itself
67+
if (fs.existsSync(this.persistDir)) {
68+
const files = fs.readdirSync(this.persistDir);
69+
for (const file of files) {
70+
fs.unlinkSync(path.join(this.persistDir, file));
71+
}
72+
} else {
73+
// Ensure the directory exists if it was never created
74+
fs.mkdirSync(this.persistDir, { recursive: true });
75+
}
6376
}
6477

78+
const storageContext = await storageContextFromDefaults({
79+
persistDir: this.persistDir,
80+
});
6581
this.index = await VectorStoreIndex.fromDocuments(
6682
await loadCrateDocs(this.docsPath, this.crateName),
6783
{
@@ -80,7 +96,7 @@ class RustDocsServer {
8096
this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
8197
tools: [
8298
{
83-
name: "query_rust_docs",
99+
name: this.toolName,
84100
description: `Query the official Rust documentation for the '${this.crateName}' crate. Use this tool to retrieve detailed information about '${this.crateName}'’s API, including structs, traits, enums, constants, and functions. Ideal for answering technical questions about how to use '${this.crateName}' in Rust projects, such as understanding specific methods, configuration options, or integration details. Additionally, leverage this tool to ensure accuracy of written code by verifying API usage and to resolve Clippy or lint errors by clarifying correct implementations. For example, use it for questions like "How do I configure routing in ${this.crateName}?", "What does this ${this.crateName} struct do?", "Is this ${this.crateName} method call correct?", or "How do I fix a Clippy warning about ${this.crateName} in my code?"`,
85101
inputSchema: {
86102
type: "object",
@@ -100,36 +116,36 @@ class RustDocsServer {
100116
},
101117
],
102118
}));
103-
119+
104120
this.server.setRequestHandler(
105121
CallToolRequestSchema,
106122
async (request: CallToolRequest) => {
107-
if (request.params.name !== "query_rust_docs") {
123+
if (request.params.name !== this.toolName) {
108124
throw new McpError(
109125
ErrorCode.MethodNotFound,
110126
`Unknown tool: ${request.params.name}`
111127
);
112128
}
113-
129+
114130
const { question, crate } = request.params.arguments as {
115131
question: string;
116132
crate: string;
117133
};
118-
134+
119135
if (crate !== this.crateName) {
120136
throw new McpError(
121137
ErrorCode.InvalidParams,
122138
`This server only supports queries for '${this.crateName}', not '${crate}'`
123139
);
124140
}
125-
141+
126142
try {
127143
const queryEngine = this.index.asQueryEngine();
128144
const response: EngineResponse = await queryEngine.query({
129145
query: question,
130146
stream: false,
131147
});
132-
148+
133149
return {
134150
content: [
135151
{
@@ -162,9 +178,12 @@ if (!process.env.CRATE_NAME) {
162178
if (!process.env.DOCS_PATH) {
163179
throw new Error("DOCS_PATH environment variable must be set");
164180
}
165-
if (!process.env.OPENAI_API_KEY) {
181+
if (!process.env.OPENAI_API_KEY) {
166182
throw new Error("OPENAI_API_KEY environment variable must be set");
167183
}
168184

169-
const server = new RustDocsServer(process.env.DOCS_PATH, process.env.CRATE_NAME);
185+
const server = new RustDocsServer(
186+
process.env.DOCS_PATH,
187+
process.env.CRATE_NAME
188+
);
170189
server.run().catch(console.error);

0 commit comments

Comments
 (0)