-
Notifications
You must be signed in to change notification settings - Fork 2.5k
feat: add deterministic naming for Qdrant collections #7943
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| import { QdrantClient } from "@qdrant/js-client-rest" | ||
| import { createHash } from "crypto" | ||
| import * as fs from "fs" | ||
|
|
||
| import { QdrantVectorStore } from "../qdrant-client" | ||
| import { getWorkspacePath } from "../../../../utils/path" | ||
|
|
@@ -8,6 +9,7 @@ import { DEFAULT_MAX_SEARCH_RESULTS, DEFAULT_SEARCH_MIN_SCORE } from "../../cons | |
| // Mocks | ||
| vitest.mock("@qdrant/js-client-rest") | ||
| vitest.mock("crypto") | ||
| vitest.mock("fs") | ||
| vitest.mock("../../../../utils/path") | ||
| vitest.mock("../../../../i18n", () => ({ | ||
| t: (key: string, params?: any) => { | ||
|
|
@@ -68,6 +70,10 @@ describe("QdrantVectorStore", () => { | |
| // Mock getWorkspacePath | ||
| ;(getWorkspacePath as any).mockReturnValue(mockWorkspacePath) | ||
|
|
||
| // Mock fs functions - default to no git repo and no config file | ||
| ;(fs.existsSync as any).mockReturnValue(false) | ||
| ;(fs.readFileSync as any).mockReturnValue("") | ||
|
|
||
| vectorStore = new QdrantVectorStore(mockWorkspacePath, mockQdrantUrl, mockVectorSize, mockApiKey) | ||
| }) | ||
|
|
||
|
|
@@ -82,13 +88,101 @@ describe("QdrantVectorStore", () => { | |
| "User-Agent": "Roo-Code", | ||
| }, | ||
| }) | ||
| // When no git repo or custom config, should fall back to workspace-based hash | ||
| expect(createHash).toHaveBeenCalledWith("sha256") | ||
| expect(mockCreateHashInstance.update).toHaveBeenCalledWith(mockWorkspacePath) | ||
| expect(mockCreateHashInstance.digest).toHaveBeenCalledWith("hex") | ||
| // Access private member for testing constructor logic (not ideal, but necessary here) | ||
| expect((vectorStore as any).collectionName).toBe(expectedCollectionName) | ||
| expect((vectorStore as any).vectorSize).toBe(mockVectorSize) | ||
| }) | ||
|
|
||
| it("should use custom collection name from .roo/codebase-index.json if available", () => { | ||
| // Mock the config file to exist and contain a custom collection name | ||
| ;(fs.existsSync as any).mockImplementation((path: string) => { | ||
| return path.includes(".roo/codebase-index.json") | ||
| }) | ||
| ;(fs.readFileSync as any).mockReturnValue( | ||
| JSON.stringify({ | ||
| collectionName: "my-custom-collection", | ||
| }), | ||
| ) | ||
|
|
||
| const customVectorStore = new QdrantVectorStore(mockWorkspacePath, mockQdrantUrl, mockVectorSize, mockApiKey) | ||
|
|
||
| // Should use the sanitized custom collection name | ||
| expect((customVectorStore as any).collectionName).toBe("my-custom-collection") | ||
| }) | ||
|
|
||
| it("should use git repository URL for deterministic naming when available", () => { | ||
| // Mock git config to exist | ||
| ;(fs.existsSync as any).mockImplementation((path: string) => { | ||
| return path.includes(".git") | ||
| }) | ||
| ;(fs.readFileSync as any).mockImplementation((path: string) => { | ||
| if (path.includes(".git/config")) { | ||
| return `[remote "origin"] | ||
| url = [email protected]:user/repo.git` | ||
| } | ||
| return "" | ||
| }) | ||
|
|
||
| // Mock createHash for the git URL | ||
| const gitUrlHash = "gitrepo1234567890abcdef" | ||
| mockCreateHashInstance.digest.mockReturnValueOnce(gitUrlHash) | ||
|
|
||
| const gitVectorStore = new QdrantVectorStore(mockWorkspacePath, mockQdrantUrl, mockVectorSize, mockApiKey) | ||
|
|
||
| // Should use repo- prefix with git URL hash | ||
| expect((gitVectorStore as any).collectionName).toBe(`repo-${gitUrlHash.substring(0, 16)}`) | ||
|
|
||
| // Verify it hashed the normalized git URL | ||
| expect(mockCreateHashInstance.update).toHaveBeenCalledWith("https://github.com/user/repo") | ||
| }) | ||
|
|
||
| it("should normalize different git URL formats consistently", () => { | ||
| const testCases = [ | ||
| { input: "[email protected]:user/repo.git", expected: "https://github.com/user/repo" }, | ||
| { input: "https://github.com/user/repo.git", expected: "https://github.com/user/repo" }, | ||
| { input: "ssh://[email protected]/user/repo.git", expected: "https://github.com/user/repo" }, | ||
| { input: "https://[email protected]/user/repo.git", expected: "https://github.com/user/repo" }, | ||
| ] | ||
|
|
||
| testCases.forEach(({ input, expected }) => { | ||
| vitest.clearAllMocks() | ||
| ;(fs.existsSync as any).mockImplementation((path: string) => path.includes(".git")) | ||
| ;(fs.readFileSync as any).mockImplementation((path: string) => { | ||
| if (path.includes(".git/config")) { | ||
| return `[remote "origin"]\n\turl = ${input}` | ||
| } | ||
| return "" | ||
| }) | ||
|
|
||
| const gitUrlHash = "normalized1234567890abcdef" | ||
| mockCreateHashInstance.digest.mockReturnValueOnce(gitUrlHash) | ||
|
|
||
| const store = new QdrantVectorStore(mockWorkspacePath, mockQdrantUrl, mockVectorSize, mockApiKey) | ||
|
|
||
| // Verify it hashed the normalized URL | ||
| expect(mockCreateHashInstance.update).toHaveBeenCalledWith(expected) | ||
| }) | ||
| }) | ||
|
|
||
| it("should sanitize custom collection names to be Qdrant-compatible", () => { | ||
| ;(fs.existsSync as any).mockImplementation((path: string) => { | ||
| return path.includes(".roo/codebase-index.json") | ||
| }) | ||
| ;(fs.readFileSync as any).mockReturnValue( | ||
| JSON.stringify({ | ||
| collectionName: "My Custom Collection!@#$%", | ||
| }), | ||
| ) | ||
|
|
||
| const customVectorStore = new QdrantVectorStore(mockWorkspacePath, mockQdrantUrl, mockVectorSize, mockApiKey) | ||
|
|
||
| // Should sanitize the collection name | ||
| expect((customVectorStore as any).collectionName).toBe("my-custom-collection") | ||
| }) | ||
| it("should handle constructor with default URL when none provided", () => { | ||
| const vectorStoreWithDefaults = new QdrantVectorStore(mockWorkspacePath, undefined as any, mockVectorSize) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,7 @@ | ||
| import { QdrantClient, Schemas } from "@qdrant/js-client-rest" | ||
| import { createHash } from "crypto" | ||
| import * as path from "path" | ||
| import * as fs from "fs" | ||
| import { getWorkspacePath } from "../../../utils/path" | ||
| import { IVectorStore } from "../interfaces/vector-store" | ||
| import { Payload, VectorStoreSearchResult } from "../interfaces" | ||
|
|
@@ -77,10 +78,151 @@ export class QdrantVectorStore implements IVectorStore { | |
| }) | ||
| } | ||
|
|
||
| // Generate collection name from workspace path | ||
| const hash = createHash("sha256").update(workspacePath).digest("hex") | ||
| // Generate deterministic collection name | ||
| this.vectorSize = vectorSize | ||
| this.collectionName = `ws-${hash.substring(0, 16)}` | ||
| this.collectionName = this.generateCollectionName(workspacePath) | ||
| } | ||
|
|
||
| /** | ||
| * Generates a deterministic collection name based on repository or workspace | ||
| * @param workspacePath Path to the workspace | ||
| * @returns Collection name | ||
| */ | ||
| private generateCollectionName(workspacePath: string): string { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider adding JSDoc comments to these new private methods explaining the fallback hierarchy (custom name → git URL → workspace path). This would make the code more self-documenting for future contributors. |
||
| // First, check for a custom collection name in .roo/codebase-index.json | ||
| const customName = this.loadCustomCollectionName(workspacePath) | ||
| if (customName) { | ||
| // Sanitize the custom name to ensure it's valid for Qdrant | ||
| return this.sanitizeCollectionName(customName) | ||
| } | ||
|
|
||
| // Try to get git repository information for deterministic naming | ||
| const gitInfo = this.getGitInfoSync(workspacePath) | ||
| if (gitInfo?.repositoryUrl) { | ||
| // Use repository URL to generate a deterministic name | ||
| // This ensures the same collection name across worktrees and developers | ||
| const normalizedUrl = this.normalizeGitUrl(gitInfo.repositoryUrl) | ||
| const hash = createHash("sha256").update(normalizedUrl).digest("hex") | ||
| return `repo-${hash.substring(0, 16)}` | ||
| } | ||
|
|
||
| // Fallback to workspace path hash (original behavior) | ||
| const hash = createHash("sha256").update(workspacePath).digest("hex") | ||
| return `ws-${hash.substring(0, 16)}` | ||
| } | ||
|
|
||
| /** | ||
| * Loads custom collection name from .roo/codebase-index.json if it exists | ||
| * @param workspacePath Path to the workspace | ||
| * @returns Custom collection name or undefined | ||
| */ | ||
| private loadCustomCollectionName(workspacePath: string): string | undefined { | ||
| try { | ||
| const configPath = path.join(workspacePath, ".roo", "codebase-index.json") | ||
| if (fs.existsSync(configPath)) { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Minor performance note: These synchronous file operations could potentially block the event loop. Since this only happens during initialization it's probably fine, but worth noting for future optimization if initialization performance becomes a concern. |
||
| const config = JSON.parse(fs.readFileSync(configPath, "utf8")) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Currently this silently falls back when JSON parsing fails. Should we consider logging malformed JSON at a higher level than console.warn to help users debug configuration issues? Or perhaps add validation for the expected schema? |
||
| if (config.collectionName && typeof config.collectionName === "string") { | ||
| return config.collectionName | ||
| } | ||
| } | ||
| } catch (error) { | ||
| // Ignore errors reading config file | ||
| console.warn( | ||
| `[QdrantVectorStore] Could not read custom collection name from .roo/codebase-index.json:`, | ||
| error, | ||
| ) | ||
| } | ||
| return undefined | ||
| } | ||
|
|
||
| /** | ||
| * Synchronously gets git repository information | ||
| * @param workspacePath Path to the workspace | ||
| * @returns Git repository info or undefined | ||
| */ | ||
| private getGitInfoSync(workspacePath: string): { repositoryUrl?: string } | undefined { | ||
| try { | ||
| const gitDir = path.join(workspacePath, ".git") | ||
|
|
||
| // Check if .git directory exists | ||
| if (!fs.existsSync(gitDir)) { | ||
| return undefined | ||
| } | ||
|
|
||
| // Try to read git config file | ||
| const configPath = path.join(gitDir, "config") | ||
| if (fs.existsSync(configPath)) { | ||
| const configContent = fs.readFileSync(configPath, "utf8") | ||
|
|
||
| // Extract remote URL | ||
| const urlMatch = configContent.match(/url\s*=\s*(.+?)(?:\r?\n|$)/m) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This regex pattern might not handle all edge cases in git config files (e.g., URLs with spaces or special characters). Have you considered using a more robust git config parser library, or should we add more comprehensive error handling for edge cases? |
||
| if (urlMatch && urlMatch[1]) { | ||
| const url = urlMatch[1].trim() | ||
| return { repositoryUrl: url } | ||
| } | ||
| } | ||
| } catch (error) { | ||
| // Ignore errors and fall back to workspace-based naming | ||
| console.warn(`[QdrantVectorStore] Could not read git repository info:`, error) | ||
| } | ||
| return undefined | ||
| } | ||
|
|
||
| /** | ||
| * Normalizes a git URL for consistent hashing | ||
| * @param url Git URL to normalize | ||
| * @returns Normalized URL | ||
| */ | ||
| private normalizeGitUrl(url: string): string { | ||
| try { | ||
| // Remove credentials from HTTPS URLs | ||
| let normalized = url | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The credential removal logic here is good, but consider adding a comment explaining why this is important for security (preventing credential leakage in collection names). This helps future maintainers understand the security implications. |
||
| if (url.startsWith("https://") || url.startsWith("http://")) { | ||
| try { | ||
| const urlObj = new URL(url) | ||
| urlObj.username = "" | ||
| urlObj.password = "" | ||
| normalized = urlObj.toString() | ||
| } catch { | ||
| // If URL parsing fails, just remove obvious credentials | ||
| normalized = url.replace(/^https?:\/\/[^@]+@/, "https://") | ||
| } | ||
| } | ||
|
|
||
| // Convert SSH to HTTPS format for consistency | ||
| if (normalized.startsWith("git@")) { | ||
| normalized = normalized.replace(/^git@([^:]+):/, "https://$1/") | ||
| } else if (normalized.startsWith("ssh://")) { | ||
| normalized = normalized.replace(/^ssh:\/\/(?:git@)?([^\/]+)\//, "https://$1/") | ||
| } | ||
|
|
||
| // Remove .git suffix | ||
| normalized = normalized.replace(/\.git$/, "") | ||
|
|
||
| // Convert to lowercase for consistency | ||
| normalized = normalized.toLowerCase() | ||
|
|
||
| return normalized | ||
| } catch (error) { | ||
| // If normalization fails, return the original URL | ||
| console.warn(`[QdrantVectorStore] Could not normalize git URL:`, error) | ||
| return url.toLowerCase() | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Sanitizes a collection name to ensure it's valid for Qdrant | ||
| * @param name Collection name to sanitize | ||
| * @returns Sanitized collection name | ||
| */ | ||
| private sanitizeCollectionName(name: string): string { | ||
| // Qdrant collection names must be alphanumeric with underscores or hyphens | ||
| // Max length is typically 255 characters | ||
| return name | ||
| .toLowerCase() | ||
| .replace(/[^a-z0-9_-]/g, "-") | ||
| .replace(/^-+|-+$/g, "") // Remove leading/trailing hyphens | ||
| .substring(0, 255) | ||
| } | ||
|
|
||
| /** | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great test coverage! Consider adding a few more edge case tests:
These would help ensure robustness across different user configurations.