Skip to content

Commit 9998fcd

Browse files
committed
feat: vector collapse
1 parent e4b58d0 commit 9998fcd

File tree

16 files changed

+789
-15
lines changed

16 files changed

+789
-15
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
"@speakeasy-api/docs-mcp-core": minor
3+
"@speakeasy-api/docs-mcp-cli": minor
4+
"@speakeasy-api/docs-mcp-server": minor
5+
---
6+
7+
Add `taxonomy` manifest field with `vector_collapse` option for collapsing content-equivalent search results across variant axes (e.g. the same API operation documented in multiple SDK languages). At search time, results sharing the same content identity are collapsed to the highest-scoring variant. On a realistic 30MB multi-language corpus this improved facet precision by 27%, MRR@5 by 10%, and NDCG@5 by 15%.

packages/cli/src/index.ts

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,14 @@ import {
1616
embedChunksIncremental,
1717
loadCache,
1818
loadChunksFromPreviousIndex,
19+
mergeTaxonomyConfigs,
1920
parseManifestJson,
2021
resolveFileConfig,
2122
saveCache,
2223
type Chunk,
2324
type EmbedProgressEvent,
2425
type EmbeddingMetadata,
26+
type ManifestTaxonomyFieldConfig,
2527
type IndexBuildStep,
2628
type Manifest,
2729
type PreviousIndexReader
@@ -215,6 +217,8 @@ program
215217
const cacheSuffix = chunkCacheHits > 0 ? ` (${chunkCacheHits} cached)` : "";
216218
console.warn(`Chunked ${files.length} files into ${chunks.length.toLocaleString()} chunks${cacheSuffix}`);
217219

220+
const taxonomyConfig = mergeTaxonomyConfigs(manifestCache.values());
221+
218222
const providerInput: {
219223
provider: "none" | "hash" | "openai";
220224
model?: string;
@@ -318,10 +322,18 @@ program
318322
files,
319323
options.description,
320324
embeddingMetadata,
321-
sourceCommit
325+
sourceCommit,
326+
taxonomyConfig
322327
);
323328
const metadataKeys = Object.keys(metadata.taxonomy);
324329

330+
// Warn about taxonomy config keys that don't match any chunk metadata
331+
for (const key of Object.keys(taxonomyConfig)) {
332+
if (!metadata.taxonomy[key]) {
333+
console.warn(`warn: taxonomy config key '${key}' does not match any chunk metadata — this configuration has no effect`);
334+
}
335+
}
336+
325337
// Close previous index before writing the new one
326338
previousIndex?.close();
327339

@@ -466,11 +478,12 @@ function buildMetadata(
466478
files: string[],
467479
corpusDescription: string,
468480
embedding: EmbeddingMetadata | null,
469-
sourceCommit: string | null
481+
sourceCommit: string | null,
482+
taxonomyConfig: Record<string, ManifestTaxonomyFieldConfig>
470483
): {
471484
metadata_version: string;
472485
corpus_description: string;
473-
taxonomy: Record<string, { description: string; values: string[] }>;
486+
taxonomy: Record<string, { description: string; values: string[]; vector_collapse?: boolean }>;
474487
stats: {
475488
total_chunks: number;
476489
total_files: number;
@@ -488,11 +501,13 @@ function buildMetadata(
488501
}
489502
}
490503

491-
const taxonomy: Record<string, { description: string; values: string[] }> = {};
504+
const taxonomy: Record<string, { description: string; values: string[]; vector_collapse?: boolean }> = {};
492505
for (const [key, values] of taxonomyValues.entries()) {
506+
const config = taxonomyConfig[key];
493507
taxonomy[key] = {
494508
description: `Filter results by ${key}.`,
495-
values: [...values].sort((a, b) => a.localeCompare(b))
509+
values: [...values].sort((a, b) => a.localeCompare(b)),
510+
...(config?.vector_collapse ? { vector_collapse: true } : {})
496511
};
497512
}
498513

packages/core/src/lancedb.ts

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import {
99
import { decodeSearchCursor, encodeSearchCursor } from "./cursor.js";
1010
import {
1111
clampLimit,
12+
dedupKey,
1213
isChunkIdFormat,
1314
makeSnippet
1415
} from "./search-common.js";
@@ -48,6 +49,7 @@ export interface OpenLanceDbSearchEngineOptions {
4849
dbPath: string;
4950
tableName?: string;
5051
metadataKeys: string[];
52+
collapseKeys?: string[];
5153
proximityWeight?: number;
5254
phraseSlop?: number;
5355
queryEmbeddingProvider?: EmbeddingProvider;
@@ -127,6 +129,7 @@ export async function buildLanceDbIndex(
127129
export class LanceDbSearchEngine implements SearchEngine {
128130
private readonly table: Table;
129131
private readonly metadataKeys: string[];
132+
private readonly collapseKeys: string[];
130133
private readonly proximityWeight: number;
131134
private readonly phraseSlop: number;
132135
private readonly queryEmbeddingProvider: EmbeddingProvider | undefined;
@@ -138,13 +141,15 @@ export class LanceDbSearchEngine implements SearchEngine {
138141
table: Table,
139142
metadataKeys: string[],
140143
options: DocsIndexOptions & {
144+
collapseKeys?: string[];
141145
queryEmbeddingProvider?: EmbeddingProvider;
142146
vectorWeight?: number;
143147
onWarning?: (message: string) => void;
144148
} = {}
145149
) {
146150
this.table = table;
147151
this.metadataKeys = [...metadataKeys];
152+
this.collapseKeys = options.collapseKeys ?? [];
148153
this.proximityWeight = options.proximityWeight ?? 1.25;
149154
this.phraseSlop = normalizePhraseSlop(options.phraseSlop);
150155
this.queryEmbeddingProvider = options.queryEmbeddingProvider;
@@ -156,10 +161,14 @@ export class LanceDbSearchEngine implements SearchEngine {
156161
const db = await connect(options.dbPath);
157162
const table = await db.openTable(options.tableName ?? DEFAULT_TABLE_NAME);
158163
const engineOptions: DocsIndexOptions & {
164+
collapseKeys?: string[];
159165
queryEmbeddingProvider?: EmbeddingProvider;
160166
vectorWeight?: number;
161167
onWarning?: (message: string) => void;
162168
} = {};
169+
if (options.collapseKeys !== undefined) {
170+
engineOptions.collapseKeys = options.collapseKeys;
171+
}
163172
if (options.proximityWeight !== undefined) {
164173
engineOptions.proximityWeight = options.proximityWeight;
165174
}
@@ -211,14 +220,21 @@ export class LanceDbSearchEngine implements SearchEngine {
211220
const phraseWeight = request.rrf_weights?.phrase ?? this.proximityWeight;
212221
const vecWeight = request.rrf_weights?.vector ?? this.vectorWeight;
213222
const blended = blendRows(matchRows, phraseRows, vectorRows, phraseWeight, vecWeight, matchWeight);
214-
const paged = blended.slice(offset, offset + limit);
223+
224+
// Collapse content-equivalent results across variant axes (e.g. same
225+
// operation documented in multiple SDK languages). Skipped when active
226+
// filters already restrict every collapse axis to a single value.
227+
const activeCollapseKeys = this.collapseKeys.filter((k) => !filters[k]);
228+
const deduped = deduplicateRows(blended, activeCollapseKeys);
229+
230+
const paged = deduped.slice(offset, offset + limit);
215231

216232
const hits = paged.map((entry) =>
217233
toSearchHit(entry.row, entry.score, query, this.metadataKeys)
218234
);
219235

220236
const nextOffset = offset + paged.length;
221-
const nextCursor = nextOffset < blended.length
237+
const nextCursor = nextOffset < deduped.length
222238
? encodeSearchCursor({ offset: nextOffset, limit }, { query, filters })
223239
: null;
224240

@@ -467,6 +483,28 @@ function blendRows(
467483
});
468484
}
469485

486+
function deduplicateRows(
487+
rows: Array<{ row: ChunkRow; score: number }>,
488+
collapseKeys: string[]
489+
): Array<{ row: ChunkRow; score: number }> {
490+
if (collapseKeys.length === 0) return rows;
491+
492+
const seen = new Set<string>();
493+
return rows.filter((entry) => {
494+
const key = dedupKey(
495+
expectStringField(entry.row, "filepath"),
496+
expectStringField(entry.row, "heading"),
497+
expectStringField(entry.row, "chunk_id"),
498+
(k) => expectStringField(entry.row, k),
499+
collapseKeys
500+
);
501+
if (key === null) return true;
502+
if (seen.has(key)) return false;
503+
seen.add(key);
504+
return true;
505+
});
506+
}
507+
470508
function buildWhereClause(filters: Record<string, string>, taxonomyKeys?: string[]): string | null {
471509
const clauses: string[] = [];
472510

packages/core/src/manifest-schema.ts

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,23 @@ export const ManifestOverrideSchema = z
5151
"Overrides the default chunking strategy and/or metadata for files matching a glob pattern. Within the overrides array, later matches take precedence."
5252
);
5353

54+
export const TaxonomyFieldConfigSchema = z
55+
.object({
56+
vector_collapse: z
57+
.boolean()
58+
.default(false)
59+
.describe(
60+
"When true, this taxonomy dimension identifies content variants that are near-identical in vector space (e.g. the same API operation documented in multiple SDK languages). At search time, results sharing the same content identity — determined by normalizing this field's value out of the filepath — are collapsed to the highest-scoring result. Has no effect when a filter for this field is active, since the filter already restricts to a single value."
61+
),
62+
})
63+
.describe("Configuration for a taxonomy field's search-time behavior.");
64+
65+
export const ManifestTaxonomyConfigSchema = z
66+
.record(z.string(), TaxonomyFieldConfigSchema)
67+
.describe(
68+
"Per-field configuration for taxonomy dimensions. Controls search-time behavior such as cross-language result collapsing."
69+
);
70+
5471
export const ManifestSchema = z
5572
.object({
5673
version: z
@@ -67,6 +84,8 @@ export const ManifestSchema = z
6784
"Key-value pairs attached to every chunk produced from this directory tree. Each key becomes a filterable taxonomy dimension exposed as an enum parameter on the search tool."
6885
)
6986
.meta({ examples: [{ language: "typescript", scope: "sdk-specific" }] }),
87+
taxonomy: ManifestTaxonomyConfigSchema.optional()
88+
.meta({ examples: [{ language: { vector_collapse: true } }] }),
7089
overrides: z
7190
.array(ManifestOverrideSchema)
7291
.optional()

packages/core/src/manifest.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,37 @@
11
import matter from "gray-matter";
22
import picomatch from "picomatch";
3+
import { ManifestTaxonomyConfigSchema } from "./manifest-schema.js";
34
import type {
45
ChunkingStrategy,
56
Manifest,
67
ManifestOverride,
8+
ManifestTaxonomyFieldConfig,
79
ResolvedFileConfig
810
} from "./types.js";
911

1012
const DEFAULT_STRATEGY: ChunkingStrategy = { chunk_by: "h2" };
1113

14+
/**
15+
* Union-merges taxonomy field configs from multiple manifests. If any manifest
16+
* sets `vector_collapse: true` for a key, the merged result includes it.
17+
*/
18+
export function mergeTaxonomyConfigs(
19+
manifests: Iterable<Manifest>
20+
): Record<string, ManifestTaxonomyFieldConfig> {
21+
const merged: Record<string, ManifestTaxonomyFieldConfig> = {};
22+
23+
for (const manifest of manifests) {
24+
if (!manifest.taxonomy) continue;
25+
for (const [key, config] of Object.entries(manifest.taxonomy)) {
26+
if (config.vector_collapse) {
27+
merged[key] = { ...merged[key], vector_collapse: true };
28+
}
29+
}
30+
}
31+
32+
return merged;
33+
}
34+
1235
const HTML_HINT_REGEX = /<!--\s*mcp_chunking_hint:\s*(\{[^}]+\})\s*-->/;
1336

1437
export function parseManifest(input: unknown): Manifest {
@@ -30,6 +53,13 @@ export function parseManifest(input: unknown): Manifest {
3053
if (manifest.metadata) {
3154
parsed.metadata = parseMetadata(manifest.metadata, "metadata");
3255
}
56+
if (manifest.taxonomy) {
57+
try {
58+
parsed.taxonomy = ManifestTaxonomyConfigSchema.parse(manifest.taxonomy);
59+
} catch (err) {
60+
throw new Error(`Invalid taxonomy config: ${err instanceof Error ? err.message : String(err)}`);
61+
}
62+
}
3363
if (manifest.overrides) {
3464
parsed.overrides = parseOverrides(manifest.overrides);
3565
}

packages/core/src/metadata.ts

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,16 @@
11
import semver from "semver";
22
import type { CorpusMetadata, EmbeddingMetadata, TaxonomyField } from "./types.js";
33

4+
/**
5+
* Returns taxonomy keys that have `vector_collapse: true`, i.e. dimensions
6+
* whose values should be collapsed at search time.
7+
*/
8+
export function getCollapseKeys(taxonomy: Record<string, TaxonomyField>): string[] {
9+
return Object.entries(taxonomy)
10+
.filter(([, field]) => field.vector_collapse === true)
11+
.map(([key]) => key);
12+
}
13+
414
const LIMITS = {
515
maxKeys: 64,
616
maxKeyLength: 64,
@@ -81,9 +91,13 @@ function normalizeTaxonomy(value: unknown): Record<string, TaxonomyField> {
8191
const values = normalizeValues(field.values, key);
8292
const description = field.description === undefined ? undefined : asTrimmedString(field.description);
8393

84-
normalized[key] = description
85-
? { description, values }
86-
: { values };
94+
const vectorCollapse = field.vector_collapse === true ? true : undefined;
95+
96+
normalized[key] = {
97+
...(description ? { description } : {}),
98+
values,
99+
...(vectorCollapse ? { vector_collapse: true } : {})
100+
};
87101
}
88102

89103
return normalized;

packages/core/src/search-common.ts

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,42 @@ export function makeSnippet(content: string, query: string): string {
4848
return `${prefix}${normalized.slice(start, end).trim()}${suffix}`;
4949
}
5050

51+
/**
52+
* Computes a dedup key for collapsing content-equivalent results across
53+
* taxonomy variant axes (e.g. the same operation documented in multiple SDK
54+
* languages). Returns null when no collapsing applies.
55+
*/
56+
export function dedupKey(
57+
filepath: string,
58+
heading: string,
59+
chunkId: string,
60+
getMetadataValue: (key: string) => string,
61+
collapseKeys: string[]
62+
): string | null {
63+
if (collapseKeys.length === 0) return null;
64+
65+
const parts = filepath.split("/");
66+
let anyNormalized = false;
67+
68+
for (const key of collapseKeys) {
69+
const value = getMetadataValue(key);
70+
if (!value) return null;
71+
72+
const idx = parts.indexOf(value);
73+
if (idx >= 0) {
74+
parts[idx] = "*";
75+
anyNormalized = true;
76+
}
77+
}
78+
79+
if (!anyNormalized) return null;
80+
81+
const partMatch = chunkId.match(/-part-(\d+)$/);
82+
const partSuffix = partMatch ? `:${partMatch[1]}` : "";
83+
84+
return `${parts.join("/")}:${heading}${partSuffix}`;
85+
}
86+
5187
export function matchesMetadataFilters(
5288
metadata: Record<string, string>,
5389
filters: Record<string, string>,

0 commit comments

Comments
 (0)