Skip to content

Commit 76f657b

Browse files
committed
feat(schemas): implement schema inference for MongoDB collections
- Added functionality to infer a schema from sample documents in MongoDB collections, including detection of field types, nullability, and nested schemas.
- Enhanced the `handleReadResourceRequest` function to use the new schema inference logic, improving the accuracy of the returned collection schema.
- Implemented fallback logic for sample document retrieval to keep the request working if the aggregation fails.

This enhancement provides a more comprehensive understanding of the structure of MongoDB collections.

Resolves #14
1 parent 783c928 commit 76f657b

File tree

1 file changed

+223
-21
lines changed

1 file changed

+223
-21
lines changed

src/schemas/resource.ts

Lines changed: 223 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,173 @@ import type {
77
Db,
88
IndexDescriptionInfo,
99
MongoClient,
10+
Document,
1011
} from "mongodb";
12+
import { ObjectId } from "mongodb";
13+
14+
// Type definitions used by schema inference and by the collection resource payload.

/**
 * Mutable per-field accumulator filled in while sample documents are scanned.
 */
interface FieldInfo {
  /** Field name as it appears in the sampled documents. */
  name: string;
  /** Every type label observed for this field across the samples. */
  types: Set<string>;
  /** Whether the field was found to be optional in the sampled documents. */
  nullable: boolean;
  /** A small number of distinct raw values seen for this field. */
  samples: Array<unknown>;
  /** Schema of the embedded documents stored under this field, when there are any. */
  nestedSchema?: SchemaResult;
}

/**
 * Result of inferring a schema from a set of documents.
 */
interface SchemaResult {
  /** One summary entry per distinct field name. */
  fields: Array<FieldSummary>;
}

/**
 * Serializable description of a single field.
 */
interface FieldSummary {
  /** Field name as it appears in the sampled documents. */
  name: string;
  /** Type labels observed for this field. */
  types: Array<string>;
  /** Whether the field was found to be optional in the sampled documents. */
  nullable: boolean;
  /** Share of sampled documents containing the field, formatted like "75%". */
  prevalence: string;
  /** Simplified example values for the field. */
  examples: Array<unknown>;
  /** Schema of the embedded documents stored under this field, when there are any. */
  nestedSchema?: SchemaResult;
}

/**
 * Payload describing one collection: inferred fields, indexes and bookkeeping data.
 */
interface CollectionSchema {
  /** Kind of resource being described. */
  type: string;
  /** Collection name. */
  name: string;
  /** Fields inferred from the sampled documents. */
  fields: Array<FieldSummary>;
  /** Name and key specification of each index on the collection. */
  indexes: {
    name: string | undefined;
    keys: Record<string, unknown>;
  }[];
  /** Number of documents, a textual placeholder when it could not be determined, or null. */
  documentCount: number | string | null;
  /** Number of documents the inferred fields are based on. */
  sampleSize: number;
  /** Timestamp string recording when this description was produced. */
  lastUpdated: string;
}
48+
49+
/**
 * Detects the MongoDB-specific type of a value.
 *
 * Primitives are reported with their `typeof` name, `null` and `undefined`
 * by name, and arrays as `Array`, `Array<T>` (all elements share one type)
 * or `Array<mixed>`. BSON wrapper objects (ObjectId, Decimal128, Binary,
 * Timestamp, ...) are recognised through the `_bsontype` tag that the bson
 * library puts on its classes, so they are no longer reported as `Document`.
 * Reading the tag instead of using `instanceof` also keeps detection working
 * when a value was created by a different copy of the bson package.
 *
 * @param value The value to detect the type of
 * @returns A string representing the detected type
 */
function detectMongoType(value: unknown): string {
  if (value === null) return 'null';
  if (value === undefined) return 'undefined';

  // Everything that is not an object is fully described by `typeof`.
  if (typeof value !== 'object') return typeof value;

  if (value instanceof Date) return 'Date';
  if (value instanceof RegExp) return 'RegExp';
  // Raw byte arrays (including Node Buffers) are binary payloads, not documents.
  // NOTE(review): presumably only seen when the driver is configured to return
  // buffers instead of Binary wrappers — confirm against the client options.
  if (value instanceof Uint8Array) return 'Binary';

  if (Array.isArray(value)) {
    if (value.length === 0) return 'Array';

    // Collapse the element types; a single distinct type yields Array<T>.
    const elementTypes = new Set(value.map(item => detectMongoType(item)));
    if (elementTypes.size === 1) {
      return `Array<${Array.from(elementTypes)[0]}>`;
    }
    return 'Array<mixed>';
  }

  // BSON wrapper classes carry a string `_bsontype` tag naming the type.
  const bsonType = (value as { _bsontype?: unknown })._bsontype;
  if (typeof bsonType === 'string') {
    // Older bson releases spell the ObjectId tag as 'ObjectID'.
    return bsonType === 'ObjectID' ? 'ObjectId' : bsonType;
  }

  // Any remaining object is treated as an embedded document.
  return 'Document';
}
78+
79+
/**
 * Helper function to infer a schema from multiple documents.
 *
 * Every document is scanned once. For each field name the function records
 * the set of detected types, how many documents contain the field, up to
 * three distinct sample values, and the embedded documents stored under it.
 * Embedded documents are then summarised recursively into `nestedSchema`.
 *
 * A field is reported as nullable when it is missing from at least one
 * document or when any document holds `null`/`undefined` for it.
 *
 * @param documents Array of sample documents from the collection
 * @returns Inferred schema with field names and types; `{ fields: [] }` when
 *          no documents are given
 */
function inferSchemaFromSamples(documents: Document[]): SchemaResult {
  if (!documents || documents.length === 0) {
    return { fields: [] };
  }

  // Maximum number of distinct example values kept per field.
  const MAX_SAMPLES = 3;

  // FieldInfo plus bookkeeping that only this function needs.
  type FieldAccumulator = FieldInfo & {
    occurrences: number;
    sampleKeys: Set<string>;
    nestedDocs: Document[];
  };

  // Builds the de-duplication key for a sample value. bigint values are
  // converted first because JSON.stringify throws a TypeError on them.
  const sampleKey = (value: unknown): string => {
    const json = JSON.stringify(value, (_key, inner: unknown) =>
      typeof inner === 'bigint' ? `${inner.toString()}n` : inner);
    // JSON.stringify returns undefined for values it cannot represent.
    return json === undefined ? 'undefined' : json;
  };

  // Reduces a raw sample to a compact value for the summary.
  const simplifySample = (sample: unknown): unknown => {
    // typeof null is 'object'; keep null as null instead of '{...}'.
    if (sample === null) return null;
    if (sample instanceof ObjectId) return sample.toString();
    if (sample instanceof Date) {
      // toISOString throws a RangeError on an invalid date.
      return Number.isNaN(sample.getTime()) ? 'Invalid Date' : sample.toISOString();
    }
    // NOTE(review): converted so the summary stays JSON-serializable —
    // presumably the caller serializes it; confirm.
    if (typeof sample === 'bigint') return sample.toString();
    if (typeof sample === 'object') {
      // For objects/arrays, just indicate type rather than full structure
      return Array.isArray(sample) ? '[...]' : '{...}';
    }
    return sample;
  };

  // Field name -> accumulated information, in first-seen order.
  const fieldMap = new Map<string, FieldAccumulator>();

  for (const doc of documents) {
    for (const [key, value] of Object.entries(doc)) {
      const detectedType = detectMongoType(value);

      let field = fieldMap.get(key);
      if (field === undefined) {
        field = {
          name: key,
          types: new Set<string>(),
          nullable: false,
          samples: [],
          occurrences: 0,
          sampleKeys: new Set<string>(),
          nestedDocs: [],
        };
        fieldMap.set(key, field);
      }

      field.occurrences += 1;
      field.types.add(detectedType);

      // An explicit null/undefined value makes the field nullable.
      if (value === null || value === undefined) {
        field.nullable = true;
      }

      // Only genuine embedded documents feed the nested schema; dates,
      // ObjectIds and arrays in a mixed-type field are left out.
      if (detectedType === 'Document') {
        field.nestedDocs.push(value as Document);
      }

      // Store up to MAX_SAMPLES different sample values.
      if (field.samples.length < MAX_SAMPLES) {
        const dedupeKey = sampleKey(value);
        if (!field.sampleKeys.has(dedupeKey)) {
          field.sampleKeys.add(dedupeKey);
          field.samples.push(value);
        }
      }
    }
  }

  const totalDocuments = documents.length;

  // Convert the accumulators to summary objects.
  const fields = Array.from(fieldMap.values(), (field): FieldSummary => {
    const summary: FieldSummary = {
      name: field.name,
      types: Array.from(field.types),
      // Missing from at least one document also means nullable.
      nullable: field.nullable || field.occurrences < totalDocuments,
      prevalence: Math.round((field.occurrences / totalDocuments) * 100) + '%',
      examples: field.samples.map(simplifySample),
    };

    // Recursively infer schema for nested documents
    if (field.nestedDocs.length > 0) {
      summary.nestedSchema = inferSchemaFromSamples(field.nestedDocs);
    }

    return summary;
  });

  return { fields };
}
11177

12178
export async function handleReadResourceRequest({
13179
request,
@@ -25,28 +191,64 @@ export async function handleReadResourceRequest({
25191

26192
try {
27193
const collection = db.collection(collectionName);
28-
const sample = await collection.findOne({});
194+
195+
// Set sample size for schema inference
196+
const sampleSize = 100;
197+
let sampleDocuments: Document[] = [];
198+
199+
try {
200+
// First try using MongoDB's $sample aggregation to get a diverse set of documents
201+
sampleDocuments = await collection
202+
.aggregate([{ $sample: { size: sampleSize } }])
203+
.toArray();
204+
} catch (sampleError) {
205+
// Fallback to sequential scan if $sample is not available
206+
console.warn(`$sample aggregation failed for ${collectionName}, falling back to sequential scan: ${sampleError}`);
207+
sampleDocuments = await collection
208+
.find({})
209+
.limit(sampleSize)
210+
.toArray();
211+
}
212+
213+
// Get indexes for the collection
29214
const indexes = await collection.indexes();
30-
31-
const schema = sample
32-
? {
33-
type: "collection",
34-
name: collectionName,
35-
fields: Object.entries(sample).map(([key, value]) => ({
36-
name: key,
37-
type: typeof value,
38-
})),
39-
indexes: indexes.map((idx: IndexDescriptionInfo) => ({
40-
name: idx.name,
41-
keys: idx.key,
42-
})),
43-
}
44-
: {
45-
type: "collection",
46-
name: collectionName,
47-
fields: [],
48-
indexes: [],
49-
};
215+
216+
// Infer schema from samples
217+
const inferredSchema = inferSchemaFromSamples(sampleDocuments);
218+
219+
// Get document count with timeout protection
220+
let documentCount: number | string | null = null;
221+
try {
222+
// Set a timeout for the count operation
223+
documentCount = await Promise.race([
224+
collection.countDocuments(),
225+
new Promise<never>((_, reject) =>
226+
setTimeout(() => reject(new Error('Count operation timed out')), 5000)
227+
)
228+
]);
229+
} catch (countError) {
230+
console.warn(`Count operation failed or timed out for ${collectionName}: ${countError}`);
231+
// Estimate count based on sample size and collection stats
232+
try {
233+
const stats = await db.command({ collStats: collectionName });
234+
documentCount = stats.count;
235+
} catch {
236+
documentCount = 'unknown (count operation timed out)';
237+
}
238+
}
239+
240+
const schema: CollectionSchema = {
241+
type: "collection",
242+
name: collectionName,
243+
fields: inferredSchema.fields,
244+
indexes: indexes.map((idx: IndexDescriptionInfo) => ({
245+
name: idx.name,
246+
keys: idx.key,
247+
})),
248+
documentCount: documentCount,
249+
sampleSize: sampleDocuments.length,
250+
lastUpdated: new Date().toISOString(),
251+
};
50252

51253
return {
52254
contents: [

0 commit comments

Comments
 (0)