-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathingest-docs.ts
More file actions
165 lines (136 loc) Β· 5.87 KB
/
ingest-docs.ts
File metadata and controls
165 lines (136 loc) Β· 5.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env tsx
/**
* Document Ingestion Script
*
* Ingests all documentation from docs/ directory and GraphQL schema
* into ResilientDB for RAG (Retrieval-Augmented Generation).
*/
import { IngestionPipeline } from './src/rag/ingestion-pipeline';
import { DocumentLoader } from './src/rag/document-loader';
import { ResilientDBClient } from './src/resilientdb/client';
import { ResilientDBHTTPWrapper } from './src/resilientdb/http-wrapper';
import * as path from 'path';
async function ingestDocuments() {
console.log('π Document Ingestion for RAG');
console.log('================================\n');
// Start HTTP wrapper for ResilientDB on port 18001 (18000 is used by Docker)
console.log('π§ Starting ResilientDB HTTP wrapper on port 18001...\n');
const httpWrapper = new ResilientDBHTTPWrapper(18001);
try {
await httpWrapper.start();
console.log('β
ResilientDB HTTP API available at http://localhost:18001\n');
// Update environment variable for this session
process.env.RESILIENTDB_GRAPHQL_URL = 'http://localhost:18001/graphql';
} catch (error) {
console.log('β οΈ HTTP wrapper not started:', (error as Error).message);
console.log('β οΈ Will try to use existing server\n');
}
const pipeline = new IngestionPipeline();
const loader = new DocumentLoader();
const client = new ResilientDBClient();
// Step 1: Check availability
console.log('π Checking service availability...\n');
const availability = await pipeline.checkAvailability();
console.log(`Embedding Service: ${availability.embeddingService ? 'β
Available' : 'β Unavailable'}`);
console.log(`Vector Store: ${availability.vectorStore ? 'β
Available' : 'β Unavailable'}`);
if (!availability.embeddingService || !availability.vectorStore) {
console.log(`\nβ οΈ Warning: ${availability.message}`);
console.log('Some services are unavailable. Ingestion may be limited.\n');
}
// Step 2: Load documents from docs/ directory
console.log('π Loading documents from docs/ directory...\n');
const docsPath = path.join(process.cwd(), 'docs');
let documents;
try {
documents = await loader.loadDirectory(docsPath, {
recursive: true,
extensions: ['.md', '.txt', '.json'],
exclude: ['node_modules', '.git', 'dist', 'build'],
});
console.log(`β
Loaded ${documents.length} documents\n`);
if (documents.length === 0) {
console.log('β οΈ No documents found in docs/ directory');
console.log('π‘ Add .md, .txt, or .json files to docs/ directory\n');
return;
}
} catch (error) {
console.error(`β Failed to load documents: ${error instanceof Error ? error.message : String(error)}\n`);
return;
}
// Step 3: Load GraphQL schema
console.log('π Loading GraphQL schema via introspection...\n');
let schemaDocument;
try {
const schema = await client.introspectSchema();
// Convert schema to readable text
const schemaText = `# GraphQL Schema
## Queries
${schema.queries.map(q => `- ${q.name}: ${q.type || 'unknown'}`).join('\n')}
## Mutations
${schema.mutations.map(m => `- ${m.name}: ${m.type || 'unknown'}`).join('\n')}
## Types
${schema.types.map(t => `- ${t.name}: ${t.kind || 'unknown'}`).join('\n')}
`;
schemaDocument = await loader.loadSchema(schemaText, 'graphql_schema');
console.log('β
GraphQL schema loaded\n');
// Add schema to documents
documents.push(schemaDocument);
} catch (error) {
console.warn(`β οΈ Failed to load GraphQL schema: ${error instanceof Error ? error.message : String(error)}`);
console.log('Continuing with document ingestion only...\n');
}
// Step 4: Ingest documents
console.log('π Starting ingestion...\n');
console.log(`Total documents to process: ${documents.length}\n`);
let progressCount = 0;
const progress = await pipeline.ingestDocuments(documents, undefined, {
onProgress: (currentProgress) => {
const percentage = currentProgress.totalChunks > 0
? Math.round((currentProgress.processedChunks / currentProgress.totalChunks) * 100)
: 0;
// Only log every 10% or on significant milestones
const newCount = Math.floor(percentage / 10);
if (newCount > progressCount || currentProgress.processedChunks === currentProgress.totalChunks) {
console.log(
`π Progress: ${currentProgress.processedChunks}/${currentProgress.totalChunks} chunks ` +
`(${percentage}%) | ` +
`Embedded: ${currentProgress.embeddedChunks} | ` +
`Stored: ${currentProgress.storedChunks}`
);
progressCount = newCount;
}
},
});
// Step 5: Results
console.log('\n' + '='.repeat(50));
console.log('π Ingestion Results');
console.log('='.repeat(50));
console.log(`Documents processed: ${progress.processedDocuments}/${progress.totalDocuments}`);
console.log(`Chunks created: ${progress.totalChunks}`);
console.log(`Chunks embedded: ${progress.embeddedChunks}`);
console.log(`Chunks stored: ${progress.storedChunks}`);
console.log(`Errors: ${progress.errors.length}`);
if (progress.errors.length > 0) {
console.log('\nβ οΈ Errors encountered:');
progress.errors.forEach((error, index) => {
console.log(` ${index + 1}. ${error.documentId}: ${error.error}`);
});
}
if (progress.storedChunks > 0) {
console.log('\nβ
Ingestion completed successfully!');
console.log(`π¦ ${progress.storedChunks} document chunks are now available for RAG`);
console.log('\nπ‘ You can now use the RAG system for query explanations and optimizations!');
} else {
console.log('\nβ οΈ No chunks were stored. Check errors above.');
}
console.log('\n');
}
// Run ingestion only when this file is executed directly (not when imported).
if (require.main === module) {
  ingestDocuments().catch((error) => {
    console.error('\n❌ Fatal error during ingestion:', error);
    process.exit(1);
  });
}

export { ingestDocuments };