Skip to content

Commit 1c70e01

Browse files
committed
Rag v2
1 parent 6c8a4d0 commit 1c70e01

File tree

2 files changed

+166
-152
lines changed

2 files changed

+166
-152
lines changed

controllers/api.js

Lines changed: 95 additions & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -1213,10 +1213,19 @@ exports.getRag = async (req, res) => {
12131213
// Ensure rag_input/ and rag_input/ingested/ exist
12141214
const inputDir = path.join(__dirname, '../rag_input');
12151215
const ingestedDir = path.join(inputDir, 'ingested');
1216-
if (!fs.existsSync(inputDir)) fs.mkdirSync(inputDir, { recursive: true });
1217-
if (!fs.existsSync(ingestedDir)) fs.mkdirSync(ingestedDir, { recursive: true });
12181216

1219-
// List all files in MongoDB vector DB (by hash)
1217+
// Create directories if they don't exist
1218+
if (!fs.existsSync(inputDir)) {
1219+
fs.mkdirSync(inputDir, { recursive: true });
1220+
req.flash('info', {
1221+
msg: 'RAG input directory created. Please add PDF files to /rag_input/ directory for processing.',
1222+
});
1223+
}
1224+
if (!fs.existsSync(ingestedDir)) {
1225+
fs.mkdirSync(ingestedDir, { recursive: true });
1226+
}
1227+
1228+
// List all files in MongoDB vector DB
12201229
let ingestedFiles = [];
12211230
try {
12221231
const client = new MongoClient(process.env.MONGODB_URI, { dbName: 'hackathonstarter_rag' });
@@ -1225,8 +1234,6 @@ exports.getRag = async (req, res) => {
12251234
const collection = db.collection('rag_chunks');
12261235

12271236
ingestedFiles = await collection.distinct('source');
1228-
1229-
// Optional: Clean up the file paths to show just the filename
12301237
ingestedFiles = ingestedFiles.map((filepath) => path.basename(filepath));
12311238

12321239
await client.close();
@@ -1238,12 +1245,9 @@ exports.getRag = async (req, res) => {
12381245
res.render('api/rag', {
12391246
title: 'Retrieval-Augmented Generation (RAG) Demo',
12401247
ingestedFiles,
1241-
skipped: [],
1242-
processed: [],
12431248
ragResponse: null,
12441249
llmResponse: null,
12451250
question: '',
1246-
error: null,
12471251
maxInputLength: 500,
12481252
});
12491253
};
@@ -1255,10 +1259,17 @@ exports.getRag = async (req, res) => {
12551259
exports.postRagIngest = async (req, res) => {
12561260
const inputDir = path.join(__dirname, '../rag_input');
12571261
const ingestedDir = path.join(inputDir, 'ingested');
1262+
1263+
// Ensure directories exist
12581264
if (!fs.existsSync(inputDir)) fs.mkdirSync(inputDir, { recursive: true });
12591265
if (!fs.existsSync(ingestedDir)) fs.mkdirSync(ingestedDir, { recursive: true });
12601266

1261-
const files = fs.readdirSync(inputDir).filter((f) => f.endsWith('.pdf'));
1267+
// Get list of PDF files in input directory
1268+
const files = fs
1269+
.readdirSync(inputDir)
1270+
.filter((f) => f.endsWith('.pdf'))
1271+
.filter((f) => !f.includes('ingested')); // Exclude anything from ingested directory
1272+
12621273
const skipped = [];
12631274
const processed = [];
12641275

@@ -1268,75 +1279,79 @@ exports.postRagIngest = async (req, res) => {
12681279
const db = client.db('hackathonstarter_rag');
12691280
const collection = db.collection('rag_chunks');
12701281

1271-
// Use Promise.all to avoid await in loop and continue
1272-
await Promise.all(
1273-
files.map(async (file) => {
1274-
const filePath = path.join(inputDir, file);
1275-
const fileBuffer = fs.readFileSync(filePath);
1276-
const hash = crypto.createHash('sha256').update(fileBuffer).digest('hex');
1277-
1278-
// Check if hash exists
1279-
const exists = await collection.findOne({ fileHash: hash });
1280-
if (exists) {
1281-
skipped.push(file);
1282-
return;
1283-
}
1282+
try {
1283+
// Process each file
1284+
await Promise.all(
1285+
files.map(async (file) => {
1286+
const filePath = path.join(inputDir, file);
1287+
const fileBuffer = fs.readFileSync(filePath);
1288+
const hash = crypto.createHash('sha256').update(fileBuffer).digest('hex');
1289+
1290+
// Check if hash exists
1291+
const exists = await collection.findOne({ fileHash: hash });
1292+
if (exists) {
1293+
skipped.push(file);
1294+
return;
1295+
}
12841296

1285-
// Extract text and chunk
1286-
const loader = new PDFLoader(filePath, {
1287-
pdfjs: () => Promise.resolve(pdfjsLib),
1297+
// Extract text and chunk
1298+
const loader = new PDFLoader(filePath, {
1299+
pdfjs: () => Promise.resolve(pdfjsLib),
1300+
});
1301+
const docs = await loader.load();
1302+
const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000, chunkOverlap: 200 });
1303+
const chunks = await splitter.splitDocuments(docs);
1304+
1305+
// Add hash and source to each chunk
1306+
const chunksWithMetadata = chunks.map((chunk) => ({
1307+
...chunk,
1308+
fileHash: hash,
1309+
source: filePath,
1310+
}));
1311+
1312+
// Embed and store
1313+
const embeddings = new HuggingFaceInferenceEmbeddings({
1314+
apiKey: process.env.HUGGINGFACE_KEY,
1315+
model: process.env.HUGGINGFACE_EMBEDING_MODEL,
1316+
});
1317+
1318+
await MongoDBAtlasVectorSearch.fromDocuments(chunksWithMetadata, embeddings, {
1319+
collection,
1320+
indexName: 'default',
1321+
textKey: 'text',
1322+
embeddingKey: 'embedding',
1323+
});
1324+
1325+
// Move file to ingested directory
1326+
fs.renameSync(filePath, path.join(ingestedDir, file));
1327+
processed.push(file);
1328+
}),
1329+
);
1330+
1331+
if (processed.length > 0) {
1332+
req.flash('success', {
1333+
msg: `Successfully ingested ${processed.length} new file(s): ${processed.join(', ')}`,
12881334
});
1289-
const docs = await loader.load();
1290-
const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000, chunkOverlap: 200 });
1291-
const chunks = await splitter.splitDocuments(docs);
1292-
1293-
// Embed and store
1294-
const embeddings = new HuggingFaceInferenceEmbeddings({
1295-
apiKey: process.env.HUGGINGFACE_KEY,
1296-
model: process.env.HUGGINGFACE_EMBEDING_MODEL,
1335+
} else if (skipped.length > 0) {
1336+
req.flash('info', {
1337+
msg: `No new files to ingest. ${skipped.length} file(s) already processed: ${skipped.join(', ')}`,
12971338
});
1298-
// Import
1299-
await MongoDBAtlasVectorSearch.fromDocuments(chunks, embeddings, {
1300-
collection,
1301-
indexName: 'default',
1302-
textKey: 'text',
1303-
embeddingKey: 'embedding',
1304-
extraMetadata: { fileName: file, fileHash: hash },
1339+
} else {
1340+
req.flash('info', {
1341+
msg: 'No PDF files found in the input directory. Add files to /rag_input/ to process.',
13051342
});
1343+
}
13061344

1307-
// Move file to ingested
1308-
fs.renameSync(filePath, path.join(ingestedDir, file));
1309-
processed.push(file);
1310-
}),
1311-
);
1312-
1313-
await client.close();
1314-
1315-
// After ingestion, get updated file list
1316-
let ingestedFiles = [];
1317-
try {
1318-
const client2 = new MongoClient(process.env.MONGODB_URI, { dbName: 'hackathonstarter_rag' });
1319-
await client2.connect();
1320-
const db2 = client2.db('hackathonstarter_rag');
1321-
const collection2 = db2.collection('rag_chunks');
1322-
ingestedFiles = await collection2.distinct('fileName');
1323-
await client2.close();
1345+
res.redirect('/api/rag');
13241346
} catch (err) {
1325-
console.log(err);
1326-
ingestedFiles = [];
1347+
console.error('Error during ingestion:', err);
1348+
req.flash('errors', {
1349+
msg: `Error during ingestion: ${err.message}`,
1350+
});
1351+
res.redirect('/api/rag');
1352+
} finally {
1353+
await client.close();
13271354
}
1328-
1329-
res.render('api/rag', {
1330-
title: 'Retrieval-Augmented Generation (RAG) Demo',
1331-
ingestedFiles,
1332-
skipped,
1333-
processed,
1334-
ragResponse: null,
1335-
llmResponse: null,
1336-
question: '',
1337-
error: null,
1338-
maxInputLength: 500,
1339-
});
13401355
};
13411356

13421357
/**
@@ -1345,19 +1360,10 @@ exports.postRagIngest = async (req, res) => {
13451360
*/
13461361
exports.postRagAsk = async (req, res) => {
13471362
const question = (req.body.question || '').slice(0, 500);
1348-
const maxInputLength = 500;
1363+
13491364
if (!question.trim()) {
1350-
return res.render('api/rag', {
1351-
title: 'Retrieval-Augmented Generation (RAG) Demo',
1352-
ingestedFiles: [],
1353-
skipped: [],
1354-
processed: [],
1355-
ragResponse: null,
1356-
llmResponse: null,
1357-
question,
1358-
error: 'Please enter a question.',
1359-
maxInputLength,
1360-
});
1365+
req.flash('errors', { msg: 'Please enter a question.' });
1366+
return res.redirect('/api/rag');
13611367
}
13621368

13631369
// Get list of ingested files for display
@@ -1367,19 +1373,10 @@ exports.postRagAsk = async (req, res) => {
13671373
await client.connect();
13681374
const db = client.db('hackathonstarter_rag');
13691375
const collection = db.collection('rag_chunks');
1370-
ingestedFiles = await collection.distinct('fileName');
1371-
await client.close();
1372-
} catch (err) {
1373-
console.log(err);
1374-
ingestedFiles = [];
1375-
}
1376+
ingestedFiles = await collection.distinct('source');
1377+
ingestedFiles = ingestedFiles.map((filepath) => path.basename(filepath));
13761378

1377-
try {
13781379
// Setup vector store and embeddings
1379-
const client = new MongoClient(process.env.MONGODB_URI, { dbName: 'hackathonstarter_rag' });
1380-
await client.connect();
1381-
const db = client.db('hackathonstarter_rag');
1382-
const collection = db.collection('rag_chunks');
13831380
const embeddings = new HuggingFaceInferenceEmbeddings({
13841381
apiKey: process.env.HUGGINGFACE_KEY,
13851382
model: process.env.HUGGINGFACE_EMBEDING_MODEL || 'BAAI/bge-large-en-v1.5',
@@ -1414,26 +1411,15 @@ exports.postRagAsk = async (req, res) => {
14141411
res.render('api/rag', {
14151412
title: 'Retrieval-Augmented Generation (RAG) Demo',
14161413
ingestedFiles,
1417-
skipped: [],
1418-
processed: [],
14191414
ragResponse: ragResponse.content,
14201415
llmResponse: llmResponse.content,
14211416
question,
1422-
error: null,
1423-
maxInputLength,
1417+
maxInputLength: 500,
14241418
});
14251419
} catch (error) {
1426-
res.render('api/rag', {
1427-
title: 'Retrieval-Augmented Generation (RAG) Demo',
1428-
ingestedFiles,
1429-
skipped: [],
1430-
processed: [],
1431-
ragResponse: null,
1432-
llmResponse: null,
1433-
question,
1434-
error: `Error: ${error.message}`,
1435-
maxInputLength,
1436-
});
1420+
console.error('RAG Error:', error);
1421+
req.flash('errors', { msg: `Error: ${error.message}` });
1422+
res.redirect('/api/rag');
14371423
}
14381424
};
14391425

0 commit comments

Comments
 (0)