@@ -1213,10 +1213,19 @@ exports.getRag = async (req, res) => {
12131213 // Ensure rag_input/ and rag_input/ingested/ exist
12141214 const inputDir = path . join ( __dirname , '../rag_input' ) ;
12151215 const ingestedDir = path . join ( inputDir , 'ingested' ) ;
1216- if ( ! fs . existsSync ( inputDir ) ) fs . mkdirSync ( inputDir , { recursive : true } ) ;
1217- if ( ! fs . existsSync ( ingestedDir ) ) fs . mkdirSync ( ingestedDir , { recursive : true } ) ;
12181216
1219- // List all files in MongoDB vector DB (by hash)
1217+ // Create directories if they don't exist
1218+ if ( ! fs . existsSync ( inputDir ) ) {
1219+ fs . mkdirSync ( inputDir , { recursive : true } ) ;
1220+ req . flash ( 'info' , {
1221+ msg : 'RAG input directory created. Please add PDF files to /rag_input/ directory for processing.' ,
1222+ } ) ;
1223+ }
1224+ if ( ! fs . existsSync ( ingestedDir ) ) {
1225+ fs . mkdirSync ( ingestedDir , { recursive : true } ) ;
1226+ }
1227+
1228+ // List all files in MongoDB vector DB
12201229 let ingestedFiles = [ ] ;
12211230 try {
12221231 const client = new MongoClient ( process . env . MONGODB_URI , { dbName : 'hackathonstarter_rag' } ) ;
@@ -1225,8 +1234,6 @@ exports.getRag = async (req, res) => {
12251234 const collection = db . collection ( 'rag_chunks' ) ;
12261235
12271236 ingestedFiles = await collection . distinct ( 'source' ) ;
1228-
1229- // Optional: Clean up the file paths to show just the filename
12301237 ingestedFiles = ingestedFiles . map ( ( filepath ) => path . basename ( filepath ) ) ;
12311238
12321239 await client . close ( ) ;
@@ -1238,12 +1245,9 @@ exports.getRag = async (req, res) => {
12381245 res . render ( 'api/rag' , {
12391246 title : 'Retrieval-Augmented Generation (RAG) Demo' ,
12401247 ingestedFiles,
1241- skipped : [ ] ,
1242- processed : [ ] ,
12431248 ragResponse : null ,
12441249 llmResponse : null ,
12451250 question : '' ,
1246- error : null ,
12471251 maxInputLength : 500 ,
12481252 } ) ;
12491253} ;
@@ -1255,10 +1259,17 @@ exports.getRag = async (req, res) => {
12551259exports . postRagIngest = async ( req , res ) => {
12561260 const inputDir = path . join ( __dirname , '../rag_input' ) ;
12571261 const ingestedDir = path . join ( inputDir , 'ingested' ) ;
1262+
1263+ // Ensure directories exist
12581264 if ( ! fs . existsSync ( inputDir ) ) fs . mkdirSync ( inputDir , { recursive : true } ) ;
12591265 if ( ! fs . existsSync ( ingestedDir ) ) fs . mkdirSync ( ingestedDir , { recursive : true } ) ;
12601266
1261- const files = fs . readdirSync ( inputDir ) . filter ( ( f ) => f . endsWith ( '.pdf' ) ) ;
1267+ // Get list of PDF files in input directory
1268+ const files = fs
1269+ . readdirSync ( inputDir )
1270+ . filter ( ( f ) => f . endsWith ( '.pdf' ) )
1271+ . filter ( ( f ) => ! f . includes ( 'ingested' ) ) ; // Skips any PDF whose file name contains 'ingested'; readdirSync here is non-recursive, so files inside ingested/ are never listed
1272+
12621273 const skipped = [ ] ;
12631274 const processed = [ ] ;
12641275
@@ -1268,75 +1279,79 @@ exports.postRagIngest = async (req, res) => {
12681279 const db = client . db ( 'hackathonstarter_rag' ) ;
12691280 const collection = db . collection ( 'rag_chunks' ) ;
12701281
1271- // Use Promise.all to avoid await in loop and continue
1272- await Promise . all (
1273- files . map ( async ( file ) => {
1274- const filePath = path . join ( inputDir , file ) ;
1275- const fileBuffer = fs . readFileSync ( filePath ) ;
1276- const hash = crypto . createHash ( 'sha256' ) . update ( fileBuffer ) . digest ( 'hex' ) ;
1277-
1278- // Check if hash exists
1279- const exists = await collection . findOne ( { fileHash : hash } ) ;
1280- if ( exists ) {
1281- skipped . push ( file ) ;
1282- return ;
1283- }
1282+ try {
1283+ // Process each file
1284+ await Promise . all (
1285+ files . map ( async ( file ) => {
1286+ const filePath = path . join ( inputDir , file ) ;
1287+ const fileBuffer = fs . readFileSync ( filePath ) ;
1288+ const hash = crypto . createHash ( 'sha256' ) . update ( fileBuffer ) . digest ( 'hex' ) ;
1289+
1290+ // Check if hash exists
1291+ const exists = await collection . findOne ( { fileHash : hash } ) ;
1292+ if ( exists ) {
1293+ skipped . push ( file ) ;
1294+ return ;
1295+ }
12841296
1285- // Extract text and chunk
1286- const loader = new PDFLoader ( filePath , {
1287- pdfjs : ( ) => Promise . resolve ( pdfjsLib ) ,
1297+ // Extract text and chunk
1298+ const loader = new PDFLoader ( filePath , {
1299+ pdfjs : ( ) => Promise . resolve ( pdfjsLib ) ,
1300+ } ) ;
1301+ const docs = await loader . load ( ) ;
1302+ const splitter = new RecursiveCharacterTextSplitter ( { chunkSize : 1000 , chunkOverlap : 200 } ) ;
1303+ const chunks = await splitter . splitDocuments ( docs ) ;
1304+
1305+ // Add hash and source to each chunk
1306+ const chunksWithMetadata = chunks . map ( ( chunk ) => ( {
1307+ ...chunk ,
1308+ fileHash : hash ,
1309+ source : filePath ,
1310+ } ) ) ;
1311+
1312+ // Embed and store
1313+ const embeddings = new HuggingFaceInferenceEmbeddings ( {
1314+ apiKey : process . env . HUGGINGFACE_KEY ,
1315+ model : process . env . HUGGINGFACE_EMBEDING_MODEL ,
1316+ } ) ;
1317+
1318+ await MongoDBAtlasVectorSearch . fromDocuments ( chunksWithMetadata , embeddings , {
1319+ collection,
1320+ indexName : 'default' ,
1321+ textKey : 'text' ,
1322+ embeddingKey : 'embedding' ,
1323+ } ) ;
1324+
1325+ // Move file to ingested directory
1326+ fs . renameSync ( filePath , path . join ( ingestedDir , file ) ) ;
1327+ processed . push ( file ) ;
1328+ } ) ,
1329+ ) ;
1330+
1331+ if ( processed . length > 0 ) {
1332+ req . flash ( 'success' , {
1333+ msg : `Successfully ingested ${ processed . length } new file(s): ${ processed . join ( ', ' ) } ` ,
12881334 } ) ;
1289- const docs = await loader . load ( ) ;
1290- const splitter = new RecursiveCharacterTextSplitter ( { chunkSize : 1000 , chunkOverlap : 200 } ) ;
1291- const chunks = await splitter . splitDocuments ( docs ) ;
1292-
1293- // Embed and store
1294- const embeddings = new HuggingFaceInferenceEmbeddings ( {
1295- apiKey : process . env . HUGGINGFACE_KEY ,
1296- model : process . env . HUGGINGFACE_EMBEDING_MODEL ,
1335+ } else if ( skipped . length > 0 ) {
1336+ req . flash ( 'info' , {
1337+ msg : `No new files to ingest. ${ skipped . length } file(s) already processed: ${ skipped . join ( ', ' ) } ` ,
12971338 } ) ;
1298- // Import
1299- await MongoDBAtlasVectorSearch . fromDocuments ( chunks , embeddings , {
1300- collection,
1301- indexName : 'default' ,
1302- textKey : 'text' ,
1303- embeddingKey : 'embedding' ,
1304- extraMetadata : { fileName : file , fileHash : hash } ,
1339+ } else {
1340+ req . flash ( 'info' , {
1341+ msg : 'No PDF files found in the input directory. Add files to /rag_input/ to process.' ,
13051342 } ) ;
1343+ }
13061344
1307- // Move file to ingested
1308- fs . renameSync ( filePath , path . join ( ingestedDir , file ) ) ;
1309- processed . push ( file ) ;
1310- } ) ,
1311- ) ;
1312-
1313- await client . close ( ) ;
1314-
1315- // After ingestion, get updated file list
1316- let ingestedFiles = [ ] ;
1317- try {
1318- const client2 = new MongoClient ( process . env . MONGODB_URI , { dbName : 'hackathonstarter_rag' } ) ;
1319- await client2 . connect ( ) ;
1320- const db2 = client2 . db ( 'hackathonstarter_rag' ) ;
1321- const collection2 = db2 . collection ( 'rag_chunks' ) ;
1322- ingestedFiles = await collection2 . distinct ( 'fileName' ) ;
1323- await client2 . close ( ) ;
1345+ res . redirect ( '/api/rag' ) ;
13241346 } catch ( err ) {
1325- console . log ( err ) ;
1326- ingestedFiles = [ ] ;
1347+ console . error ( 'Error during ingestion:' , err ) ;
1348+ req . flash ( 'errors' , {
1349+ msg : `Error during ingestion: ${ err . message } ` ,
1350+ } ) ;
1351+ res . redirect ( '/api/rag' ) ;
1352+ } finally {
1353+ await client . close ( ) ;
13271354 }
1328-
1329- res . render ( 'api/rag' , {
1330- title : 'Retrieval-Augmented Generation (RAG) Demo' ,
1331- ingestedFiles,
1332- skipped,
1333- processed,
1334- ragResponse : null ,
1335- llmResponse : null ,
1336- question : '' ,
1337- error : null ,
1338- maxInputLength : 500 ,
1339- } ) ;
13401355} ;
13411356
13421357/**
@@ -1345,19 +1360,10 @@ exports.postRagIngest = async (req, res) => {
13451360 */
13461361exports . postRagAsk = async ( req , res ) => {
13471362 const question = ( req . body . question || '' ) . slice ( 0 , 500 ) ;
1348- const maxInputLength = 500 ;
1363+
13491364 if ( ! question . trim ( ) ) {
1350- return res . render ( 'api/rag' , {
1351- title : 'Retrieval-Augmented Generation (RAG) Demo' ,
1352- ingestedFiles : [ ] ,
1353- skipped : [ ] ,
1354- processed : [ ] ,
1355- ragResponse : null ,
1356- llmResponse : null ,
1357- question,
1358- error : 'Please enter a question.' ,
1359- maxInputLength,
1360- } ) ;
1365+ req . flash ( 'errors' , { msg : 'Please enter a question.' } ) ;
1366+ return res . redirect ( '/api/rag' ) ;
13611367 }
13621368
13631369 // Get list of ingested files for display
@@ -1367,19 +1373,10 @@ exports.postRagAsk = async (req, res) => {
13671373 await client . connect ( ) ;
13681374 const db = client . db ( 'hackathonstarter_rag' ) ;
13691375 const collection = db . collection ( 'rag_chunks' ) ;
1370- ingestedFiles = await collection . distinct ( 'fileName' ) ;
1371- await client . close ( ) ;
1372- } catch ( err ) {
1373- console . log ( err ) ;
1374- ingestedFiles = [ ] ;
1375- }
1376+ ingestedFiles = await collection . distinct ( 'source' ) ;
1377+ ingestedFiles = ingestedFiles . map ( ( filepath ) => path . basename ( filepath ) ) ;
13761378
1377- try {
13781379 // Setup vector store and embeddings
1379- const client = new MongoClient ( process . env . MONGODB_URI , { dbName : 'hackathonstarter_rag' } ) ;
1380- await client . connect ( ) ;
1381- const db = client . db ( 'hackathonstarter_rag' ) ;
1382- const collection = db . collection ( 'rag_chunks' ) ;
13831380 const embeddings = new HuggingFaceInferenceEmbeddings ( {
13841381 apiKey : process . env . HUGGINGFACE_KEY ,
13851382 model : process . env . HUGGINGFACE_EMBEDING_MODEL || 'BAAI/bge-large-en-v1.5' ,
@@ -1414,26 +1411,15 @@ exports.postRagAsk = async (req, res) => {
14141411 res . render ( 'api/rag' , {
14151412 title : 'Retrieval-Augmented Generation (RAG) Demo' ,
14161413 ingestedFiles,
1417- skipped : [ ] ,
1418- processed : [ ] ,
14191414 ragResponse : ragResponse . content ,
14201415 llmResponse : llmResponse . content ,
14211416 question,
1422- error : null ,
1423- maxInputLength,
1417+ maxInputLength : 500 ,
14241418 } ) ;
14251419 } catch ( error ) {
1426- res . render ( 'api/rag' , {
1427- title : 'Retrieval-Augmented Generation (RAG) Demo' ,
1428- ingestedFiles,
1429- skipped : [ ] ,
1430- processed : [ ] ,
1431- ragResponse : null ,
1432- llmResponse : null ,
1433- question,
1434- error : `Error: ${ error . message } ` ,
1435- maxInputLength,
1436- } ) ;
1420+ console . error ( 'RAG Error:' , error ) ;
1421+ req . flash ( 'errors' , { msg : `Error: ${ error . message } ` } ) ;
1422+ res . redirect ( '/api/rag' ) ;
14371423 }
14381424} ;
14391425
0 commit comments