Merged

Staging #1245

Changes from 21 commits
6 changes: 3 additions & 3 deletions .github/dependabot.yml
@@ -3,11 +3,11 @@ updates:
   - package-ecosystem: 'npm'
     directory: '/frontend'
     schedule:
-      interval: 'weekly'
+      interval: 'monthly'
     target-branch: 'dev'

   - package-ecosystem: 'pip'
     directory: '/backend'
     schedule:
-      interval: 'weekly'
-      target-branch: 'dev'
+      interval: 'monthly'
+      target-branch: 'dev'
3 changes: 2 additions & 1 deletion .gitignore
@@ -172,4 +172,5 @@ google-cloud-cli-linux-x86_64.tar.gz
 .vennv
 newenv
 files
-
+startupbackend.sh
+startupfrontend.sh
316 changes: 167 additions & 149 deletions README.md

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions backend/example.env
@@ -31,16 +31,19 @@ DEFAULT_DIFFBOT_CHAT_MODEL="openai_gpt_4o" #whichever model specified here , ne
 LLM_MODEL_CONFIG_openai_gpt_3.5="gpt-3.5-turbo-0125,openai_api_key"
 LLM_MODEL_CONFIG_openai_gpt_4o_mini="gpt-4o-mini-2024-07-18,openai_api_key"
 LLM_MODEL_CONFIG_openai_gpt_4o="gpt-4o-2024-11-20,openai_api_key"
-LLM_MODEL_CONFIG_openai-gpt-o3-mini="o3-mini-2025-01-31,openai_api_key"
+LLM_MODEL_CONFIG_openai_gpt_4.1_mini="gpt-4.1-mini,openai_api_key"
+LLM_MODEL_CONFIG_openai_gpt_4.1="gpt-4.1,openai_api_key"
+LLM_MODEL_CONFIG_openai_gpt_o3_mini="o3-mini-2025-01-31,openai_api_key"
 LLM_MODEL_CONFIG_gemini_1.5_pro="gemini-1.5-pro-002"
 LLM_MODEL_CONFIG_gemini_1.5_flash="gemini-1.5-flash-002"
 LLM_MODEL_CONFIG_gemini_2.0_flash="gemini-2.0-flash-001"
+LLM_MODEL_CONFIG_gemini_2.5_pro="gemini-2.5-pro-exp-03-25"
 LLM_MODEL_CONFIG_diffbot="diffbot,diffbot_api_key"
 LLM_MODEL_CONFIG_azure_ai_gpt_35="azure_deployment_name,azure_endpoint or base_url,azure_api_key,api_version"
 LLM_MODEL_CONFIG_azure_ai_gpt_4o="gpt-4o,https://YOUR-ENDPOINT.openai.azure.com/,azure_api_key,api_version"
 LLM_MODEL_CONFIG_groq_llama3_70b="model_name,base_url,groq_api_key"
 LLM_MODEL_CONFIG_anthropic_claude_3_5_sonnet="model_name,anthropic_api_key"
-LLM_MODEL_CONFIG_fireworks_llama_v3_70b="model_name,fireworks_api_key"
+LLM_MODEL_CONFIG_fireworks_llama4_maverick="model_name,fireworks_api_key"
 LLM_MODEL_CONFIG_bedrock_claude_3_5_sonnet="model_name,aws_access_key_id,aws_secret__access_key,region_name"
 LLM_MODEL_CONFIG_ollama_llama3="model_name,model_local_url"
 YOUTUBE_TRANSCRIPT_PROXY="https://user:pass@domain:port"
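
Each LLM_MODEL_CONFIG_* entry packs a model identifier plus its credentials into one comma-separated string. A minimal sketch of how such an entry can be unpacked — the helper name and the simple split-on-comma behavior are assumptions for illustration, not the project's verbatim parsing code:

    import os

    def parse_llm_model_config(env_key):
        # e.g. LLM_MODEL_CONFIG_openai_gpt_4o="gpt-4o-2024-11-20,openai_api_key"
        raw = os.environ.get(env_key, "")
        parts = [p.strip() for p in raw.split(",")]
        # First field is the model name; remaining fields are credentials,
        # endpoints, or API versions (the azure entries carry four fields).
        return parts[0], parts[1:]

    os.environ["LLM_MODEL_CONFIG_openai_gpt_4o"] = "gpt-4o-2024-11-20,sk-example"
    print(parse_llm_model_config("LLM_MODEL_CONFIG_openai_gpt_4o"))
    # -> ('gpt-4o-2024-11-20', ['sk-example'])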
87 changes: 45 additions & 42 deletions backend/requirements.txt
@@ -1,61 +1,64 @@
+accelerate==1.6.0
 asyncio==3.4.3
-boto3==1.36.2
-botocore==1.36.2
-certifi==2024.8.30
-fastapi==0.115.6
+boto3==1.37.29
+botocore==1.37.29
+certifi==2025.1.31
+fastapi==0.115.11
 fastapi-health==0.4.0
-google-api-core==2.24.0
-google-auth==2.37.0
+google-api-core==2.24.2
+google-auth==2.38.0
 google_auth_oauthlib==1.2.1
-google-cloud-core==2.4.1
-json-repair==0.30.2
+google-cloud-core==2.4.3
+json-repair==0.39.1
 pip-install==1.3.5
-langchain==0.3.15
-langchain-aws==0.2.11
-langchain-anthropic==0.3.3
-langchain-fireworks==0.2.6
-langchain-community==0.3.15
-langchain-core==0.3.31
+langchain==0.3.23
+langchain-aws==0.2.18
+langchain-anthropic==0.3.9
+langchain-fireworks==0.2.9
+langchain-community==0.3.19
+langchain-core==0.3.51
 langchain-experimental==0.3.4
-langchain-google-vertexai==2.0.11
-langchain-groq==0.2.3
-langchain-openai==0.3.1
-langchain-text-splitters==0.3.5
+langchain-google-vertexai==2.0.19
+langchain-groq==0.2.5
+langchain-openai==0.3.12
+langchain-text-splitters==0.3.8
 langchain-huggingface==0.1.2
 langdetect==1.0.9
-langsmith==0.2.11
+langsmith==0.3.26
 langserve==0.3.1
 neo4j-rust-ext
 nltk==3.9.1
-openai==1.59.9
-opencv-python==4.10.0.84
-psutil==6.1.0
-pydantic==2.9.2
+openai==1.71.0
+opencv-python==4.11.0.86
+psutil==7.0.0
+pydantic==2.10.6
 python-dotenv==1.0.1
 python-magic==0.4.27
 PyPDF2==3.0.1
-PyMuPDF==1.24.14
-starlette==0.41.3
-sse-starlette==2.1.3
+PyMuPDF==1.25.5
+starlette==0.46.1
+sse-starlette==2.2.1
 starlette-session==0.4.3
 tqdm==4.67.1
 unstructured[all-docs]
-unstructured==0.16.11
-unstructured-client==0.28.1
-unstructured-inference==0.8.1
-urllib3==2.2.2
-uvicorn==0.32.1
+unstructured==0.17.2
+unstructured-client==0.32.3
+unstructured-inference==0.8.10
+urllib3==2.3.0
+uvicorn==0.34.0
 gunicorn==23.0.0
 wikipedia==1.4.0
-wrapt==1.16.0
-yarl==1.9.4
-youtube-transcript-api==0.6.3
-zipp==3.17.0
-sentence-transformers==3.3.1
-google-cloud-logging==3.11.3
-pypandoc==1.13
-graphdatascience==1.12
-Secweb==1.11.0
-ragas==0.2.11
+wrapt==1.17.2
+yarl==1.18.3
+youtube-transcript-api==1.0.3
+zipp==3.21.0
+sentence-transformers==4.0.2
+google-cloud-logging==3.11.4
+pypandoc==1.15
+graphdatascience==1.14
+Secweb==1.18.1
+ragas==0.2.14
 rouge_score==0.1.2
-langchain-neo4j==0.3.0
+langchain-neo4j==0.4.0
+pypandoc-binary==1.15
+chardet==5.2.0
55 changes: 43 additions & 12 deletions backend/score.py
@@ -41,6 +41,27 @@
 CHUNK_DIR = os.path.join(os.path.dirname(__file__), "chunks")
 MERGED_DIR = os.path.join(os.path.dirname(__file__), "merged_files")

+def sanitize_filename(filename):
+    """
+    Sanitize the user-provided filename to prevent directory traversal and remove unsafe characters.
+    """
+    # Remove path separators and collapse redundant separators
+    filename = os.path.basename(filename)
+    filename = os.path.normpath(filename)
+    return filename
+
+def validate_file_path(directory, filename):
+    """
+    Construct the full file path and ensure it is within the specified directory.
+    """
+    file_path = os.path.join(directory, filename)
+    abs_directory = os.path.abspath(directory)
+    abs_file_path = os.path.abspath(file_path)
+    # Ensure the file path starts with the intended directory path
+    if not abs_file_path.startswith(abs_directory):
+        raise ValueError("Invalid file path")
+    return abs_file_path
+
 def healthy_condition():
     output = {"healthy": True}
     return output
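
The two new helpers are meant to compose: sanitize first, then confirm containment. A self-contained sketch of the flow (MERGED_DIR here is an illustrative stand-in for the module constant):

    import os
    import tempfile

    MERGED_DIR = os.path.join(tempfile.gettempdir(), "merged_files")

    def sanitize_filename(filename):
        filename = os.path.basename(filename)   # drop any directory components
        return os.path.normpath(filename)

    def validate_file_path(directory, filename):
        abs_directory = os.path.abspath(directory)
        abs_file_path = os.path.abspath(os.path.join(directory, filename))
        if not abs_file_path.startswith(abs_directory):
            raise ValueError("Invalid file path")
        return abs_file_path

    print(validate_file_path(MERGED_DIR, sanitize_filename("../../etc/passwd")))
    # -> <tmp>/merged_files/passwd — the traversal collapses to a plain basename

One caveat worth noting: a bare startswith check treats a sibling such as /tmp/merged_files_backup as inside /tmp/merged_files; comparing against abs_directory + os.sep (or using os.path.commonpath) is the usual hardening.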
@@ -217,8 +238,9 @@ async def extract_knowledge_graph_from_file(
         start_time = time.time()
         graph = create_graph_database_connection(uri, userName, password, database)
         graphDb_data_Access = graphDBdataAccess(graph)
-        merged_file_path = os.path.join(MERGED_DIR,file_name)
         if source_type == 'local file':
+            file_name = sanitize_filename(file_name)
+            merged_file_path = validate_file_path(MERGED_DIR, file_name)
             uri_latency, result = await extract_graph_from_file_local_file(uri, userName, password, database, model, merged_file_path, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)

         elif source_type == 's3 bucket' and source_url:
@@ -278,8 +300,11 @@ async def extract_knowledge_graph_from_file(
         return create_api_response('Success', data=result, file_source= source_type)
     except LLMGraphBuilderException as e:
         error_message = str(e)
+        graph = create_graph_database_connection(uri, userName, password, database)
+        graphDb_data_Access = graphDBdataAccess(graph)
         graphDb_data_Access.update_exception_db(file_name,error_message, retry_condition)
-        failed_file_process(uri,file_name, merged_file_path, source_type)
+        if source_type == 'local file':
+            failed_file_process(uri,file_name, merged_file_path)
         node_detail = graphDb_data_Access.get_current_status_document_node(file_name)
         # Set the status "Completed" in logging because these errors are already handled by the application as custom errors.
         json_obj = {'api_name':'extract','message':error_message,'file_created_at':formatted_time(node_detail[0]['created_time']),'error_message':error_message, 'file_name': file_name,'status':'Completed',
@@ -290,8 +315,11 @@ async def extract_knowledge_graph_from_file(
     except Exception as e:
         message=f"Failed To Process File:{file_name} or LLM Unable To Parse Content "
         error_message = str(e)
+        graph = create_graph_database_connection(uri, userName, password, database)
+        graphDb_data_Access = graphDBdataAccess(graph)
         graphDb_data_Access.update_exception_db(file_name,error_message, retry_condition)
-        failed_file_process(uri,file_name, merged_file_path, source_type)
+        if source_type == 'local file':
+            failed_file_process(uri,file_name, merged_file_path)
         node_detail = graphDb_data_Access.get_current_status_document_node(file_name)

         json_obj = {'api_name':'extract','message':message,'file_created_at':formatted_time(node_detail[0]['created_time']),'error_message':error_message, 'file_name': file_name,'status':'Failed',
@@ -548,16 +576,20 @@ async def upload_large_file_into_chunks(file:UploadFile = File(...), chunkNumber
         result = await asyncio.to_thread(upload_file, graph, model, file, chunkNumber, totalChunks, originalname, uri, CHUNK_DIR, MERGED_DIR)
         end = time.time()
         elapsed_time = end - start
-        json_obj = {'api_name':'upload','db_url':uri,'userName':userName, 'database':database, 'chunkNumber':chunkNumber,'totalChunks':totalChunks,
-            'original_file_name':originalname,'model':model, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}','email':email}
-        logger.log_struct(json_obj, "INFO")
-        if int(chunkNumber) == int(totalChunks):
+        if int(chunkNumber) == int(totalChunks):
+            json_obj = {'api_name':'upload','db_url':uri,'userName':userName, 'database':database, 'chunkNumber':chunkNumber,'totalChunks':totalChunks,
+                'original_file_name':originalname,'model':model, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}','email':email}
+            logger.log_struct(json_obj, "INFO")
             return create_api_response('Success',data=result, message='Source Node Created Successfully')
         else:
             return create_api_response('Success', message=result)
     except Exception as e:
-        message="Unable to upload large file into chunks. "
+        message="Unable to upload file in chunks"
        error_message = str(e)
+        graph = create_graph_database_connection(uri, userName, password, database)
+        graphDb_data_Access = graphDBdataAccess(graph)
+        graphDb_data_Access.update_exception_db(originalname,error_message)
         logging.info(message)
         logging.exception(f'Exception:{error_message}')
         return create_api_response('Failed', message=message + error_message[:100], error=error_message, file_name = originalname)
@@ -568,8 +600,7 @@ async def upload_large_file_into_chunks(file:UploadFile = File(...), chunkNumber
 async def get_structured_schema(uri=Form(None), userName=Form(None), password=Form(None), database=Form(None),email=Form(None)):
     try:
         start = time.time()
-        graph = create_graph_database_connection(uri, userName, password, database)
-        result = await asyncio.to_thread(get_labels_and_relationtypes, graph)
+        result = await asyncio.to_thread(get_labels_and_relationtypes, uri, userName, password, database)
         end = time.time()
         elapsed_time = end - start
         logging.info(f'Schema result from DB: {result}')
@@ -736,10 +767,10 @@ async def cancelled_job(uri=Form(None), userName=Form(None), password=Form(None)
     gc.collect()

 @app.post("/populate_graph_schema")
-async def populate_graph_schema(input_text=Form(None), model=Form(None), is_schema_description_checked=Form(None),email=Form(None)):
+async def populate_graph_schema(input_text=Form(None), model=Form(None), is_schema_description_checked=Form(None),is_local_storage=Form(None),email=Form(None)):
     try:
         start = time.time()
-        result = populate_graph_schema_from_text(input_text, model, is_schema_description_checked)
+        result = populate_graph_schema_from_text(input_text, model, is_schema_description_checked, is_local_storage)
         end = time.time()
         elapsed_time = end - start
         json_obj = {'api_name':'populate_graph_schema', 'model':model, 'is_schema_description_checked':is_schema_description_checked, 'input_text':input_text, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}','email':email}
@@ -866,7 +897,7 @@ async def retry_processing(uri=Form(None), userName=Form(None), password=Form(No
     try:
         start = time.time()
         graph = create_graph_database_connection(uri, userName, password, database)
-        chunks = graph.query(QUERY_TO_GET_CHUNKS, params={"filename":file_name})
+        chunks = execute_graph_query(graph,QUERY_TO_GET_CHUNKS,params={"filename":file_name})
         end = time.time()
         elapsed_time = end - start
         json_obj = {'api_name':'retry_processing', 'db_url':uri, 'userName':userName, 'database':database, 'file_name':file_name,'retry_condition':retry_condition,
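
Several call sites in this PR route direct graph.query calls through an execute_graph_query helper. The diff does not show the helper's body; one plausible shape is a thin wrapper that retries transient failures — a sketch under that assumption only:

    from neo4j.exceptions import TransientError

    def execute_graph_query(graph, query, params=None, max_retries=3):
        # Hypothetical implementation: retry the query a few times before
        # letting a transient driver error propagate.
        for attempt in range(1, max_retries + 1):
            try:
                return graph.query(query, params=params)
            except TransientError:
                if attempt == max_retries:
                    raise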
25 changes: 18 additions & 7 deletions backend/src/QA_integration.py
@@ -177,7 +177,7 @@ def get_rag_chain(llm, system_template=CHAT_SYSTEM_TEMPLATE):
         logging.error(f"Error creating RAG chain: {e}")
         raise

-def format_documents(documents, model):
+def format_documents(documents, model,chat_mode_settings):
     prompt_token_cutoff = 4
     for model_names, value in CHAT_TOKEN_CUT_OFF.items():
         if model in model_names:
@@ -197,9 +197,20 @@ def format_documents(documents, model):
         try:
             source = doc.metadata.get('source', "unknown")
             sources.add(source)
-
-            entities = doc.metadata['entities'] if 'entities'in doc.metadata.keys() else entities
-            global_communities = doc.metadata["communitydetails"] if 'communitydetails'in doc.metadata.keys() else global_communities
+            if 'entities' in doc.metadata:
+                if chat_mode_settings["mode"] == CHAT_ENTITY_VECTOR_MODE:
+                    entity_ids = [entry['entityids'] for entry in doc.metadata['entities'] if 'entityids' in entry]
+                    entities.setdefault('entityids', set()).update(entity_ids)
+                else:
+                    if 'entityids' in doc.metadata['entities']:
+                        entities.setdefault('entityids', set()).update(doc.metadata['entities']['entityids'])
+                    if 'relationshipids' in doc.metadata['entities']:
+                        entities.setdefault('relationshipids', set()).update(doc.metadata['entities']['relationshipids'])
+
+            if 'communitydetails' in doc.metadata:
+                existing_ids = {entry['id'] for entry in global_communities}
+                new_entries = [entry for entry in doc.metadata["communitydetails"] if entry['id'] not in existing_ids]
+                global_communities.extend(new_entries)

             formatted_doc = (
                 "Document start\n"
@@ -218,7 +229,7 @@ def process_documents(docs, question, messages, llm, model,chat_mode_settings):
     start_time = time.time()

     try:
-        formatted_docs, sources, entitydetails, communities = format_documents(docs, model)
+        formatted_docs, sources, entitydetails, communities = format_documents(docs, model,chat_mode_settings)

         rag_chain = get_rag_chain(llm=llm)

@@ -369,7 +380,7 @@ def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_
         retriever = neo_db.as_retriever(
             search_type="similarity_score_threshold",
             search_kwargs={
-                'k': search_k,
+                'top_k': search_k,
                 'effective_search_ratio': ef_ratio,
                 'score_threshold': score_threshold,
                 'filter': {'fileName': {'$in': document_names}}
@@ -379,7 +390,7 @@
     else:
         retriever = neo_db.as_retriever(
             search_type="similarity_score_threshold",
-            search_kwargs={'k': search_k,'effective_search_ratio': ef_ratio, 'score_threshold': score_threshold}
+            search_kwargs={'top_k': search_k,'effective_search_ratio': ef_ratio, 'score_threshold': score_threshold}
         )
         logging.info(f"Successfully created retriever with search_k={search_k}, score_threshold={score_threshold}")
         return retriever
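
For context, the shape of the call being changed. This fragment assumes neo_db is an already-constructed langchain Neo4jVector-style store; whether the store reads 'k' or 'top_k' out of search_kwargs depends on its similarity-search signature, which is exactly what this rename tracks:

    retriever = neo_db.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "top_k": 5,                   # number of chunks to return
            "effective_search_ratio": 2,  # oversample the index before filtering (per the diff)
            "score_threshold": 0.7,       # drop weak matches
        },
    )
    docs = retriever.invoke("What changed in the staging release?")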
2 changes: 1 addition & 1 deletion backend/src/chunkid_entities.py
@@ -190,7 +190,7 @@ def get_entities_from_chunkids(uri, username, password, database ,nodedetails,en
     elif mode == CHAT_ENTITY_VECTOR_MODE:

         if "entitydetails" in nodedetails and nodedetails["entitydetails"]:
-            entity_ids = [item["id"] for item in nodedetails["entitydetails"]]
+            entity_ids = [item for item in nodedetails["entitydetails"]["entityids"]]
             logging.info(f"chunkid_entities module: Starting for entity ids: {entity_ids}")
             result = process_entityids(driver, entity_ids)
             if "chunk_data" in result.keys():
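
This change implies the entitydetails payload moved from a list of node dicts to a dict holding an id list — matching the sets that format_documents now builds in QA_integration.py. With invented values:

    nodedetails_old = {"entitydetails": [{"id": "4:abc:1"}, {"id": "4:abc:2"}]}
    nodedetails_new = {"entitydetails": {"entityids": ["4:abc:1", "4:abc:2"]}}

    old_ids = [item["id"] for item in nodedetails_old["entitydetails"]]
    new_ids = [item for item in nodedetails_new["entitydetails"]["entityids"]]
    assert old_ids == new_ids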
10 changes: 5 additions & 5 deletions backend/src/document_sources/gcs_bucket.py
@@ -1,8 +1,7 @@
 import os
 import logging
 from google.cloud import storage
-from langchain_community.document_loaders import GCSFileLoader, GCSDirectoryLoader
-from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_community.document_loaders import GCSFileLoader
 from langchain_core.documents import Document
 from PyPDF2 import PdfReader
 import io
@@ -42,8 +41,9 @@ def get_gcs_bucket_files_info(gcs_project_id, gcs_bucket_name, gcs_bucket_folder
         logging.exception(f'Exception Stack trace: {error_message}')
         raise LLMGraphBuilderException(error_message)

-def load_pdf(file_path):
-    return PyMuPDFLoader(file_path)
+def gcs_loader_func(file_path):
+    loader, _ = load_document_content(file_path)
+    return loader

 def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token=None):
     nltk.download('punkt')
@@ -64,7 +64,7 @@ def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, g
         blob = bucket.blob(blob_name)

         if blob.exists():
-            loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=load_document_content)
+            loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=gcs_loader_func)
             pages = loader.load()
         else :
             raise LLMGraphBuilderException('File does not exist, Please re-upload the file and try again.')
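
GCSFileLoader expects loader_func to take a local file path and return a document loader. The project's load_document_content evidently returns a tuple, hence the small adapter introduced above. A sketch with a stand-in for that helper:

    from langchain_community.document_loaders import TextLoader

    def load_document_content(file_path):
        # Stand-in: the real helper picks a loader per file type and
        # apparently returns (loader, <something else>).
        return TextLoader(file_path), file_path

    def gcs_loader_func(file_path):
        loader, _ = load_document_content(file_path)  # unwrap before handing to GCSFileLoader
        return loader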