Merged

Staging #1245

Changes from 21 commits
6 changes: 3 additions & 3 deletions .github/dependabot.yml
@@ -3,11 +3,11 @@ updates:
   - package-ecosystem: 'npm'
     directory: '/frontend'
     schedule:
-      interval: 'weekly'
+      interval: 'monthly'
     target-branch: 'dev'

   - package-ecosystem: 'pip'
     directory: '/backend'
     schedule:
-      interval: 'weekly'
-      target-branch: 'dev'
+      interval: 'monthly'
+      target-branch: 'dev'
3 changes: 2 additions & 1 deletion .gitignore
@@ -172,4 +172,5 @@ google-cloud-cli-linux-x86_64.tar.gz
 .vennv
 newenv
 files
-
+startupbackend.sh
+startupfrontend.sh
316 changes: 167 additions & 149 deletions README.md

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions backend/example.env
@@ -31,16 +31,19 @@ DEFAULT_DIFFBOT_CHAT_MODEL="openai_gpt_4o" #whichever model specified here , ne
 LLM_MODEL_CONFIG_openai_gpt_3.5="gpt-3.5-turbo-0125,openai_api_key"
 LLM_MODEL_CONFIG_openai_gpt_4o_mini="gpt-4o-mini-2024-07-18,openai_api_key"
 LLM_MODEL_CONFIG_openai_gpt_4o="gpt-4o-2024-11-20,openai_api_key"
-LLM_MODEL_CONFIG_openai-gpt-o3-mini="o3-mini-2025-01-31,openai_api_key"
+LLM_MODEL_CONFIG_openai_gpt_4.1_mini="gpt-4.1-mini,openai_api_key"
+LLM_MODEL_CONFIG_openai_gpt_4.1="gpt-4.1,openai_api_key"
+LLM_MODEL_CONFIG_openai_gpt_o3_mini="o3-mini-2025-01-31,openai_api_key"
 LLM_MODEL_CONFIG_gemini_1.5_pro="gemini-1.5-pro-002"
 LLM_MODEL_CONFIG_gemini_1.5_flash="gemini-1.5-flash-002"
 LLM_MODEL_CONFIG_gemini_2.0_flash="gemini-2.0-flash-001"
+LLM_MODEL_CONFIG_gemini_2.5_pro="gemini-2.5-pro-exp-03-25"
 LLM_MODEL_CONFIG_diffbot="diffbot,diffbot_api_key"
 LLM_MODEL_CONFIG_azure_ai_gpt_35="azure_deployment_name,azure_endpoint or base_url,azure_api_key,api_version"
 LLM_MODEL_CONFIG_azure_ai_gpt_4o="gpt-4o,https://YOUR-ENDPOINT.openai.azure.com/,azure_api_key,api_version"
 LLM_MODEL_CONFIG_groq_llama3_70b="model_name,base_url,groq_api_key"
 LLM_MODEL_CONFIG_anthropic_claude_3_5_sonnet="model_name,anthropic_api_key"
-LLM_MODEL_CONFIG_fireworks_llama_v3_70b="model_name,fireworks_api_key"
+LLM_MODEL_CONFIG_fireworks_llama4_maverick="model_name,fireworks_api_key"
 LLM_MODEL_CONFIG_bedrock_claude_3_5_sonnet="model_name,aws_access_key_id,aws_secret__access_key,region_name"
 LLM_MODEL_CONFIG_ollama_llama3="model_name,model_local_url"
 YOUTUBE_TRANSCRIPT_PROXY="https://user:pass@domain:port"
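
Each LLM_MODEL_CONFIG_* entry packs a model identifier plus its credentials into one comma-separated string. A minimal sketch of how such an entry can be unpacked — the helper name and the simple split-on-comma behavior are assumptions for illustration, not the project's verbatim parsing code:

    import os

    def parse_llm_model_config(env_key):
        # e.g. LLM_MODEL_CONFIG_openai_gpt_4o="gpt-4o-2024-11-20,openai_api_key"
        raw = os.environ.get(env_key, "")
        parts = [p.strip() for p in raw.split(",")]
        # First field is the model name; remaining fields are credentials,
        # endpoints, or API versions (the azure entries carry four fields).
        return parts[0], parts[1:]

    os.environ["LLM_MODEL_CONFIG_openai_gpt_4o"] = "gpt-4o-2024-11-20,sk-example"
    print(parse_llm_model_config("LLM_MODEL_CONFIG_openai_gpt_4o"))
    # -> ('gpt-4o-2024-11-20', ['sk-example'])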
87 changes: 45 additions & 42 deletions backend/requirements.txt
@@ -1,61 +1,64 @@
+accelerate==1.6.0
 asyncio==3.4.3
-boto3==1.36.2
-botocore==1.36.2
-certifi==2024.8.30
-fastapi==0.115.6
+boto3==1.37.29
+botocore==1.37.29
+certifi==2025.1.31
+fastapi==0.115.11
 fastapi-health==0.4.0
-google-api-core==2.24.0
-google-auth==2.37.0
+google-api-core==2.24.2
+google-auth==2.38.0
 google_auth_oauthlib==1.2.1
-google-cloud-core==2.4.1
-json-repair==0.30.2
+google-cloud-core==2.4.3
+json-repair==0.39.1
 pip-install==1.3.5
-langchain==0.3.15
-langchain-aws==0.2.11
-langchain-anthropic==0.3.3
-langchain-fireworks==0.2.6
-langchain-community==0.3.15
-langchain-core==0.3.31
+langchain==0.3.23
+langchain-aws==0.2.18
+langchain-anthropic==0.3.9
+langchain-fireworks==0.2.9
+langchain-community==0.3.19
+langchain-core==0.3.51
 langchain-experimental==0.3.4
-langchain-google-vertexai==2.0.11
-langchain-groq==0.2.3
-langchain-openai==0.3.1
-langchain-text-splitters==0.3.5
+langchain-google-vertexai==2.0.19
+langchain-groq==0.2.5
+langchain-openai==0.3.12
+langchain-text-splitters==0.3.8
 langchain-huggingface==0.1.2
 langdetect==1.0.9
-langsmith==0.2.11
+langsmith==0.3.26
 langserve==0.3.1
 neo4j-rust-ext
 nltk==3.9.1
-openai==1.59.9
-opencv-python==4.10.0.84
-psutil==6.1.0
-pydantic==2.9.2
+openai==1.71.0
+opencv-python==4.11.0.86
+psutil==7.0.0
+pydantic==2.10.6
 python-dotenv==1.0.1
 python-magic==0.4.27
 PyPDF2==3.0.1
-PyMuPDF==1.24.14
-starlette==0.41.3
-sse-starlette==2.1.3
+PyMuPDF==1.25.5
+starlette==0.46.1
+sse-starlette==2.2.1
 starlette-session==0.4.3
 tqdm==4.67.1
 unstructured[all-docs]
-unstructured==0.16.11
-unstructured-client==0.28.1
-unstructured-inference==0.8.1
-urllib3==2.2.2
-uvicorn==0.32.1
+unstructured==0.17.2
+unstructured-client==0.32.3
+unstructured-inference==0.8.10
+urllib3==2.3.0
+uvicorn==0.34.0
 gunicorn==23.0.0
 wikipedia==1.4.0
-wrapt==1.16.0
-yarl==1.9.4
-youtube-transcript-api==0.6.3
-zipp==3.17.0
-sentence-transformers==3.3.1
-google-cloud-logging==3.11.3
-pypandoc==1.13
-graphdatascience==1.12
-Secweb==1.11.0
-ragas==0.2.11
+wrapt==1.17.2
+yarl==1.18.3
+youtube-transcript-api==1.0.3
+zipp==3.21.0
+sentence-transformers==4.0.2
+google-cloud-logging==3.11.4
+pypandoc==1.15
+graphdatascience==1.14
+Secweb==1.18.1
+ragas==0.2.14
 rouge_score==0.1.2
-langchain-neo4j==0.3.0
+langchain-neo4j==0.4.0
+pypandoc-binary==1.15
+chardet==5.2.0
55 changes: 43 additions & 12 deletions backend/score.py
@@ -41,6 +41,27 @@
 CHUNK_DIR = os.path.join(os.path.dirname(__file__), "chunks")
 MERGED_DIR = os.path.join(os.path.dirname(__file__), "merged_files")

+def sanitize_filename(filename):
+    """
+    Sanitize the user-provided filename to prevent directory traversal and remove unsafe characters.
+    """
+    # Remove path separators and collapse redundant separators
+    filename = os.path.basename(filename)
+    filename = os.path.normpath(filename)
+    return filename
+
+def validate_file_path(directory, filename):
+    """
+    Construct the full file path and ensure it is within the specified directory.
+    """
+    file_path = os.path.join(directory, filename)
+    abs_directory = os.path.abspath(directory)
+    abs_file_path = os.path.abspath(file_path)
+    # Ensure the file path starts with the intended directory path
+    if not abs_file_path.startswith(abs_directory):
+        raise ValueError("Invalid file path")
+    return abs_file_path
+
 def healthy_condition():
     output = {"healthy": True}
     return output
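
The two new helpers are meant to compose: sanitize first, then confirm containment. A self-contained sketch of the flow (MERGED_DIR here is an illustrative stand-in for the module constant):

    import os
    import tempfile

    MERGED_DIR = os.path.join(tempfile.gettempdir(), "merged_files")

    def sanitize_filename(filename):
        filename = os.path.basename(filename)   # drop any directory components
        return os.path.normpath(filename)

    def validate_file_path(directory, filename):
        abs_directory = os.path.abspath(directory)
        abs_file_path = os.path.abspath(os.path.join(directory, filename))
        if not abs_file_path.startswith(abs_directory):
            raise ValueError("Invalid file path")
        return abs_file_path

    print(validate_file_path(MERGED_DIR, sanitize_filename("../../etc/passwd")))
    # -> <tmp>/merged_files/passwd — the traversal collapses to a plain basename

One caveat worth noting: a bare startswith check treats a sibling such as /tmp/merged_files_backup as inside /tmp/merged_files; comparing against abs_directory + os.sep (or using os.path.commonpath) is the usual hardening.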
@@ -217,8 +238,9 @@ async def extract_knowledge_graph_from_file(
         start_time = time.time()
         graph = create_graph_database_connection(uri, userName, password, database)
         graphDb_data_Access = graphDBdataAccess(graph)
-        merged_file_path = os.path.join(MERGED_DIR,file_name)
         if source_type == 'local file':
+            file_name = sanitize_filename(file_name)
+            merged_file_path = validate_file_path(MERGED_DIR, file_name)
             uri_latency, result = await extract_graph_from_file_local_file(uri, userName, password, database, model, merged_file_path, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)

         elif source_type == 's3 bucket' and source_url:
@@ -278,8 +300,11 @@ async def extract_knowledge_graph_from_file(
         return create_api_response('Success', data=result, file_source= source_type)
     except LLMGraphBuilderException as e:
         error_message = str(e)
+        graph = create_graph_database_connection(uri, userName, password, database)
+        graphDb_data_Access = graphDBdataAccess(graph)
         graphDb_data_Access.update_exception_db(file_name,error_message, retry_condition)
-        failed_file_process(uri,file_name, merged_file_path, source_type)
+        if source_type == 'local file':
+            failed_file_process(uri,file_name, merged_file_path)
         node_detail = graphDb_data_Access.get_current_status_document_node(file_name)
         # Set the status "Completed" in logging because these errors are already handled by the application as custom errors.
         json_obj = {'api_name':'extract','message':error_message,'file_created_at':formatted_time(node_detail[0]['created_time']),'error_message':error_message, 'file_name': file_name,'status':'Completed',
@@ -290,8 +315,11 @@ async def extract_knowledge_graph_from_file(
     except Exception as e:
         message=f"Failed To Process File:{file_name} or LLM Unable To Parse Content "
         error_message = str(e)
+        graph = create_graph_database_connection(uri, userName, password, database)
+        graphDb_data_Access = graphDBdataAccess(graph)
         graphDb_data_Access.update_exception_db(file_name,error_message, retry_condition)
-        failed_file_process(uri,file_name, merged_file_path, source_type)
+        if source_type == 'local file':
+            failed_file_process(uri,file_name, merged_file_path)
         node_detail = graphDb_data_Access.get_current_status_document_node(file_name)

         json_obj = {'api_name':'extract','message':message,'file_created_at':formatted_time(node_detail[0]['created_time']),'error_message':error_message, 'file_name': file_name,'status':'Failed',
@@ -548,16 +576,20 @@ async def upload_large_file_into_chunks(file:UploadFile = File(...), chunkNumber
         result = await asyncio.to_thread(upload_file, graph, model, file, chunkNumber, totalChunks, originalname, uri, CHUNK_DIR, MERGED_DIR)
         end = time.time()
         elapsed_time = end - start
-        json_obj = {'api_name':'upload','db_url':uri,'userName':userName, 'database':database, 'chunkNumber':chunkNumber,'totalChunks':totalChunks,
-            'original_file_name':originalname,'model':model, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}','email':email}
-        logger.log_struct(json_obj, "INFO")
-        if int(chunkNumber) == int(totalChunks):
+        if int(chunkNumber) == int(totalChunks):
+            json_obj = {'api_name':'upload','db_url':uri,'userName':userName, 'database':database, 'chunkNumber':chunkNumber,'totalChunks':totalChunks,
+                'original_file_name':originalname,'model':model, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}','email':email}
+            logger.log_struct(json_obj, "INFO")
             return create_api_response('Success',data=result, message='Source Node Created Successfully')
         else:
             return create_api_response('Success', message=result)
     except Exception as e:
-        message="Unable to upload large file into chunks. "
+        message="Unable to upload file in chunks"
        error_message = str(e)
+        graph = create_graph_database_connection(uri, userName, password, database)
+        graphDb_data_Access = graphDBdataAccess(graph)
+        graphDb_data_Access.update_exception_db(originalname,error_message)
         logging.info(message)
         logging.exception(f'Exception:{error_message}')
         return create_api_response('Failed', message=message + error_message[:100], error=error_message, file_name = originalname)
@@ -568,8 +600,7 @@ async def upload_large_file_into_chunks(file:UploadFile = File(...), chunkNumber
 async def get_structured_schema(uri=Form(None), userName=Form(None), password=Form(None), database=Form(None),email=Form(None)):
     try:
         start = time.time()
-        graph = create_graph_database_connection(uri, userName, password, database)
-        result = await asyncio.to_thread(get_labels_and_relationtypes, graph)
+        result = await asyncio.to_thread(get_labels_and_relationtypes, uri, userName, password, database)
         end = time.time()
         elapsed_time = end - start
         logging.info(f'Schema result from DB: {result}')
@@ -736,10 +767,10 @@ async def cancelled_job(uri=Form(None), userName=Form(None), password=Form(None)
     gc.collect()

 @app.post("/populate_graph_schema")
-async def populate_graph_schema(input_text=Form(None), model=Form(None), is_schema_description_checked=Form(None),email=Form(None)):
+async def populate_graph_schema(input_text=Form(None), model=Form(None), is_schema_description_checked=Form(None),is_local_storage=Form(None),email=Form(None)):
     try:
         start = time.time()
-        result = populate_graph_schema_from_text(input_text, model, is_schema_description_checked)
+        result = populate_graph_schema_from_text(input_text, model, is_schema_description_checked, is_local_storage)
         end = time.time()
         elapsed_time = end - start
         json_obj = {'api_name':'populate_graph_schema', 'model':model, 'is_schema_description_checked':is_schema_description_checked, 'input_text':input_text, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}','email':email}
@@ -866,7 +897,7 @@ async def retry_processing(uri=Form(None), userName=Form(None), password=Form(No
     try:
         start = time.time()
         graph = create_graph_database_connection(uri, userName, password, database)
-        chunks = graph.query(QUERY_TO_GET_CHUNKS, params={"filename":file_name})
+        chunks = execute_graph_query(graph,QUERY_TO_GET_CHUNKS,params={"filename":file_name})
         end = time.time()
         elapsed_time = end - start
         json_obj = {'api_name':'retry_processing', 'db_url':uri, 'userName':userName, 'database':database, 'file_name':file_name,'retry_condition':retry_condition,
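
Several call sites in this PR route direct graph.query calls through an execute_graph_query helper. The diff does not show the helper's body; one plausible shape is a thin wrapper that retries transient failures — a sketch under that assumption only:

    from neo4j.exceptions import TransientError

    def execute_graph_query(graph, query, params=None, max_retries=3):
        # Hypothetical implementation: retry the query a few times before
        # letting a transient driver error propagate.
        for attempt in range(1, max_retries + 1):
            try:
                return graph.query(query, params=params)
            except TransientError:
                if attempt == max_retries:
                    raise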
25 changes: 18 additions & 7 deletions backend/src/QA_integration.py
@@ -177,7 +177,7 @@ def get_rag_chain(llm, system_template=CHAT_SYSTEM_TEMPLATE):
         logging.error(f"Error creating RAG chain: {e}")
         raise

-def format_documents(documents, model):
+def format_documents(documents, model,chat_mode_settings):
     prompt_token_cutoff = 4
     for model_names, value in CHAT_TOKEN_CUT_OFF.items():
         if model in model_names:
@@ -197,9 +197,20 @@ def format_documents(documents, model):
         try:
             source = doc.metadata.get('source', "unknown")
             sources.add(source)
-
-            entities = doc.metadata['entities'] if 'entities'in doc.metadata.keys() else entities
-            global_communities = doc.metadata["communitydetails"] if 'communitydetails'in doc.metadata.keys() else global_communities
+            if 'entities' in doc.metadata:
+                if chat_mode_settings["mode"] == CHAT_ENTITY_VECTOR_MODE:
+                    entity_ids = [entry['entityids'] for entry in doc.metadata['entities'] if 'entityids' in entry]
+                    entities.setdefault('entityids', set()).update(entity_ids)
+                else:
+                    if 'entityids' in doc.metadata['entities']:
+                        entities.setdefault('entityids', set()).update(doc.metadata['entities']['entityids'])
+                    if 'relationshipids' in doc.metadata['entities']:
+                        entities.setdefault('relationshipids', set()).update(doc.metadata['entities']['relationshipids'])
+
+            if 'communitydetails' in doc.metadata:
+                existing_ids = {entry['id'] for entry in global_communities}
+                new_entries = [entry for entry in doc.metadata["communitydetails"] if entry['id'] not in existing_ids]
+                global_communities.extend(new_entries)

             formatted_doc = (
                 "Document start\n"
@@ -218,7 +229,7 @@ def process_documents(docs, question, messages, llm, model,chat_mode_settings):
     start_time = time.time()

     try:
-        formatted_docs, sources, entitydetails, communities = format_documents(docs, model)
+        formatted_docs, sources, entitydetails, communities = format_documents(docs, model,chat_mode_settings)

         rag_chain = get_rag_chain(llm=llm)

@@ -369,7 +380,7 @@ def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_
         retriever = neo_db.as_retriever(
             search_type="similarity_score_threshold",
             search_kwargs={
-                'k': search_k,
+                'top_k': search_k,
                 'effective_search_ratio': ef_ratio,
                 'score_threshold': score_threshold,
                 'filter': {'fileName': {'$in': document_names}}
@@ -379,7 +390,7 @@
     else:
         retriever = neo_db.as_retriever(
             search_type="similarity_score_threshold",
-            search_kwargs={'k': search_k,'effective_search_ratio': ef_ratio, 'score_threshold': score_threshold}
+            search_kwargs={'top_k': search_k,'effective_search_ratio': ef_ratio, 'score_threshold': score_threshold}
         )
         logging.info(f"Successfully created retriever with search_k={search_k}, score_threshold={score_threshold}")
         return retriever
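
For context, the shape of the call being changed. This fragment assumes neo_db is an already-constructed langchain Neo4jVector-style store; whether the store reads 'k' or 'top_k' out of search_kwargs depends on its similarity-search signature, which is exactly what this rename tracks:

    retriever = neo_db.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "top_k": 5,                   # number of chunks to return
            "effective_search_ratio": 2,  # oversample the index before filtering (per the diff)
            "score_threshold": 0.7,       # drop weak matches
        },
    )
    docs = retriever.invoke("What changed in the staging release?")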
2 changes: 1 addition & 1 deletion backend/src/chunkid_entities.py
@@ -190,7 +190,7 @@ def get_entities_from_chunkids(uri, username, password, database ,nodedetails,en
     elif mode == CHAT_ENTITY_VECTOR_MODE:

         if "entitydetails" in nodedetails and nodedetails["entitydetails"]:
-            entity_ids = [item["id"] for item in nodedetails["entitydetails"]]
+            entity_ids = [item for item in nodedetails["entitydetails"]["entityids"]]
             logging.info(f"chunkid_entities module: Starting for entity ids: {entity_ids}")
             result = process_entityids(driver, entity_ids)
             if "chunk_data" in result.keys():
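
This change implies the entitydetails payload moved from a list of node dicts to a dict holding an id list — matching the sets that format_documents now builds in QA_integration.py. With invented values:

    nodedetails_old = {"entitydetails": [{"id": "4:abc:1"}, {"id": "4:abc:2"}]}
    nodedetails_new = {"entitydetails": {"entityids": ["4:abc:1", "4:abc:2"]}}

    old_ids = [item["id"] for item in nodedetails_old["entitydetails"]]
    new_ids = [item for item in nodedetails_new["entitydetails"]["entityids"]]
    assert old_ids == new_ids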
10 changes: 5 additions & 5 deletions backend/src/document_sources/gcs_bucket.py
@@ -1,8 +1,7 @@
 import os
 import logging
 from google.cloud import storage
-from langchain_community.document_loaders import GCSFileLoader, GCSDirectoryLoader
-from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_community.document_loaders import GCSFileLoader
 from langchain_core.documents import Document
 from PyPDF2 import PdfReader
 import io
@@ -42,8 +41,9 @@ def get_gcs_bucket_files_info(gcs_project_id, gcs_bucket_name, gcs_bucket_folder
         logging.exception(f'Exception Stack trace: {error_message}')
         raise LLMGraphBuilderException(error_message)

-def load_pdf(file_path):
-    return PyMuPDFLoader(file_path)
+def gcs_loader_func(file_path):
+    loader, _ = load_document_content(file_path)
+    return loader

 def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token=None):
     nltk.download('punkt')
@@ -64,7 +64,7 @@ def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, g
         blob = bucket.blob(blob_name)

         if blob.exists():
-            loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=load_document_content)
+            loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=gcs_loader_func)
             pages = loader.load()
         else :
             raise LLMGraphBuilderException('File does not exist, Please re-upload the file and try again.')
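
GCSFileLoader expects loader_func to take a local file path and return a document loader. The project's load_document_content evidently returns a tuple, hence the small adapter introduced above. A sketch with a stand-in for that helper:

    from langchain_community.document_loaders import TextLoader

    def load_document_content(file_path):
        # Stand-in: the real helper picks a loader per file type and
        # apparently returns (loader, <something else>).
        return TextLoader(file_path), file_path

    def gcs_loader_func(file_path):
        loader, _ = load_document_content(file_path)  # unwrap before handing to GCSFileLoader
        return loader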