
Commit f92361e

prakriti-solankey, praveshkumar1988, kartikpersistent, vasanthasaikalluri, and aashipandya authored
Dev (#322)
* Remove unused library and commented code
* Issue fixed
* 224 color mismatch in graph viz model (#225)
* count changes
* added legend count
* bloom url changes
* lint changes
* removal of console
---------
Co-authored-by: kartikpersistent <[email protected]>
* Modified retrieval query (#226)
* Manage file status (#227)
* manage status of processing file
* Remove progress bar from Generate Graph Document button
* 224 color mismatch in graph viz model (#225)
* count changes
* added legend count
* bloom url changes
* lint changes
* removal of console
---------
Co-authored-by: kartikpersistent <[email protected]>
* Modified retrieval query (#226)
* Convert KNN score value string to Float
---------
Co-authored-by: Prakriti Solankey <[email protected]>
Co-authored-by: kartikpersistent <[email protected]>
Co-authored-by: vasanthasaikalluri <[email protected]>
* Chatbot optimization (#230)
* Optimised and cleaned Chatbot Integration
* modified chat integration functions
* bug changes (#231)
* batch queries and relationship count correction (#232)
* batch queries and relationship count correction
* status should not be processing
* 'url_changes' (#235)
* Color mismatch in graph viz model (#233)
* count changes
* added legend count
* bloom url changes
* lint changes
* removal of console
* 'colour'
* 'color'
---------
Co-authored-by: kartikpersistent <[email protected]>
* lint fixes
* Create schema endpoint to get labels and relationtypes
* source link fixes
* Handle exception when youtube Api unable to fetch transcript youtube_transcript_api._errors.TranscriptsDisabled
* configured backend status based the ENV Variable (#246)
* configured backend status based the ENV Variable
* removed the connection status check in PROD enviournment
* Requirement split gcs and s3 icons on the page (#247)
* separated S3 and GCS
* resolved the conflicts
* Update error message in response
* dev env
* Chatbot optimization (#250)
* Optimised and cleaned Chatbot Integration
* modified chat integration functions
* Modified max_tokens and min_score
* Modified prompt and added error message
* Modified Prompt and error message
* 245 bug chatbot UI (#252)
* fixed chatbot aspect ratio/width issue
* fixed chat bot ui issue
* 'hoverchanges' (#254)
* added settings panel for relationship type and node label selection (#234)
* added settings panel for relationship type and node label selection
* added checkbox for fetching existing scehma
* integrated /schema api
* added dependency in the useCallback
* usercredentials payload fix
* Accept param in Extract API to filter graph to allowedNode and allowedRealationship
* CHange param type in extract
* Issue fixed
* integrated extract api
* updated string as list for allowednodes and allowedrelations
* removed button on settings
* format fixes
* Added baseEntityLabel as True
---------
Co-authored-by: Pravesh Kumar <[email protected]>
Co-authored-by: aashipandya <[email protected]>
* Handle File status for long time (#256)
* format fixes
* fixed failed status bug
* Fixed list.split issue in allowed nodes
* Issue fixed
* Updated check of empty allowed nodes and allowed relations list (#258)
* added settings panel for relationship type and node label selection
* added checkbox for fetching existing scehma
* integrated /schema api
* added dependency in the useCallback
* usercredentials payload fix
* Accept param in Extract API to filter graph to allowedNode and allowedRealationship
* CHange param type in extract
* Issue fixed
* integrated extract api
* updated string as list for allowednodes and allowedrelations
* check for empty list of nodes and relations
---------
Co-authored-by: kartikpersistent <[email protected]>
Co-authored-by: Pravesh Kumar <[email protected]>
* Removed wrong commit
* Updated condition for allowed nodes relations (#265)
* added settings panel for relationship type and node label selection
* added checkbox for fetching existing scehma
* integrated /schema api
* added dependency in the useCallback
* usercredentials payload fix
* Accept param in Extract API to filter graph to allowedNode and allowedRealationship
* CHange param type in extract
* Issue fixed
* integrated extract api
* updated string as list for allowednodes and allowedrelations
* check for empty list of nodes and relations
* condition updated
* removed frontend changes
---------
Co-authored-by: kartikpersistent <[email protected]>
Co-authored-by: Pravesh Kumar <[email protected]>
* changed the checkbox to button (#266)
* Adding link to Aura on the connection modal (#263)
* Remove node id title changes (#264)
* Remove the id and type changes from the nodes as that makes them incompatible with the relationships
* common function for saving nodes and relations to graph
---------
Co-authored-by: aashipandya <[email protected]>
* fixed the legend container height issue (#267)
* added supported files description (#268)
* fixed legends gap issue
* format fixes
* parameter should be none not str (#269)
* Chatbot latency optimization (#270)
* Added graph Object and Modified Retrieval query
* Added Database parameter to API
* Modified Database parameter
* added connect in place of submit ,added connect to neo4j aura in place of connect to neo4j (#271)
* added connect in place of submit added connect to neo4j aura inplace of connect to neo4j
* added open graph with bloom
* removed the Aura as it can connect with any neo4j db
* label colour fix (#273)
* removed default Person and Works AT for allowed nodes and relationship types
* changed the Wikipedia input label
* removed unused constants
* wikipedia whitespaces fix
* wikipedia url and youtube white spaces error (#280)
* urgent fix (#281)
* Info in the chat response (#282)
* Added graph Object and Modified Retrieval query
* Added Database parameter to API
* Modified Database parameter
* Added info parameter to output
* reestablished the sse on page refresh to sync the processing status (#285)
* UI bugs/features (#284)
* disabled the use existing schema on no node labels
* added docs Icon
* decreased the alert window in the success scenario
* added trim for inputs for white space handling in the youtube wikipedia gcs
* Time estimation alert for large files (#287)
* reestablished the sse on page refresh to sync the processing status
* added the time estimation message for large files
* showing alert only once
* delete api for removing documents (#290)
* Show connection uri (#291)
* added Connection URI
* UI updated
* removed duplicate useEffect
* Backend queries (#257)
* created backend queries for graph
* Modified username parameter
* Added GET request
* Modified exceptions
* 'frontendHandling'
* removed session id parameter
* doc_limit
* 'type_changes'
* 'nameChanges'
* graph viz ui
* legend renamed
* renamed
* removed import
* removed duplicate useEffect
---------
Co-authored-by: kartikpersistent <[email protected]>
Co-authored-by: Prakriti Solankey <[email protected]>
Co-authored-by: Prakriti Solankey <[email protected]>
* Delete list of documents from db (#293)
* delete api for removing documents
* Added list of documents for deletion
* Update exception to track Json_payload
* Delete api (#296)
* delete api for removing documents
* Added list of documents for deletion
* added delete functionality
---------
Co-authored-by: aashipandya <[email protected]>
* Delete api (#298)
* delete api for removing documents
* Added list of documents for deletion
* added delete functionality
* changed the message and disabled the delete files if there is no selected files
* format fixes
---------
Co-authored-by: aashipandya <[email protected]>
* removed duplicate variables
* css change
* upgraded the nvl package
* removed duplicate delete button
* closing the event source on failed condition
* nvl issue 261 - private package (#299)
* Fix issue #261 #261
* Fix issue #261 #261
---------
Co-authored-by: kartikpersistent <[email protected]>
* Delete with entities switch (#300)
* added delete entities switch
* added the hover message on checkboxes
* changed query for deletion of files
* changed the font size the confimation message
---------
Co-authored-by: aashipandya <[email protected]>
* docker changes
* disabled the checkbox when File status is uploading or processing
* Added Cloud logging library for strucred logs
* replaced switch with checkbox
* removed unused imports
* spell mistake
* removed the cancel button on delete popup modal
* bug_fix_labels_mismatch_count
* deletion scenarios
* fixed / trailing bug in s3 bucket url
* Switch frontend port in docker-compose to 8080 to match with the frontend Dockerfile (#305)
* Add in Each api google log struct
* Implemented polling for status update (#309)
* Implemented polling for status update
* status updation for large files
* added example env in the frontend
* updated the readme with frontend env info
* readme changes
* readme updates
* setting up failed status
* Chatbot info icon (#297)
* Added Info to the chat response
* UI changes
* Modified chat response
* added entities to response info
* modified entities in response info
* Modified entities response count in info
* clearhistory
* chatbot
* typeCheck
* state management
* chatbot-ui-overflow
* css_changes
---------
Co-authored-by: vasanthasaikalluri <[email protected]>
* ellipsis
* dockerfile
* Failed status update fix (#315)
* removed Failed status update on failure of servers side event
* Update .gitignore
* url spell fix
* Msenechal/issue295 (#314)
* Removed triton package from requirements.txt
* Fixed Google Cloud logging + some docker ENV overwritten
* Removed ENV print logs
* delete local file in case processing failed (#316)
* table-css
---------
Co-authored-by: Pravesh Kumar <[email protected]>
Co-authored-by: kartikpersistent <[email protected]>
Co-authored-by: vasanthasaikalluri <[email protected]>
Co-authored-by: aashipandya <[email protected]>
Co-authored-by: Morgan Senechal <[email protected]>
Co-authored-by: Michael Hunger <[email protected]>
1 parent 6124099 commit f92361e

26 files changed: +432 −128 lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -164,4 +164,4 @@ google-cloud-sdk
 google-cloud-cli-469.0.0-linux-x86_64.tar.gz
 /data/llm-experiments-387609-c73d512ca3b1.json
 /backend/src/merged_files
-/backend/src/chunks
+/backend/src/chunks

README.md

Lines changed: 13 additions & 0 deletions
@@ -5,6 +5,19 @@ Files can be uploaded from local machine or S3 bucket and then LLM model can be
 
 ### Getting started
 
+:warning:
+For the backend, if you want to run the LLM KG Builder locally, and don't need the GCP/VertexAI integration, make sure to have the following set in your ENV file :
+
+```env
+GEMINI_ENABLED = False
+GCP_LOG_METRICS_ENABLED = False
+```
+
+And for the frontend, make sure to export your local backend URL before running docker-compose by having the BACKEND_API_URL set in your ENV file :
+```env
+BACKEND_API_URL="http://localhost:8000"
+```
+
 1. Run Docker Compose to build and start all components:
 ```bash
 docker-compose up --build

backend/example.env

Lines changed: 5 additions & 1 deletion
@@ -14,4 +14,8 @@ LANGCHAIN_PROJECT = ""
 LANGCHAIN_TRACING_V2 = ""
 LANGCHAIN_ENDPOINT = ""
 NUMBER_OF_CHUNKS_TO_COMBINE = ""
-# NUMBER_OF_CHUNKS_ALLOWED = ""
+# NUMBER_OF_CHUNKS_ALLOWED = ""
+# Enable Gemini (default is True)
+GEMINI_ENABLED = True|False
+# Enable Google Cloud logs (default is True)
+GCP_LOG_METRICS_ENABLED = True|False
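
Both new flags are plain strings in the env file; the backend coerces them to booleans at startup. A minimal sketch of that parsing, mirroring the expressions in backend/score.py and backend/src/logger.py below:

```python
import os

# Truthy spellings the backend accepts: "true", "1", "yes" (case-insensitive).
# Anything else, including an unedited "True|False" placeholder, disables the flag.
is_gemini_enabled = os.environ.get("GEMINI_ENABLED", "True").lower() in ("true", "1", "yes")
is_gcp_log_enabled = os.environ.get("GCP_LOG_METRICS_ENABLED", "True").lower() in ("true", "1", "yes")
```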

backend/requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -147,7 +147,6 @@ timm==0.9.12
 tokenizers==0.15.2
 tqdm==4.66.2
 transformers==4.37.1
-triton==2.2.0
 types-protobuf
 types-requests
 typing-inspect==0.9.0

backend/score.py

Lines changed: 26 additions & 7 deletions
@@ -20,10 +20,11 @@
 import json
 from typing import List
 from google.cloud import logging as gclogger
+from src.logger import CustomLogger
 
-logging_client = gclogger.Client()
-logger_name = "llm_experiments_metrics" # Saved in the google cloud logs
-logger = logging_client.logger(logger_name)
+logger = CustomLogger()
+CHUNK_DIR = os.path.join(os.path.dirname(__file__), "chunks")
+MERGED_DIR = os.path.join(os.path.dirname(__file__), "merged_files")
 
 def healthy_condition():
     output = {"healthy": True}
@@ -45,7 +46,9 @@ def sick():
     allow_headers=["*"],
 )
 
-add_routes(app,ChatVertexAI(), path="/vertexai")
+is_gemini_enabled = os.environ.get("GEMINI_ENABLED", "True").lower() in ("true", "1", "yes")
+if is_gemini_enabled:
+    add_routes(app,ChatVertexAI(), path="/vertexai")
 
 app.add_api_route("/health", health([healthy_condition, healthy]))
 
@@ -135,8 +138,10 @@ async def extract_knowledge_graph_from_file(
         graph = create_graph_database_connection(uri, userName, password, database)
         graphDb_data_Access = graphDBdataAccess(graph)
         if source_type == 'local file':
+            merged_file_path = os.path.join(MERGED_DIR,file_name)
+            logging.info(f'File path:{merged_file_path}')
             result = await asyncio.to_thread(
-                extract_graph_from_file_local_file, graph, model, file_name, allowedNodes, allowedRelationship)
+                extract_graph_from_file_local_file, graph, model, file_name, merged_file_path, allowedNodes, allowedRelationship)
 
         elif source_type == 's3 bucket' and source_url:
             result = await asyncio.to_thread(
@@ -160,9 +165,11 @@
         logger.log_struct(result)
         return create_api_response('Success', data=result, file_source= source_type)
     except Exception as e:
-        message=f" Failed To Process File:{file_name} or LLM Unable To Parse Content"
+        message=f"Failed To Process File:{file_name} or LLM Unable To Parse Content "
         error_message = str(e)
         graphDb_data_Access.update_exception_db(file_name,error_message)
+        if source_type == 'local file':
+            delete_uploaded_local_file(merged_file_path, file_name)
         josn_obj = {'message':message,'error_message':error_message, 'file_name': file_name,'status':'Failed','db_url':uri,'failed_count':1, 'source_type': source_type}
         logger.log_struct(josn_obj)
         logging.exception(f'File Failed in extraction: {josn_obj}')
@@ -253,6 +260,18 @@ async def graph_query(
         logging.exception(f'Exception in graph query: {error_message}')
         return create_api_response(job_status, message=message, error=error_message)
 
+@app.post("/clear_chat_bot")
+async def clear_chat_bot(uri=Form(None),userName=Form(None), password=Form(None), database=Form(None), session_id=Form(None)):
+    try:
+        graph = create_graph_database_connection(uri, userName, password, database)
+        result = await asyncio.to_thread(clear_chat_history,graph=graph,session_id=session_id)
+        return create_api_response('Success',data=result)
+    except Exception as e:
+        job_status = "Failed"
+        message="Unable to clear chat History"
+        error_message = str(e)
+        logging.exception(f'Exception in chat bot:{error_message}')
+        return create_api_response(job_status, message=message, error=error_message)
 @app.post("/connect")
 async def connect(uri=Form(None), userName=Form(None), password=Form(None), database=Form(None)):
     try:
@@ -274,7 +293,7 @@ async def upload_large_file_into_chunks(file:UploadFile = File(...), chunkNumber
                                         password=Form(None), database=Form(None)):
     try:
         graph = create_graph_database_connection(uri, userName, password, database)
-        result = await asyncio.to_thread(upload_file, graph, model, file, chunkNumber, totalChunks, originalname)
+        result = await asyncio.to_thread(upload_file, graph, model, file, chunkNumber, totalChunks, originalname, CHUNK_DIR, MERGED_DIR)
         josn_obj = {'api_name':'upload','db_url':uri}
         logger.log_struct(josn_obj)
         return create_api_response('Success', message=result)
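
The new /clear_chat_bot route accepts the same form-encoded connection fields as the other endpoints, plus a session_id. A minimal client sketch; the localhost URL and connection values are placeholders, not part of this commit:

```python
import requests

# Placeholder connection details; substitute your own Neo4j credentials.
form = {
    "uri": "neo4j+s://<instance>.databases.neo4j.io",
    "userName": "neo4j",
    "password": "<password>",
    "database": "neo4j",
    "session_id": "<chat-session-id>",
}

# The endpoint declares Form(...) parameters, so send form data, not JSON.
resp = requests.post("http://localhost:8000/clear_chat_bot", data=form)
print(resp.json())  # expected to echo the cleared-session payload on success
```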

backend/src/QA_integration.py

Lines changed: 57 additions & 15 deletions
@@ -105,21 +105,35 @@ def get_llm(model: str,max_tokens=1000) -> Any:
             )
         else:
             llm = ChatOpenAI(model=model_version, temperature=0,max_tokens=max_tokens)
-        return llm
+
+        return llm,model_version
 
     else:
         logging.error(f"Unsupported model: {model}")
-        return None
+        return None,None
 
 def vector_embed_results(qa,question):
     vector_res={}
     try:
         result = qa({"query": question})
         vector_res['result']=result.get("result")
-        list_source_docs=[]
-        for i in result["source_documents"]:
-            list_source_docs.append(i.metadata['source'])
-        vector_res['source']=list_source_docs
+
+        sources = set()
+        entities = set()
+        for document in result["source_documents"]:
+            sources.add(document.metadata["source"])
+            for entiti in document.metadata["entities"]:
+                entities.add(entiti)
+        vector_res['source']=list(sources)
+        vector_res['entities'] = list(entities)
+        if len( vector_res['entities']) > 5:
+            vector_res['entities'] = vector_res['entities'][:5]
+
+        # list_source_docs=[]
+        # for i in result["source_documents"]:
+        #     list_source_docs.append(i.metadata['source'])
+        # vector_res['source']=list_source_docs
+
         # result = qa({"question":question},return_only_outputs=True)
         # vector_res['result'] = result.get("answer")
         # vector_res["source"] = result.get("sources")
@@ -145,6 +159,7 @@ def save_chat_history(history,user_message,ai_message):
 
 def get_chat_history(llm, history):
     """Retrieves and summarizes the chat history for a given session."""
+
     try:
         # history = Neo4jChatMessageHistory(
         #     graph=graph,
@@ -170,6 +185,26 @@
         logging.exception(f"Exception in retrieving chat history: {e}")
         return ""
 
+def clear_chat_history(graph, session_id):
+
+    try:
+        logging.info(f"Clearing chat history for session ID: {session_id}")
+        history = Neo4jChatMessageHistory(
+            graph=graph,
+            session_id=session_id
+        )
+        history.clear()
+        logging.info("Chat history cleared successfully")
+
+        return {
+            "session_id": session_id,
+            "message": "The chat history is cleared",
+            "user": "chatbot"
+        }
+    except Exception as e:
+        logging.exception(f"Error occurred while clearing chat history for session ID {session_id}: {e}")
+
+
 def extract_and_remove_source(message):
     pattern = r'\[Source: ([^\]]+)\]'
     match = re.search(pattern, message)
@@ -206,6 +241,7 @@ def QA_RAG(graph,model,question,session_id):
     try:
         qa_rag_start_time = time.time()
 
+
         start_time = time.time()
         neo_db = Neo4jVector.from_existing_index(
             embedding=EMBEDDING_FUNCTION,
@@ -219,7 +255,8 @@
             session_id=session_id
         )
 
-        llm = get_llm(model=model,max_tokens=CHAT_MAX_TOKENS)
+        llm,model_version = get_llm(model=model,max_tokens=CHAT_MAX_TOKENS)
+
         qa = RetrievalQA.from_chain_type(
             llm=llm,
             chain_type="stuff",
@@ -278,20 +315,25 @@
         return {
             "session_id": session_id,
             "message": message,
-            "sources": sources,
-            "info": f"""Metadata :
-            RETRIEVAL_QUERY : {RETRIEVAL_QUERY}""",
+            "info": {
+                "sources": sources,
+                "model":model_version,
+                "entities":vector_res["entities"]
+            },
             "user": "chatbot"
         }
 
     except Exception as e:
         logging.exception(f"Exception in QA component at {datetime.now()}: {str(e)}")
         error_name = type(e).__name__
-        return {"session_id": session_id,
-                "message": "Something went wrong",
-                "sources": [],
-                "info": f"Caught an exception {error_name} :- {str(e)}",
-                "user": "chatbot"}
+        return {
+            "session_id": session_id,
+            "message": "Something went wrong",
+            "info": {
+                "sources": [],
+                "error": f"{error_name} :- {str(e)}"
+            },
+            "user": "chatbot"}
 
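With this change the success and failure payloads share one shape: metadata moves from the top-level sources/info fields into a nested info object. A sketch of what a client now sees, with illustrative field values:

```python
# Illustrative success payload from QA_RAG after this change.
response = {
    "session_id": "abc-123",
    "message": "<answer text>",
    "info": {
        "sources": ["<document source>"],       # de-duplicated via the sources set
        "model": "<model_version from get_llm>",
        "entities": ["<up to five entities>"],  # capped at 5 in vector_embed_results
    },
    "user": "chatbot",
}

# Clients that previously read response["sources"] must now read:
sources = response["info"]["sources"]
```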
backend/src/logger.py

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+import os
+from google.cloud import logging as gclogger
+
+class CustomLogger:
+    def __init__(self):
+        self.is_gcp_log_enabled = os.environ.get("GCP_LOG_METRICS_ENABLED", "True").lower() in ("true", "1", "yes")
+        if self.is_gcp_log_enabled:
+            self.logging_client = gclogger.Client()
+            self.logger_name = "llm_experiments_metrics"
+            self.logger = self.logging_client.logger(self.logger_name)
+        else:
+            self.logger = None
+
+    def log_struct(self, message):
+        if self.is_gcp_log_enabled:
+            self.logger.log_struct(message)
+        else:
+            print(message)
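
CustomLogger keeps the call sites in score.py unchanged: log_struct() forwards to Cloud Logging when GCP_LOG_METRICS_ENABLED is truthy and falls back to print() otherwise. A usage sketch; the payload mirrors the josn_obj dicts in score.py, and the db_url value is a placeholder:

```python
from src.logger import CustomLogger

logger = CustomLogger()

# Written to the "llm_experiments_metrics" log in Google Cloud when enabled,
# printed to stdout when GCP_LOG_METRICS_ENABLED is false.
logger.log_struct({"api_name": "upload", "db_url": "bolt://localhost:7687"})
```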

backend/src/main.py

Lines changed: 14 additions & 24 deletions
@@ -20,7 +20,6 @@
 import sys
 import shutil
 warnings.filterwarnings("ignore")
-from pathlib import Path
 load_dotenv()
 logging.basicConfig(format='%(asctime)s - %(message)s',level='INFO')
 
@@ -146,13 +145,11 @@ def create_source_node_graph_url_wikipedia(graph, model, wiki_query, source_type
         lst_file_name.append({'fileName':obj_source_node.file_name,'fileSize':obj_source_node.file_size,'url':obj_source_node.url, 'status':'Failed'})
     return lst_file_name,success_count,failed_count
 
-def extract_graph_from_file_local_file(graph, model, fileName, allowedNodes, allowedRelationship):
+def extract_graph_from_file_local_file(graph, model, fileName, merged_file_path, allowedNodes, allowedRelationship):
 
     logging.info(f'Process file name :{fileName}')
-    merged_file_path = os.path.join(os.path.join(os.path.dirname(__file__), "merged_files"),fileName)
-    logging.info(f'File path:{merged_file_path}')
     file_name, pages = get_documents_from_file_by_path(merged_file_path,fileName)
-
+
     if pages==None or len(pages)==0:
         raise Exception(f'Pdf content is not available for file : {file_name}')
 
@@ -294,15 +291,10 @@ def processing_source(graph, model, file_name, pages, allowedNodes, allowedRelat
     logging.info('Updated the nodeCount and relCount properties in Docuemnt node')
     logging.info(f'file:{file_name} extraction has been completed')
 
+
+    # merged_file_path have value only when file uploaded from local
     if merged_file_path is not None:
-        file_path = Path(merged_file_path)
-        if file_path.exists():
-            file_path.unlink()
-            logging.info(f'file {file_name} delete successfully')
-        else:
-            logging.info(f'file {file_name} does not exist')
-    else:
-        logging.info(f'File Path is None i.e. source type other than local file')
+        delete_uploaded_local_file(merged_file_path, file_name)
 
     return {
         "fileName": file_name,
@@ -355,27 +347,25 @@ def connection_check(graph):
     graph_DB_dataAccess = graphDBdataAccess(graph)
     return graph_DB_dataAccess.connection_check()
 
-def merge_chunks(file_name, total_chunks):
-
-    chunk_dir = os.path.join(os.path.dirname(__file__), "chunks")
-    merged_file_path = os.path.join(os.path.dirname(__file__), "merged_files")
+def merge_chunks(file_name, total_chunks, chunk_dir, merged_dir):
 
-    if not os.path.exists(merged_file_path):
-        os.mkdir(merged_file_path)
+    if not os.path.exists(merged_dir):
+        os.mkdir(merged_dir)
 
-    with open(os.path.join(merged_file_path, file_name), "wb") as write_stream:
+    with open(os.path.join(merged_dir, file_name), "wb") as write_stream:
         for i in range(1,total_chunks+1):
             chunk_file_path = os.path.join(chunk_dir, f"{file_name}_part_{i}")
             with open(chunk_file_path, "rb") as chunk_file:
                 shutil.copyfileobj(chunk_file, write_stream)
             os.unlink(chunk_file_path)  # Delete the individual chunk file after merging
     logging.info("Chunks merged successfully and return file size")
-    file_size = os.path.getsize(os.path.join(merged_file_path, file_name))
+    file_size = os.path.getsize(os.path.join(merged_dir, file_name))
     return file_size
 
 
-def upload_file(graph, model, chunk, chunk_number:int, total_chunks:int, originalname):
-    chunk_dir = os.path.join(os.path.dirname(__file__), "chunks") # Directory to save chunks
+
+def upload_file(graph, model, chunk, chunk_number:int, total_chunks:int, originalname, chunk_dir, merged_dir):
+    # chunk_dir = os.path.join(os.path.dirname(__file__), "chunks") # Directory to save chunks
     if not os.path.exists(chunk_dir):
         os.mkdir(chunk_dir)
 
@@ -387,7 +377,7 @@ def upload_file(graph, model, chunk, chunk_number:int, total_chunks:int, origina
 
     if int(chunk_number) == int(total_chunks):
         # If this is the last chunk, merge all chunks into a single file
-        file_size = merge_chunks(originalname, int(total_chunks))
+        file_size = merge_chunks(originalname, int(total_chunks), chunk_dir, merged_dir)
         logging.info("File merged successfully")
 
         obj_source_node = sourceNode()
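
merge_chunks() and upload_file() no longer compute their own directories; score.py now passes CHUNK_DIR and MERGED_DIR in. The wire protocol for chunked upload is unchanged: the client posts each part with chunkNumber, totalChunks, and originalname, and the final part triggers the merge. A client-side sketch; the /upload route path and the 1 MiB part size are assumptions for illustration only:

```python
import os
import requests

CHUNK_SIZE = 1024 * 1024  # assumed part size; the real frontend may use a different one

def upload_in_chunks(path, url="http://localhost:8000/upload", **conn_fields):
    """Post a file part by part; the last part makes the backend merge them."""
    name = os.path.basename(path)
    total = -(-os.path.getsize(path) // CHUNK_SIZE)  # ceiling division
    with open(path, "rb") as f:
        for number in range(1, total + 1):
            part = f.read(CHUNK_SIZE)
            requests.post(
                url,
                files={"file": (name, part)},
                # conn_fields carries the usual form fields: uri, userName,
                # password, database, model.
                data={"chunkNumber": number, "totalChunks": total,
                      "originalname": name, **conn_fields},
            )
```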

backend/src/shared/common_fn.py

Lines changed: 7 additions & 1 deletion
@@ -9,6 +9,7 @@
 from typing import List
 import re
 import os
+from pathlib import Path
 
 def check_url_source(source_type, yt_url:str=None, queries_list:List[str]=None):
     try:
@@ -84,4 +85,9 @@ def load_embedding_model(embedding_model_name: str):
 def save_graphDocuments_in_neo4j(graph:Neo4jGraph, graph_document_list:List[GraphDocument]):
     # graph.add_graph_documents(graph_document_list, baseEntityLabel=True)
     graph.add_graph_documents(graph_document_list)
-
+
+def delete_uploaded_local_file(merged_file_path, file_name):
+    file_path = Path(merged_file_path)
+    if file_path.exists():
+        file_path.unlink()
+        logging.info(f'file {file_name} deleted successfully')
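
delete_uploaded_local_file() is the shared cleanup used by both the success path (processing_source in main.py) and the new failure path in score.py; because it checks Path.exists() first, calling it for an already-removed file is a no-op. For illustration, with a hypothetical path:

```python
from src.shared.common_fn import delete_uploaded_local_file

# Hypothetical merged-file location; real callers pass os.path.join(MERGED_DIR, file_name).
delete_uploaded_local_file("/app/src/merged_files/report.pdf", "report.pdf")
# A second call is harmless: the path no longer exists, so nothing is unlinked.
```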

docker-compose.yml

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ services:
       - ./frontend:/app
       - /app/node_modules
     environment:
-      - BACKEND_API_URL=${BACKEND_API_URL-}
+      - BACKEND_API_URL=${BACKEND_API_URL}
      - BLOOM_URL=${BLOOM_URL}
      - REACT_APP_SOURCES=${REACT_APP_SOURCES}
    container_name: frontend
