from langchain_community.document_loaders import YoutubeLoader
from langchain_community.document_loaders import WikipediaLoader
import warnings
+ from pytube import YouTube
+ from youtube_transcript_api import YouTubeTranscriptApi
+ import sys
+
warnings.filterwarnings("ignore")

load_dotenv()
@@ -215,9 +219,17 @@ def create_source_node_graph_url(uri, userName, password, source_url ,model, db_
    # match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", source_url)
    match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*', source_url)
    logging.info(f"match value{match}")
-   youtube_id = match.group(1)
-   file_name = youtube_id.strip()
-   file_size = ''
+   file_name = YouTube(source_url).title
+   transcript = get_youtube_transcript(match.group(1))
+   if transcript is None or len(transcript) == 0:
+       file_size = ''
+       job_status = "Failed"
+       message = f"Youtube transcript is not available for : {file_name}"
+       error_message = message
+       logging.error(error_message)
+       return create_api_response(job_status, message=message, error=error_message, file_source=source_type)
+   else:
+       file_size = sys.getsizeof(transcript)
    file_type = 'text'
    aws_access_key_id = ''
    job_status = "Completed"
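
For orientation, here is a self-contained sketch of the new YouTube metadata path introduced above, using a hypothetical URL. It combines pytube's title lookup, the transcript fetch, and sys.getsizeof the same way the branch above does; the URL and video id are placeholders, not taken from this PR.

from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi
import re
import sys

video_url = "https://www.youtube.com/watch?v=abcdefghijk"    # hypothetical 11-character video id
match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*', video_url)

file_name = YouTube(video_url).title                         # video title becomes the source name
segments = YouTubeTranscriptApi.get_transcript(match.group(1))
transcript = ''.join(seg['text'] for seg in segments)        # same concatenation as get_youtube_transcript()
file_size = sys.getsizeof(transcript) if transcript else ''  # empty transcript is treated as a failure above
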
@@ -232,7 +244,15 @@ def create_source_node_graph_url(uri, userName, password, source_url ,model, db_
        error_message = str(e)
        logging.exception(f'Exception Stack trace:')
        return create_api_response(job_status, message=message, error=error_message, file_source=source_type)
-
+
+ def get_youtube_transcript(youtube_id):
+     transcript_dict = YouTubeTranscriptApi.get_transcript(youtube_id)
+     transcript = ''
+     for td in transcript_dict:
+         transcript += td['text']
+     return transcript
+
+
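YouTubeTranscriptApi.get_transcript raises when captions are disabled or missing, while the caller above only checks for an empty or None result. A defensive variant of the helper, shown purely as a sketch and not part of this diff, would swallow that error and return None; it assumes the module-level logging and YouTubeTranscriptApi imports already present in this file.

# Sketch only: hypothetical variant that never raises, matching the None check in the caller.
def get_youtube_transcript_safe(youtube_id):
    try:
        segments = YouTubeTranscriptApi.get_transcript(youtube_id)
    except Exception:
        logging.exception('Transcript not available for video id %s', youtube_id)
        return None
    return ''.join(seg['text'] for seg in segments)
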
def file_into_chunks(pages: List[Document]):
    """
    Split a list of documents (file pages) into chunks of fixed size.
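
The body of file_into_chunks is cut off by this hunk; one common way to implement fixed-size chunking with LangChain's TokenTextSplitter is sketched below. The splitter choice and the chunk_size/chunk_overlap values are assumptions for illustration, not taken from this repository.

from typing import List
from langchain.docstore.document import Document
from langchain.text_splitter import TokenTextSplitter

def file_into_chunks_sketch(pages: List[Document]) -> List[Document]:
    # Illustrative only: split the loaded pages into fixed-size token chunks.
    splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)  # sizes are assumptions
    return splitter.split_documents(pages)
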
@@ -362,18 +382,19 @@ def extract_graph_from_file(uri, userName, password, model, db_name=None, file=N

    logging.info("Get graph document list from models")
    if model == 'Diffbot':
-       graph_documents = extract_graph_from_diffbot(graph, chunks, file_name, uri, userName, password)
+       graph_documents, cypher_list = extract_graph_from_diffbot(graph, chunks, file_name, uri, userName, password)

    elif model == 'OpenAI GPT 3.5':
        model_version = 'gpt-3.5-turbo-16k'
-       graph_documents = extract_graph_from_OpenAI(model_version, graph, chunks, file_name, uri, userName, password)
+       graph_documents, cypher_list = extract_graph_from_OpenAI(model_version, graph, chunks, file_name, uri, userName, password)

    elif model == 'OpenAI GPT 4':
        model_version = 'gpt-4-0125-preview'
-       graph_documents = extract_graph_from_OpenAI(model_version, graph, chunks, file_name, uri, userName, password)
+       graph_documents, cypher_list = extract_graph_from_OpenAI(model_version, graph, chunks, file_name, uri, userName, password)

-   # update_similarity_graph for the KNN Graph
-   update_graph(graph)
+   # create relation between chunks (FIRST_CHUNK and NEXT_CHUNK)
+   for query in cypher_list:
+       graph.query(query)

    distinct_nodes = set()
    relations = []
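
The statements collected in cypher_list are produced inside the extraction helpers and are not visible in this diff; the snippet below is a hypothetical example of the kind of FIRST_CHUNK/NEXT_CHUNK query the loop above might execute, with made-up labels, properties, and ids.

# Hypothetical shape of one cypher_list entry; the real queries are built elsewhere.
example_query = (
    "MATCH (d:Document {fileName: $file_name}), (c:Chunk {id: $chunk_id}) "
    "MERGE (d)-[:FIRST_CHUNK]->(c)"
)
graph.query(example_query, params={"file_name": file_name, "chunk_id": "chunk-0"})
# A NEXT_CHUNK query would similarly MERGE (:Chunk)-[:NEXT_CHUNK]->(:Chunk) between consecutive chunk ids.
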
@@ -455,9 +476,10 @@ def get_documents_from_youtube(url):
            translation="en",
            add_video_info=True)
        pages = youtube_loader.load()
-       match = re.search(r"v=([a-zA-Z0-9_-]+)", url)
-       youtube_id = match.group(1)
-       file_name = youtube_id
+       # match = re.search(r"v=([a-zA-Z0-9_-]+)", url)
+       # youtube_id = match.group(1)
+       # file_name = youtube_id
+       file_name = YouTube(url).title
        file_key = file_name
        return file_name, file_key, pages
    except Exception as e:
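
Only the tail of the loader call is visible above; for reference, here is a self-contained sketch of the presumed YoutubeLoader.from_youtube_url usage together with the new pytube title lookup. The URL and the language argument are assumptions.

from langchain_community.document_loaders import YoutubeLoader
from pytube import YouTube

url = "https://www.youtube.com/watch?v=abcdefghijk"   # hypothetical URL
youtube_loader = YoutubeLoader.from_youtube_url(
    url,
    language=["en"],        # assumed; only translation and add_video_info appear in the hunk
    translation="en",
    add_video_info=True)
pages = youtube_loader.load()          # Documents carrying the transcript text
file_name = YouTube(url).title         # title now used as the file name, per this change
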
@@ -498,15 +520,15 @@ def get_source_list_from_graph(uri,userName,password,db_name=None):
        logging.exception(f'Exception: {error_message}')
        return create_api_response(job_status, message=message, error=error_message)

- def update_graph(graph):
+ def update_graph(uri, userName, password, db_name):
    """
    Update the graph nodes with a SIMILAR relationship where the embedding score matches
504526 """
505527 try :
506528 knn_min_score = os .environ .get ('KNN_MIN_SCORE' )
507529
508530 query = "WHERE node <> c and score >= {} MERGE (c)-[rel:SIMILAR]-(node) SET rel.score = score"
509- # graph = Neo4jGraph()
531+ graph = Neo4jGraph (url = uri , database = db_name , username = userName , password = password )
510532 result = graph .query ("""MATCH (c:Chunk)
511533 WHERE c.embedding IS NOT NULL AND count { (c)-[:SIMILAR]-() } < 5
512534 CALL db.index.vector.queryNodes('vector', 6, c.embedding) yield node, score """ + query .format (knn_min_score ))
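
With the new signature, update_graph opens its own Neo4jGraph connection instead of receiving one; below is a minimal usage sketch with placeholder credentials, assuming KNN_MIN_SCORE is set in the environment.

import os

os.environ.setdefault('KNN_MIN_SCORE', '0.8')        # threshold value is an assumption

uri = "neo4j+s://<host>:7687"                        # placeholder connection details
userName = "neo4j"
password = "<password>"
db_name = "neo4j"

update_graph(uri, userName, password, db_name)       # adds SIMILAR relationships between similar chunks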