neo4j-labs
diff --git a/‎README.md‎
Lines changed: 3 additions & 0 deletions b/‎README.md‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎backend/example.env‎
Lines changed: 5 additions & 1 deletion b/‎backend/example.env‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎backend/score.py‎
Lines changed: 20 additions & 12 deletions b/‎backend/score.py‎
Lines changed: 20 additions & 12 deletions
diff --git a/‎backend/src/create_chunks.py‎
Lines changed: 12 additions & 6 deletions b/‎backend/src/create_chunks.py‎
Lines changed: 12 additions & 6 deletions
diff --git a/‎backend/src/graphDB_dataAccess.py‎
Lines changed: 27 additions & 1 deletion b/‎backend/src/graphDB_dataAccess.py‎
Lines changed: 27 additions & 1 deletion
diff --git a/‎backend/src/llm.py‎
Lines changed: 1 addition & 1 deletion b/‎backend/src/llm.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backend/src/main.py‎
Lines changed: 2 additions & 4 deletions b/‎backend/src/main.py‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎backend/src/post_processing.py‎
Lines changed: 27 additions & 40 deletions b/‎backend/src/post_processing.py‎
Lines changed: 27 additions & 40 deletions
diff --git a/‎backend/src/shared/common_fn.py‎
Lines changed: 41 additions & 2 deletions b/‎backend/src/shared/common_fn.py‎
Lines changed: 41 additions & 2 deletions
@@ -149,6 +149,9 @@ Allow unauthenticated request : Yes
 | VITE_GOOGLE_CLIENT_ID        | Optional           |               | Client ID for Google authentication                                                              |
 | VITE_LLM_MODELS_PROD         | Optional      | openai_gpt_4o,openai_gpt_4o_mini,diffbot,gemini_1.5_flash | To distinguish models based on the environment (PROD or DEV) 
 | VITE_LLM_MODELS              | Optional | 'diffbot,openai_gpt_3.5,openai_gpt_4o,openai_gpt_4o_mini,gemini_1.5_pro,gemini_1.5_flash,azure_ai_gpt_35,azure_ai_gpt_4o,ollama_llama3,groq_llama3_70b,anthropic_claude_3_5_sonnet' | Supported Models For the application
+| VITE_AUTH0_CLIENT_ID | Mandatory if you are enabling Authentication otherwise it is optional |       |Okta OAuth Client ID for authentication
+| VITE_AUTH0_DOMAIN | Mandatory if you are enabling Authentication otherwise it is optional |           | Okta OAuth Client Domain
+| VITE_SKIP_AUTH | Optional | true | Flag to skip the authentication 
 
 ## LLMs Supported 
 1. OpenAI
 
@@ -44,4 +44,8 @@ LLM_MODEL_CONFIG_ollama_llama3="model_name,model_local_url"
 YOUTUBE_TRANSCRIPT_PROXY="https://user:pass@domain:port"
 EFFECTIVE_SEARCH_RATIO=5
 GRAPH_CLEANUP_MODEL="openai_gpt_4o"
-CHUNKS_TO_BE_PROCESSED="50"
+CHUNKS_TO_BE_CREATED="50"
+BEDROCK_EMBEDDING_MODEL="model_name,aws_access_key,aws_secret_key,region_name"                       #model_name="amazon.titan-embed-text-v1"
+LLM_MODEL_CONFIG_bedrock_nova_micro_v1="model_name,aws_access_key,aws_secret_key,region_name"        #model_name="amazon.nova-micro-v1:0"
+LLM_MODEL_CONFIG_bedrock_nova_lite_v1="model_name,aws_access_key,aws_secret_key,region_name"         #model_name="amazon.nova-lite-v1:0"
+LLM_MODEL_CONFIG_bedrock_nova_pro_v1="model_name,aws_access_key,aws_secret_key,region_name"          #model_name="amazon.nova-pro-v1:0"
@@ -1,4 +1,4 @@
-from fastapi import FastAPI, File, UploadFile, Form, Request
+from fastapi import FastAPI, File, UploadFile, Form, Request, HTTPException
 from fastapi_health import health
 from fastapi.middleware.cors import CORSMiddleware
 from src.main import *
@@ -19,7 +19,6 @@
 from src.neighbours import get_neighbour_nodes
 import json
 from typing import List
-from starlette.middleware.sessions import SessionMiddleware
 from google.oauth2.credentials import Credentials
 import os
 from src.logger import CustomLogger
@@ -33,6 +32,10 @@
 from starlette.types import ASGIApp, Receive, Scope, Send
 from langchain_neo4j import Neo4jGraph
 from src.entities.source_node import sourceNode
+from starlette.middleware.sessions import SessionMiddleware
+from starlette.responses import HTMLResponse, RedirectResponse,JSONResponse
+from starlette.requests import Request
+import secrets
 
 logger = CustomLogger()
 CHUNK_DIR = os.path.join(os.path.dirname(__file__), "chunks")
@@ -77,6 +80,7 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send):
         )
         await gzip_middleware(scope, receive, send)
 app = FastAPI()
+
 app.add_middleware(XContentTypeOptions)
 app.add_middleware(XFrame, Option={'X-Frame-Options': 'DENY'})
 app.add_middleware(CustomGZipMiddleware, minimum_size=1000, compresslevel=5,paths=["/sources_list","/url/scan","/extract","/chat_bot","/chunk_entities","/get_neighbours","/graph_query","/schema","/populate_graph_schema","/get_unconnected_nodes_list","/get_duplicate_nodes","/fetch_chunktext"])
@@ -86,14 +90,14 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send):
     allow_methods=["*"],
     allow_headers=["*"],
 )
+app.add_middleware(SessionMiddleware, secret_key=os.urandom(24))
 
 is_gemini_enabled = os.environ.get("GEMINI_ENABLED", "False").lower() in ("true", "1", "yes")
 if is_gemini_enabled:
     add_routes(app,ChatVertexAI(), path="/vertexai")
 
 app.add_api_route("/health", health([healthy_condition, healthy]))
 
-app.add_middleware(SessionMiddleware, secret_key=os.urandom(24))
 
 
 @app.post("/url/scan")
@@ -346,14 +350,15 @@ async def post_processing(uri=Form(), userName=Form(), password=Form(), database
             await asyncio.to_thread(create_communities, uri, userName, password, database)  
 
             logging.info(f'created communities')
-            graph = create_graph_database_connection(uri, userName, password, database)   
-            graphDb_data_Access = graphDBdataAccess(graph)
-            document_name = ""
-            count_response = graphDb_data_Access.update_node_relationship_count(document_name)
-            if count_response:
-                count_response = [{"filename": filename, **counts} for filename, counts in count_response.items()]
-                logging.info(f'Updated source node with community related counts')
-        
+
+
+        graph = create_graph_database_connection(uri, userName, password, database)   
+        graphDb_data_Access = graphDBdataAccess(graph)
+        document_name = ""
+        count_response = graphDb_data_Access.update_node_relationship_count(document_name)
+        if count_response:
+            count_response = [{"filename": filename, **counts} for filename, counts in count_response.items()]
+            logging.info(f'Updated source node with community related counts')
 
         end = time.time()
         elapsed_time = end - start
@@ -502,12 +507,14 @@ async def connect(uri=Form(), userName=Form(), password=Form(), database=Form())
         graph = create_graph_database_connection(uri, userName, password, database)
         result = await asyncio.to_thread(connection_check_and_get_vector_dimensions, graph, database)
         gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
+        chunk_to_be_created = int(os.environ.get('CHUNKS_TO_BE_CREATED', '50'))
         end = time.time()
         elapsed_time = end - start
         json_obj = {'api_name':'connect','db_url':uri, 'userName':userName, 'database':database, 'count':1, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}'}
         logger.log_struct(json_obj, "INFO")
         result['elapsed_api_time'] = f'{elapsed_time:.2f}'
         result['gcs_file_cache'] = gcs_file_cache
+        result['chunk_to_be_created']= chunk_to_be_created
         return create_api_response('Success',data=result)
     except Exception as e:
         job_status = "Failed"
@@ -980,8 +987,8 @@ async def backend_connection_configuration():
         database= os.getenv('NEO4J_DATABASE')
         password= os.getenv('NEO4J_PASSWORD')
         gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
+        chunk_to_be_created = int(os.environ.get('CHUNKS_TO_BE_CREATED', '50'))
         if all([uri, username, database, password]):
-            print(f'uri:{uri}, usrName:{username}, database :{database}, password: {password}')
             graph = Neo4jGraph()
             logging.info(f'login connection status of object: {graph}')
             if graph is not None:
@@ -995,6 +1002,7 @@ async def backend_connection_configuration():
                 result["database"] = database
                 result["password"] = encoded_password
                 result['gcs_file_cache'] = gcs_file_cache
+                result['chunk_to_be_created']= chunk_to_be_created
                 end = time.time()
                 elapsed_time = end - start
                 result['api_name'] = 'backend_connection_configuration'
 
@@ -4,6 +4,7 @@
 import logging
 from src.document_sources.youtube import get_chunks_with_timestamps, get_calculated_timestamps
 import re
+import os
 
 logging.basicConfig(format="%(asctime)s - %(message)s", level="INFO")
 
@@ -25,23 +26,28 @@ def split_file_into_chunks(self):
         """
         logging.info("Split file into smaller chunks")
         text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)
+        chunk_to_be_created = int(os.environ.get('CHUNKS_TO_BE_CREATED', '50'))
         if 'page' in self.pages[0].metadata:
             chunks = []
             for i, document in enumerate(self.pages):
                 page_number = i + 1
-                for chunk in text_splitter.split_documents([document]):
-                    chunks.append(Document(page_content=chunk.page_content, metadata={'page_number':page_number}))    
+                if len(chunks) >= chunk_to_be_created:
+                    break
+                else:
+                    for chunk in text_splitter.split_documents([document]):
+                        chunks.append(Document(page_content=chunk.page_content, metadata={'page_number':page_number}))    
 
         elif 'length' in self.pages[0].metadata:
             if len(self.pages) == 1  or (len(self.pages) > 1 and self.pages[1].page_content.strip() == ''): 
                 match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*',self.pages[0].metadata['source'])
                 youtube_id=match.group(1)   
                 chunks_without_time_range = text_splitter.split_documents([self.pages[0]])
-                chunks = get_calculated_timestamps(chunks_without_time_range, youtube_id)
-
+                chunks = get_calculated_timestamps(chunks_without_time_range[:chunk_to_be_created], youtube_id)
             else: 
-                chunks_without_time_range = text_splitter.split_documents(self.pages)   
-                chunks = get_chunks_with_timestamps(chunks_without_time_range)
+                chunks_without_time_range = text_splitter.split_documents(self.pages)
+                chunks = get_chunks_with_timestamps(chunks_without_time_range[:chunk_to_be_created])
         else:
             chunks = text_splitter.split_documents(self.pages)
+            
+        chunks = chunks[:chunk_to_be_created]
         return chunks
@@ -535,4 +535,30 @@ def update_node_relationship_count(self,document_name):
                     "nodeCount" : nodeCount,
                     "relationshipCount" : relationshipCount
                     }
-        return response
+        return response
+    
+    def get_nodelabels_relationships(self):
+        """
+        Fetch the node labels and relationship types currently present in the graph.
+
+        The label query skips the labels listed in its exclusion list
+        (Document, Chunk, ...) and keeps only labels that have at least one node.
+        The relationship query skips the types listed in its exclusion list
+        (PART_OF, NEXT_CHUNK, ...). Both results are ordered by name.
+
+        Returns:
+            tuple: (node_labels, relationship_types), each a list of strings.
+            If either query fails, the error is printed and ([], []) is returned
+            so that callers can always unpack two values.
+        """
+        node_query = """
+                    CALL db.labels() YIELD label
+                    WITH label
+                    WHERE NOT label IN ['Document', 'Chunk', '_Bloom_Perspective_', '__Community__', '__Entity__']
+                    CALL apoc.cypher.run("MATCH (n:`" + label + "`) RETURN count(n) AS count",{}) YIELD value
+                    WHERE value.count > 0
+                    RETURN label order by label
+                    """
+
+        relation_query = """
+                CALL db.relationshipTypes() yield relationshipType
+                WHERE NOT relationshipType  IN ['PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_','FIRST_CHUNK','SIMILAR','IN_COMMUNITY','PARENT_COMMUNITY'] 
+                return relationshipType order by relationshipType
+                """
+            
+        try:
+            node_result = self.execute_query(node_query)
+            node_labels = [record["label"] for record in node_result]
+            relationship_result = self.execute_query(relation_query)
+            relationship_types = [record["relationshipType"] for record in relationship_result]
+            return node_labels,relationship_types
+        except Exception as e:
+            print(f"Error in getting node labels/relationship types from db: {e}")
+            # Return a pair, not a bare list: callers unpack the result into
+            # (node_labels, relationship_types) and a single [] would raise ValueError.
+            return [], []
@@ -89,7 +89,7 @@ def get_llm(model: str):
             )
 
             llm = ChatBedrock(
-                client=bedrock_client, model_id=model_name, model_kwargs=dict(temperature=0)
+                client=bedrock_client,region_name=region_name, model_id=model_name, model_kwargs=dict(temperature=0)
             )
 
         elif "ollama" in model:
 
@@ -361,14 +361,12 @@ async def processing_source(uri, userName, password, database, model, file_name,
 
       logging.info('Update the status as Processing')
       update_graph_chunk_processed = int(os.environ.get('UPDATE_GRAPH_CHUNKS_PROCESSED'))
-      chunk_to_be_processed = int(os.environ.get('CHUNKS_TO_BE_PROCESSED', '50'))
+      
       # selected_chunks = []
       is_cancelled_status = False
       job_status = "Completed"
       for i in range(0, len(chunkId_chunkDoc_list), update_graph_chunk_processed):
         select_chunks_upto = i+update_graph_chunk_processed
-        if select_chunks_upto > chunk_to_be_processed:
-          break
         logging.info(f'Selected Chunks upto: {select_chunks_upto}')
         if len(chunkId_chunkDoc_list) <= select_chunks_upto:
           select_chunks_upto = len(chunkId_chunkDoc_list)
@@ -676,7 +674,7 @@ def get_labels_and_relationtypes(graph):
   query = """
           RETURN collect { 
           CALL db.labels() yield label 
-          WHERE NOT label  IN ['Chunk','_Bloom_Perspective_', '__Community__', '__Entity__'] 
+          WHERE NOT label  IN ['Document','Chunk','_Bloom_Perspective_', '__Community__', '__Entity__'] 
           return label order by label limit 100 } as labels, 
           collect { 
           CALL db.relationshipTypes() yield relationshipType  as type 
 
@@ -8,7 +8,9 @@
 from langchain_core.prompts import ChatPromptTemplate
 from src.shared.constants import GRAPH_CLEANUP_PROMPT
 from src.llm import get_llm
-from src.main import get_labels_and_relationtypes
+from src.graphDB_dataAccess import graphDBdataAccess
+import time 
+
 
 DROP_INDEX_QUERY = "DROP INDEX entities IF EXISTS;"
 LABELS_QUERY = "CALL db.labels()"
@@ -195,50 +197,35 @@ def update_embeddings(rows, graph):
     return graph.query(query,params={'rows':rows})          
 
 def graph_schema_consolidation(graph):
-    nodes_and_relations = get_labels_and_relationtypes(graph)
-    logging.info(f"nodes_and_relations in existing graph : {nodes_and_relations}")
-    node_labels = []
-    relation_labels = []
-    
-    node_labels.extend(nodes_and_relations[0]['labels'])
-    relation_labels.extend(nodes_and_relations[0]['relationshipTypes'])
-    
+    graphDb_data_Access = graphDBdataAccess(graph)
+    node_labels,relation_labels = graphDb_data_Access.get_nodelabels_relationships()
     parser = JsonOutputParser()
-    prompt = ChatPromptTemplate(messages=[("system",GRAPH_CLEANUP_PROMPT),("human", "{input}")],
-                                            partial_variables={"format_instructions": parser.get_format_instructions()})
-    
-    graph_cleanup_model = os.getenv("GRAPH_CLEANUP_MODEL",'openai_gpt_4o')
+    prompt = ChatPromptTemplate(
+        messages=[("system", GRAPH_CLEANUP_PROMPT), ("human", "{input}")],
+        partial_variables={"format_instructions": parser.get_format_instructions()}
+    )
+    graph_cleanup_model = os.getenv("GRAPH_CLEANUP_MODEL", 'openai_gpt_4o')
     llm, _ = get_llm(graph_cleanup_model)
     chain = prompt | llm | parser
-    nodes_dict = chain.invoke({'input':node_labels})
-    relation_dict = chain.invoke({'input':relation_labels})  
-    
-    node_match = {}
-    relation_match = {}
-    for new_label , values in nodes_dict.items() :
-        for old_label in values:
-            if new_label != old_label:
-                node_match[old_label]=new_label
-            
-    for new_label , values in relation_dict.items() :
-        for old_label in values:
-            if new_label != old_label:
-                relation_match[old_label]=new_label 
-
-    logging.info(f"updated node labels : {node_match}")   
-    logging.info(f"updated relationship labels : {relation_match}") 
 
-    # Update node labels in graph
-    for old_label, new_label in node_match.items():
-        query = f"""
-                MATCH (n:`{old_label}`)
-                SET n:`{new_label}`
-                REMOVE n:`{old_label}`
-                """
-        graph.query(query)
+    nodes_relations_input = {'nodes': node_labels, 'relationships': relation_labels}
+    mappings = chain.invoke({'input': nodes_relations_input})
+    node_mapping = {old: new for new, old_list in mappings['nodes'].items() for old in old_list if new != old}
+    relation_mapping = {old: new for new, old_list in mappings['relationships'].items() for old in old_list if new != old}
+
+    logging.info(f"Node Labels: Total = {len(node_labels)}, Reduced to = {len(set(node_mapping.values()))} (from {len(node_mapping)})")
+    logging.info(f"Relationship Types: Total = {len(relation_labels)}, Reduced to = {len(set(relation_mapping.values()))} (from {len(relation_mapping)})")
+
+    if node_mapping:
+        for old_label, new_label in node_mapping.items():
+            query = f"""
+                    MATCH (n:`{old_label}`)
+                    SET n:`{new_label}`
+                    REMOVE n:`{old_label}`
+                    """
+            graph.query(query)
 
-    # Update relation types in graph
-    for old_label, new_label in relation_match.items():
+    for old_label, new_label in relation_mapping.items():
         query = f"""
                 MATCH (n)-[r:`{old_label}`]->(m)
                 CREATE (n)-[r2:`{new_label}`]->(m)
 
@@ -11,7 +11,8 @@
 import os
 from pathlib import Path
 from urllib.parse import urlparse
-
+import boto3
+from langchain_community.embeddings import BedrockEmbeddings
 
 def check_url_source(source_type, yt_url:str=None, wiki_query:str=None):
     language=''
@@ -77,6 +78,10 @@ def load_embedding_model(embedding_model_name: str):
         )
         dimension = 768
         logging.info(f"Embedding: Using Vertex AI Embeddings , Dimension:{dimension}")
+    elif embedding_model_name == "titan":
+        embeddings = get_bedrock_embeddings()
+        dimension = 1536
+        logging.info(f"Embedding: Using bedrock titan Embeddings , Dimension:{dimension}")
     else:
         embeddings = HuggingFaceEmbeddings(
             model_name="all-MiniLM-L6-v2"#, cache_folder="/embedding_model"
@@ -134,4 +139,38 @@ def last_url_segment(url):
   parsed_url = urlparse(url)
   path = parsed_url.path.strip("/")  # Remove leading and trailing slashes
   last_url_segment = path.split("/")[-1] if path else parsed_url.netloc.split(".")[0]
-  return last_url_segment
+  return last_url_segment
+
+def get_bedrock_embeddings():
+    """
+    Create a BedrockEmbeddings object configured from the environment.
+
+    Reads the 'BEDROCK_EMBEDDING_MODEL' environment variable, which must hold
+    four comma-separated fields:
+    'model_name,aws_access_key,aws_secret_key,region_name'.
+    Surrounding whitespace in each field is stripped before use.
+
+    Returns:
+        BedrockEmbeddings: An instance using a boto3 'bedrock-runtime' client
+        built from the configured credentials and region.
+
+    Raises:
+        ValueError: If the environment variable is unset/empty or does not
+            contain exactly four comma-separated fields.
+        Exception: Any other error raised while building the client or the
+            embeddings object is logged and re-raised unchanged.
+    """
+    try:
+        env_value = os.getenv("BEDROCK_EMBEDDING_MODEL")
+        if not env_value:
+            raise ValueError("Environment variable 'BEDROCK_EMBEDDING_MODEL' is not set.")
+        try:
+            model_name, aws_access_key, aws_secret_key, region_name = env_value.split(",")
+        except ValueError:
+            # 'from None' hides the internal tuple-unpacking error; the message
+            # deliberately never echoes env_value because it holds AWS credentials.
+            raise ValueError(
+                "Environment variable 'BEDROCK_EMBEDDING_MODEL' is improperly formatted. "
+                "Expected format: 'model_name,aws_access_key,aws_secret_key,region_name'."
+            ) from None
+        bedrock_client = boto3.client(
+            service_name="bedrock-runtime",
+            region_name=region_name.strip(),
+            aws_access_key_id=aws_access_key.strip(),
+            aws_secret_access_key=aws_secret_key.strip(),
+        )
+        return BedrockEmbeddings(
+            model_id=model_name.strip(),
+            client=bedrock_client
+        )
+    except Exception as e:
+        # Report through logging, like the rest of this module, then propagate.
+        logging.error(f"An unexpected error occurred: {e}")
+        raise
Original file line number	Diff line number	Diff line change
`@@ -89,7 +89,7 @@ def get_llm(model: str):`
`89`	`89`	`)`
`90`	`90`
`91`	`91`	`llm = ChatBedrock(`
`92`		`- client=bedrock_client, model_id=model_name, model_kwargs=dict(temperature=0)`
	`92`	`+ client=bedrock_client,region_name=region_name, model_id=model_name, model_kwargs=dict(temperature=0)`
`93`	`93`	`)`
`94`	`94`
`95`	`95`	`elif "ollama" in model:`