
Commit 62d0614

Merge branch 'STAGING' of https://github.com/neo4j-labs/llm-graph-builder into STAGING
2 parents: 527052c + 6998691


77 files changed: +832 additions, -602 deletions

README.md

Lines changed: 3 additions & 0 deletions
@@ -149,6 +149,9 @@ Allow unauthenticated request : Yes
 | VITE_GOOGLE_CLIENT_ID | Optional | | Client ID for Google authentication |
 | VITE_LLM_MODELS_PROD | Optional | openai_gpt_4o,openai_gpt_4o_mini,diffbot,gemini_1.5_flash | To Distinguish models based on the Enviornment PROD or DEV
 | VITE_LLM_MODELS | Optional | 'diffbot,openai_gpt_3.5,openai_gpt_4o,openai_gpt_4o_mini,gemini_1.5_pro,gemini_1.5_flash,azure_ai_gpt_35,azure_ai_gpt_4o,ollama_llama3,groq_llama3_70b,anthropic_claude_3_5_sonnet' | Supported Models For the application
+| VITE_AUTH0_CLIENT_ID | Mandatory if you are enabling Authentication otherwise it is optional | | Okta Oauth Client ID for authentication
+| VITE_AUTH0_DOMAIN | Mandatory if you are enabling Authentication otherwise it is optional | | Okta Oauth Cliend Domain
+| VITE_SKIP_AUTH | Optional | true | Flag to skip the authentication

 ## LLMs Supported
 1. OpenAI

backend/example.env

Lines changed: 5 additions & 1 deletion
@@ -44,4 +44,8 @@ LLM_MODEL_CONFIG_ollama_llama3="model_name,model_local_url"
 YOUTUBE_TRANSCRIPT_PROXY="https://user:pass@domain:port"
 EFFECTIVE_SEARCH_RATIO=5
 GRAPH_CLEANUP_MODEL="openai_gpt_4o"
-CHUNKS_TO_BE_PROCESSED="50"
+CHUNKS_TO_BE_CREATED="50"
+BEDROCK_EMBEDDING_MODEL="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.titan-embed-text-v1"
+LLM_MODEL_CONFIG_bedrock_nova_micro_v1="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.nova-micro-v1:0"
+LLM_MODEL_CONFIG_bedrock_nova_lite_v1="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.nova-lite-v1:0"
+LLM_MODEL_CONFIG_bedrock_nova_pro_v1="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.nova-pro-v1:0"
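
All of the new Bedrock entries follow the same four-field, comma-separated convention. A minimal sketch of how such a value can be unpacked on the backend (the helper name is illustrative; the commit's actual parsing of the embedding variable appears in get_bedrock_embeddings in common_fn.py below):

import os

def parse_bedrock_config(key: str):
    # Illustrative helper: unpack "model_name,aws_access_key,aws_secret_key,region_name"
    value = os.getenv(key)
    if not value:
        raise ValueError(f"Environment variable '{key}' is not set.")
    model_name, access_key, secret_key, region = [part.strip() for part in value.split(",")]
    return model_name, access_key, secret_key, region

# e.g. parse_bedrock_config("LLM_MODEL_CONFIG_bedrock_nova_micro_v1")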

backend/score.py

Lines changed: 72 additions & 60 deletions
Large diffs are not rendered by default.

backend/src/create_chunks.py

Lines changed: 12 additions & 6 deletions
@@ -4,6 +4,7 @@
 import logging
 from src.document_sources.youtube import get_chunks_with_timestamps, get_calculated_timestamps
 import re
+import os

 logging.basicConfig(format="%(asctime)s - %(message)s", level="INFO")

@@ -25,23 +26,28 @@ def split_file_into_chunks(self):
         """
         logging.info("Split file into smaller chunks")
         text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)
+        chunk_to_be_created = int(os.environ.get('CHUNKS_TO_BE_CREATED', '50'))
         if 'page' in self.pages[0].metadata:
             chunks = []
             for i, document in enumerate(self.pages):
                 page_number = i + 1
-                for chunk in text_splitter.split_documents([document]):
-                    chunks.append(Document(page_content=chunk.page_content, metadata={'page_number':page_number}))
+                if len(chunks) >= chunk_to_be_created:
+                    break
+                else:
+                    for chunk in text_splitter.split_documents([document]):
+                        chunks.append(Document(page_content=chunk.page_content, metadata={'page_number':page_number}))

         elif 'length' in self.pages[0].metadata:
             if len(self.pages) == 1 or (len(self.pages) > 1 and self.pages[1].page_content.strip() == ''):
                 match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*',self.pages[0].metadata['source'])
                 youtube_id=match.group(1)
                 chunks_without_time_range = text_splitter.split_documents([self.pages[0]])
-                chunks = get_calculated_timestamps(chunks_without_time_range, youtube_id)
-
+                chunks = get_calculated_timestamps(chunks_without_time_range[:chunk_to_be_created], youtube_id)
             else:
-                chunks_without_time_range = text_splitter.split_documents(self.pages)
-                chunks = get_chunks_with_timestamps(chunks_without_time_range)
+                chunks_without_time_range = text_splitter.split_documents(self.pages)
+                chunks = get_chunks_with_timestamps(chunks_without_time_range[:chunk_to_be_created])
         else:
             chunks = text_splitter.split_documents(self.pages)
+
+        chunks = chunks[:chunk_to_be_created]
         return chunks
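
The effect of the new CHUNKS_TO_BE_CREATED cap can be reproduced in isolation. A minimal sketch with made-up documents and a made-up limit of 5 (import paths follow current LangChain packages and may differ from the repo's):

import os
from langchain_core.documents import Document
from langchain_text_splitters import TokenTextSplitter

os.environ["CHUNKS_TO_BE_CREATED"] = "5"  # illustrative limit

pages = [Document(page_content="word " * 1000, metadata={"page": i}) for i in range(10)]
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)
chunk_to_be_created = int(os.environ.get("CHUNKS_TO_BE_CREATED", "50"))

chunks = []
for i, document in enumerate(pages):
    if len(chunks) >= chunk_to_be_created:   # per-page early exit, as added above
        break
    for chunk in text_splitter.split_documents([document]):
        chunks.append(Document(page_content=chunk.page_content, metadata={"page_number": i + 1}))

chunks = chunks[:chunk_to_be_created]        # unconditional final cap, as added above
print(len(chunks))                           # at most 5

Because the early exit is only checked between pages, a single page can push the list past the limit, which is why the final slice is applied before returning in every branch.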

backend/src/graphDB_dataAccess.py

Lines changed: 27 additions & 1 deletion
@@ -535,4 +535,30 @@ def update_node_relationship_count(self,document_name):
             "nodeCount" : nodeCount,
             "relationshipCount" : relationshipCount
         }
-        return response
+        return response
+
+    def get_nodelabels_relationships(self):
+        node_query = """
+            CALL db.labels() YIELD label
+            WITH label
+            WHERE NOT label IN ['Document', 'Chunk', '_Bloom_Perspective_', '__Community__', '__Entity__']
+            CALL apoc.cypher.run("MATCH (n:`" + label + "`) RETURN count(n) AS count",{}) YIELD value
+            WHERE value.count > 0
+            RETURN label order by label
+        """
+
+        relation_query = """
+            CALL db.relationshipTypes() yield relationshipType
+            WHERE NOT relationshipType IN ['PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_','FIRST_CHUNK','SIMILAR','IN_COMMUNITY','PARENT_COMMUNITY']
+            return relationshipType order by relationshipType
+        """
+
+        try:
+            node_result = self.execute_query(node_query)
+            node_labels = [record["label"] for record in node_result]
+            relationship_result = self.execute_query(relation_query)
+            relationship_types = [record["relationshipType"] for record in relationship_result]
+            return node_labels,relationship_types
+        except Exception as e:
+            print(f"Error in getting node labels/relationship types from db: {e}")
+            return []
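
A minimal usage sketch for the new helper (connection details are placeholders, Neo4jGraph from langchain_neo4j stands in for whatever graph wrapper the project actually passes to graphDBdataAccess, and the node query requires the APOC plugin):

from langchain_neo4j import Neo4jGraph
from src.graphDB_dataAccess import graphDBdataAccess

graph = Neo4jGraph(url="bolt://localhost:7687", username="neo4j", password="password")  # placeholders
graph_access = graphDBdataAccess(graph)

node_labels, relationship_types = graph_access.get_nodelabels_relationships()
print(node_labels)         # user-defined labels only; internal labels such as Chunk are filtered out
print(relationship_types)  # user-defined relationship types only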

backend/src/llm.py

Lines changed: 1 addition & 1 deletion
@@ -89,7 +89,7 @@ def get_llm(model: str):
         )

         llm = ChatBedrock(
-            client=bedrock_client, model_id=model_name, model_kwargs=dict(temperature=0)
+            client=bedrock_client,region_name=region_name, model_id=model_name, model_kwargs=dict(temperature=0)
         )

     elif "ollama" in model:

backend/src/main.py

Lines changed: 1 addition & 2 deletions
@@ -361,7 +361,6 @@ async def processing_source(uri, userName, password, database, model, file_name,

     logging.info('Update the status as Processing')
     update_graph_chunk_processed = int(os.environ.get('UPDATE_GRAPH_CHUNKS_PROCESSED'))
-    chunk_to_be_processed = int(os.environ.get('CHUNKS_TO_BE_PROCESSED', '50'))
     # selected_chunks = []
     is_cancelled_status = False
     job_status = "Completed"
@@ -676,7 +675,7 @@ def get_labels_and_relationtypes(graph):
     query = """
         RETURN collect {
         CALL db.labels() yield label
-        WHERE NOT label IN ['Chunk','_Bloom_Perspective_', '__Community__', '__Entity__']
+        WHERE NOT label IN ['Document','Chunk','_Bloom_Perspective_', '__Community__', '__Entity__']
         return label order by label limit 100 } as labels,
         collect {
         CALL db.relationshipTypes() yield relationshipType as type

backend/src/post_processing.py

Lines changed: 26 additions & 40 deletions
@@ -8,7 +8,8 @@
 from langchain_core.prompts import ChatPromptTemplate
 from src.shared.constants import GRAPH_CLEANUP_PROMPT
 from src.llm import get_llm
-from src.main import get_labels_and_relationtypes
+from src.graphDB_dataAccess import graphDBdataAccess
+import time

 DROP_INDEX_QUERY = "DROP INDEX entities IF EXISTS;"
 LABELS_QUERY = "CALL db.labels()"
@@ -195,50 +196,35 @@ def update_embeddings(rows, graph):
     return graph.query(query,params={'rows':rows})

 def graph_schema_consolidation(graph):
-    nodes_and_relations = get_labels_and_relationtypes(graph)
-    logging.info(f"nodes_and_relations in existing graph : {nodes_and_relations}")
-    node_labels = []
-    relation_labels = []
-
-    node_labels.extend(nodes_and_relations[0]['labels'])
-    relation_labels.extend(nodes_and_relations[0]['relationshipTypes'])
-
+    graphDb_data_Access = graphDBdataAccess(graph)
+    node_labels,relation_labels = graphDb_data_Access.get_nodelabels_relationships()
     parser = JsonOutputParser()
-    prompt = ChatPromptTemplate(messages=[("system",GRAPH_CLEANUP_PROMPT),("human", "{input}")],
-                                partial_variables={"format_instructions": parser.get_format_instructions()})
-
-    graph_cleanup_model = os.getenv("GRAPH_CLEANUP_MODEL",'openai_gpt_4o')
+    prompt = ChatPromptTemplate(
+        messages=[("system", GRAPH_CLEANUP_PROMPT), ("human", "{input}")],
+        partial_variables={"format_instructions": parser.get_format_instructions()}
+    )
+    graph_cleanup_model = os.getenv("GRAPH_CLEANUP_MODEL", 'openai_gpt_4o')
     llm, _ = get_llm(graph_cleanup_model)
     chain = prompt | llm | parser
-    nodes_dict = chain.invoke({'input':node_labels})
-    relation_dict = chain.invoke({'input':relation_labels})
-
-    node_match = {}
-    relation_match = {}
-    for new_label , values in nodes_dict.items() :
-        for old_label in values:
-            if new_label != old_label:
-                node_match[old_label]=new_label
-
-    for new_label , values in relation_dict.items() :
-        for old_label in values:
-            if new_label != old_label:
-                relation_match[old_label]=new_label

-    logging.info(f"updated node labels : {node_match}")
-    logging.info(f"updated relationship labels : {relation_match}")
-
-    # Update node labels in graph
-    for old_label, new_label in node_match.items():
-        query = f"""
-        MATCH (n:`{old_label}`)
-        SET n:`{new_label}`
-        REMOVE n:`{old_label}`
-        """
-        graph.query(query)
+    nodes_relations_input = {'nodes': node_labels, 'relationships': relation_labels}
+    mappings = chain.invoke({'input': nodes_relations_input})
+    node_mapping = {old: new for new, old_list in mappings['nodes'].items() for old in old_list if new != old}
+    relation_mapping = {old: new for new, old_list in mappings['relationships'].items() for old in old_list if new != old}
+
+    logging.info(f"Node Labels: Total = {len(node_labels)}, Reduced to = {len(set(node_mapping.values()))} (from {len(node_mapping)})")
+    logging.info(f"Relationship Types: Total = {len(relation_labels)}, Reduced to = {len(set(relation_mapping.values()))} (from {len(relation_mapping)})")
+
+    if node_mapping:
+        for old_label, new_label in node_mapping.items():
+            query = f"""
+            MATCH (n:`{old_label}`)
+            SET n:`{new_label}`
+            REMOVE n:`{old_label}`
+            """
+            graph.query(query)

-    # Update relation types in graph
-    for old_label, new_label in relation_match.items():
+    for old_label, new_label in relation_mapping.items():
         query = f"""
         MATCH (n)-[r:`{old_label}`]->(m)
         CREATE (n)-[r2:`{new_label}`]->(m)
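
The rewritten consolidation makes a single LLM call and then inverts the returned {category: [originals]} structure into old-to-new rename maps. A standalone sketch of that inversion with a toy response (values invented for illustration):

# Toy LLM output in the shape the new GRAPH_CLEANUP_PROMPT requests.
mappings = {
    "nodes": {"Person": ["Person", "Human", "People"], "Product": ["Product"]},
    "relationships": {"CREATED": ["CREATED_FOR", "CREATED_TO", "CREATED"]},
}

# Same comprehensions as above: invert to {old_label: new_label}, dropping self-mappings.
node_mapping = {old: new for new, old_list in mappings["nodes"].items() for old in old_list if new != old}
relation_mapping = {old: new for new, old_list in mappings["relationships"].items() for old in old_list if new != old}

print(node_mapping)      # {'Human': 'Person', 'People': 'Person'}
print(relation_mapping)  # {'CREATED_FOR': 'CREATED', 'CREATED_TO': 'CREATED'}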

backend/src/shared/common_fn.py

Lines changed: 41 additions & 2 deletions
@@ -11,7 +11,8 @@
 import os
 from pathlib import Path
 from urllib.parse import urlparse
-
+import boto3
+from langchain_community.embeddings import BedrockEmbeddings

 def check_url_source(source_type, yt_url:str=None, wiki_query:str=None):
     language=''
@@ -77,6 +78,10 @@ def load_embedding_model(embedding_model_name: str):
         )
         dimension = 768
         logging.info(f"Embedding: Using Vertex AI Embeddings , Dimension:{dimension}")
+    elif embedding_model_name == "titan":
+        embeddings = get_bedrock_embeddings()
+        dimension = 1536
+        logging.info(f"Embedding: Using bedrock titan Embeddings , Dimension:{dimension}")
     else:
         embeddings = HuggingFaceEmbeddings(
             model_name="all-MiniLM-L6-v2"#, cache_folder="/embedding_model"
@@ -134,4 +139,38 @@ def last_url_segment(url):
     parsed_url = urlparse(url)
     path = parsed_url.path.strip("/") # Remove leading and trailing slashes
     last_url_segment = path.split("/")[-1] if path else parsed_url.netloc.split(".")[0]
-    return last_url_segment
+    return last_url_segment
+
+def get_bedrock_embeddings():
+    """
+    Creates and returns a BedrockEmbeddings object using the specified model name.
+    Args:
+        model (str): The name of the model to use for embeddings.
+    Returns:
+        BedrockEmbeddings: An instance of the BedrockEmbeddings class.
+    """
+    try:
+        env_value = os.getenv("BEDROCK_EMBEDDING_MODEL")
+        if not env_value:
+            raise ValueError("Environment variable 'BEDROCK_EMBEDDING_MODEL' is not set.")
+        try:
+            model_name, aws_access_key, aws_secret_key, region_name = env_value.split(",")
+        except ValueError:
+            raise ValueError(
+                "Environment variable 'BEDROCK_EMBEDDING_MODEL' is improperly formatted. "
+                "Expected format: 'model_name,aws_access_key,aws_secret_key,region_name'."
+            )
+        bedrock_client = boto3.client(
+            service_name="bedrock-runtime",
+            region_name=region_name.strip(),
+            aws_access_key_id=aws_access_key.strip(),
+            aws_secret_access_key=aws_secret_key.strip(),
+        )
+        bedrock_embeddings = BedrockEmbeddings(
+            model_id=model_name.strip(),
+            client=bedrock_client
+        )
+        return bedrock_embeddings
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        raise
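
A usage sketch for the new "titan" branch, assuming load_embedding_model returns the (embeddings, dimension) pair it builds (credential values are placeholders, formatted as in backend/example.env above):

import os
from src.shared.common_fn import load_embedding_model

os.environ["BEDROCK_EMBEDDING_MODEL"] = "amazon.titan-embed-text-v1,<aws_access_key>,<aws_secret_key>,us-east-1"  # placeholders

embeddings, dimension = load_embedding_model("titan")  # assumed return value: (embeddings, dimension)
print(dimension)                                       # 1536
vector = embeddings.embed_query("knowledge graphs")    # BedrockEmbeddings call to the Titan model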

backend/src/shared/constants.py

Lines changed: 55 additions & 20 deletions
@@ -831,27 +831,62 @@
 DELETE_ENTITIES_AND_START_FROM_BEGINNING = "delete_entities_and_start_from_beginning"
 START_FROM_LAST_PROCESSED_POSITION = "start_from_last_processed_position"

-GRAPH_CLEANUP_PROMPT = """Please consolidate the following list of types into a smaller set of more general, semantically
-related types. The consolidated types must be drawn from the original list; do not introduce new types.
-Return a JSON object representing the mapping of original types to consolidated types. Every key is the consolidated type
-and value is list of the original types that were merged into the consolidated type. Prioritize using the most generic and
-repeated term when merging. If a type doesn't merge with any other type, it should still be included in the output,
-mapped to itself.
-
-**Input:** A list of strings representing the types to be consolidated. These types may represent either node
-labels or relationship labels Your algorithm should do appropriate groupings based on semantic similarity.
-
-Example 1:
-Input:
-[ "Person", "Human", "People", "Company", "Organization", "Product"]
-Output :
-[Person": ["Person", "Human", "People"], Organization": ["Company", "Organization"], Product": ["Product"]]
-
-Example 2:
-Input :
-["CREATED_FOR", "CREATED_TO", "CREATED", "PLACE", "LOCATION", "VENUE"]
+GRAPH_CLEANUP_PROMPT = """
+You are tasked with organizing a list of types into semantic categories based on their meanings, including synonyms or morphological similarities. The input will include two separate lists: one for **Node Labels** and one for **Relationship Types**. Follow these rules strictly:
+### 1. Input Format
+The input will include two keys:
+- `nodes`: A list of node labels.
+- `relationships`: A list of relationship types.
+### 2. Grouping Rules
+- Group similar items into **semantic categories** based on their meaning or morphological similarities.
+- The name of each category must be chosen from the types in the input list (node labels or relationship types). **Do not create or infer new names for categories**.
+- Items that cannot be grouped must remain in their own category.
+### 3. Naming Rules
+- The category name must reflect the grouped items and must be an existing type in the input list.
+- Use a widely applicable type as the category name.
+- **Do not introduce new names or types** under any circumstances.
+### 4. Output Rules
+- Return the output as a JSON object with two keys:
+  - `nodes`: A dictionary where each key represents a category name for nodes, and its value is a list of original node labels in that category.
+  - `relationships`: A dictionary where each key represents a category name for relationships, and its value is a list of original relationship types in that category.
+- Every key and value must come from the provided input lists.
+### 5. Examples
+#### Example 1:
+Input:
+{{
+  "nodes": ["Person", "Human", "People", "Company", "Organization", "Product"],
+  "relationships": ["CREATED_FOR", "CREATED_TO", "CREATED", "PUBLISHED","PUBLISHED_BY", "PUBLISHED_IN", "PUBLISHED_ON"]
+}}
+Output in JSON:
+{{
+  "nodes": {{
+    "Person": ["Person", "Human", "People"],
+    "Organization": ["Company", "Organization"],
+    "Product": ["Product"]
+  }},
+  "relationships": {{
+    "CREATED": ["CREATED_FOR", "CREATED_TO", "CREATED"],
+    "PUBLISHED": ["PUBLISHED_BY", "PUBLISHED_IN", "PUBLISHED_ON"]
+  }}
+}}
+#### Example 2: Avoid redundant or incorrect grouping
+Input:
+{{
+  "nodes": ["Process", "Process_Step", "Step", "Procedure", "Method", "Natural Process", "Step"],
+  "relationships": ["USED_FOR", "USED_BY", "USED_WITH", "USED_IN"]
+}}
 Output:
-["CREATED": ["CREATED_FOR", "CREATED_TO", "CREATED"],"PLACE": ["PLACE", "LOCATION", "VENUE"]]
+{{
+  "nodes": {{
+    "Process": ["Process", "Process_Step", "Step", "Procedure", "Method", "Natural Process"]
+  }},
+  "relationships": {{
+    "USED": ["USED_FOR", "USED_BY", "USED_WITH", "USED_IN"]
+  }}
+}}
+### 6. Key Rule
+If any item cannot be grouped, it must remain in its own category using its original name. Do not repeat values or create incorrect mappings.
+Use these rules to group and name categories accurately without introducing errors or new types.
 """

 ADDITIONAL_INSTRUCTIONS = """Your goal is to identify and categorize entities while ensuring that specific data
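
One presentation detail worth noting: the doubled braces in the new prompt are there because the string is rendered through ChatPromptTemplate, where single braces mark template variables. A minimal sketch, separate from the commit, showing how the escaping behaves:

from langchain_core.prompts import ChatPromptTemplate

demo = ChatPromptTemplate(
    messages=[("system", 'Example output: {{"nodes": {{}} }}'), ("human", "{input}")]
)
print(demo.format_messages(input="['Person', 'Human']")[0].content)
# -> Example output: {"nodes": {} }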
