
Commit 218e3f9

Merge branch 'DEV' into create-chatbot-component
2 parents: cc5adaf + 854faf1

File tree

10 files changed: 149 additions and 49 deletions

README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -114,7 +114,7 @@ KNN_MIN_SCORE = ""\
 https://github.com/neo4j-labs/llm-graph-builder/assets/121786590/b725a503-6ade-46d2-9e70-61d57443c311

 ## Links
-The Public [ Google cloud Run URL](https://frontend-dcavk67s4a-uc.a.run.app).
+The Public [ Google cloud Run URL](https://prod-frontend-dcavk67s4a-uc.a.run.app).
 [Workspace URL](https://workspace-preview.neo4j.io/workspace)

```

backend/score.py

Lines changed: 5 additions & 2 deletions

```diff
@@ -144,12 +144,15 @@ async def get_source_list(uri:str,
     return result

 @app.post("/update_similarity_graph")
-async def update_similarity_graph():
+async def update_similarity_graph(uri=Form(None),
+                                  userName=Form(None),
+                                  password=Form(None),
+                                  database=Form(None)):
     """
     Calls 'update_graph' which post the query to update the similiar nodes in the graph
     """

-    result = await asyncio.to_thread(update_graph)
+    result = await asyncio.to_thread(update_graph,uri,userName,password,database)
     return result

 def decode_password(pwd):
```
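The endpoint change threads the caller's Neo4j connection details through to the blocking `update_graph` call via `asyncio.to_thread`, keeping the FastAPI event loop free. A minimal sketch of the pattern (the `slow_update` stand-in and its return value are illustrative, not the project's actual implementation):

```python
import asyncio
from fastapi import FastAPI, Form

app = FastAPI()

def slow_update(uri, user_name, password, database):
    # Stand-in for the blocking update_graph call.
    return {"status": "updated", "database": database}

@app.post("/update_similarity_graph")
async def update_similarity_graph(uri=Form(None), userName=Form(None),
                                  password=Form(None), database=Form(None)):
    # asyncio.to_thread runs the blocking function in a worker thread,
    # so the event loop keeps serving other requests meanwhile.
    return await asyncio.to_thread(slow_update, uri, userName, password, database)
```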

backend/src/diffbot_transformer.py

Lines changed: 10 additions & 4 deletions

```diff
@@ -5,6 +5,7 @@
 from typing import List
 import os
 import logging
+import uuid

 logging.basicConfig(format='%(asctime)s - %(message)s',level='INFO')
 def extract_graph_from_diffbot(graph: Neo4jGraph,
@@ -31,16 +32,21 @@ def extract_graph_from_diffbot(graph: Neo4jGraph,
     graph_document_list = []

     logging.info(f"create relationship between source,chunk and entity nodes created from Diffbot")
+    current_chunk_id = ''
+    relationship_cypher_list = []
     for i,chunk in enumerate(chunks):
+        previous_chunk_id = current_chunk_id
+        current_chunk_id = str(uuid.uuid1())
         if i == 0:
             firstChunk = True
         else:
             firstChunk = False
         graph_document = diffbot_nlp.convert_to_graph_documents([chunk])
         graph.add_graph_documents(graph_document)
-        create_source_chunk_entity_relationship(file_name,graph,graph_document,chunk,uri,userName,password,firstChunk)
-        graph_document_list.append(graph_document[0])
+        lst_cypher_queries_chunk_relationship = create_source_chunk_entity_relationship(file_name,graph,graph_document,chunk,uri,userName,password,firstChunk,current_chunk_id,
+                                                                                        previous_chunk_id)
+        graph_document_list.append(graph_document[0])
+        relationship_cypher_list.extend(lst_cypher_queries_chunk_relationship)

     graph.refresh_schema()
-    return graph_document_list
-
+    return graph_document_list, relationship_cypher_list
```
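The loop now carries a uuid1-based id from one iteration to the next, so the previous chunk is known when linking consecutive chunks. A stripped-down sketch of the chaining (the chunk texts and collected strings are illustrative placeholders for the real Cypher):

```python
import uuid

chunks = ["first chunk", "second chunk", "third chunk"]  # illustrative
relationship_cypher_list = []
current_chunk_id = ''
for i, chunk in enumerate(chunks):
    previous_chunk_id = current_chunk_id      # empty string on the first pass
    current_chunk_id = str(uuid.uuid1())      # fresh id for this chunk
    if i == 0:
        relationship_cypher_list.append(f"FIRST_CHUNK -> {current_chunk_id}")
    else:
        relationship_cypher_list.append(f"{previous_chunk_id} -NEXT_CHUNK-> {current_chunk_id}")
```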

backend/src/main.py

Lines changed: 36 additions & 14 deletions

```diff
@@ -20,6 +20,10 @@
 from langchain_community.document_loaders import YoutubeLoader
 from langchain_community.document_loaders import WikipediaLoader
 import warnings
+from pytube import YouTube
+from youtube_transcript_api import YouTubeTranscriptApi
+import sys
+
 warnings.filterwarnings("ignore")

 load_dotenv()
@@ -215,9 +219,17 @@ def create_source_node_graph_url(uri, userName, password, source_url ,model, db_
         # match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", source_url)
         match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*',source_url)
         logging.info(f"match value{match}")
-        youtube_id=match.group(1)
-        file_name=youtube_id.strip()
-        file_size=''
+        file_name = YouTube(source_url).title
+        transcript= get_youtube_transcript(match.group(1))
+        if transcript==None or len(transcript)==0:
+            file_size=''
+            job_status = "Failed"
+            message = f"Youtube transcript is not available for : {file_name}"
+            error_message = str(e)
+            logging.exception(f'Exception Stack trace:')
+            return create_api_response(job_status,message=message,error=error_message,file_source=source_type)
+        else:
+            file_size=sys.getsizeof(transcript)
         file_type='text'
         aws_access_key_id=''
         job_status = "Completed"
@@ -232,7 +244,15 @@ def create_source_node_graph_url(uri, userName, password, source_url ,model, db_
         error_message = str(e)
         logging.exception(f'Exception Stack trace:')
         return create_api_response(job_status,message=message,error=error_message,file_source=source_type)
-
+
+
+def get_youtube_transcript(youtube_id):
+    transcript_dict = YouTubeTranscriptApi.get_transcript(youtube_id)
+    transcript=''
+    for td in transcript_dict:
+        transcript += ''.join(td['text'])
+    return transcript
+
+
 def file_into_chunks(pages: List[Document]):
     """
     Split a list of documents(file pages) into chunks of fixed size.
```
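The new `get_youtube_transcript` helper concatenates the `text` fields returned by `youtube_transcript_api`; the segments are joined without separators, so words at segment boundaries can run together. (Note also that `error_message = str(e)` in the no-transcript branch above references an `e` that is not defined in that scope.) A condensed sketch of the helper as it appears in the diff (the video id in the comment is hypothetical):

```python
from youtube_transcript_api import YouTubeTranscriptApi

def get_youtube_transcript(youtube_id):
    # Each entry is a dict with 'text', 'start' and 'duration' keys.
    transcript_dict = YouTubeTranscriptApi.get_transcript(youtube_id)
    return ''.join(td['text'] for td in transcript_dict)

# transcript = get_youtube_transcript('dQw4w9WgXcQ')  # hypothetical id
```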
```diff
@@ -362,18 +382,19 @@ def extract_graph_from_file(uri, userName, password, model, db_name=None, file=N

     logging.info("Get graph document list from models")
     if model == 'Diffbot' :
-        graph_documents = extract_graph_from_diffbot(graph,chunks,file_name,uri,userName,password)
+        graph_documents, cypher_list = extract_graph_from_diffbot(graph,chunks,file_name,uri,userName,password)

     elif model == 'OpenAI GPT 3.5':
         model_version = 'gpt-3.5-turbo-16k'
-        graph_documents = extract_graph_from_OpenAI(model_version,graph,chunks,file_name,uri,userName,password)
+        graph_documents, cypher_list = extract_graph_from_OpenAI(model_version,graph,chunks,file_name,uri,userName,password)

     elif model == 'OpenAI GPT 4':
         model_version = 'gpt-4-0125-preview'
-        graph_documents = extract_graph_from_OpenAI(model_version,graph,chunks,file_name,uri,userName,password)
+        graph_documents, cypher_list = extract_graph_from_OpenAI(model_version,graph,chunks,file_name,uri,userName,password)

-    #update_similarity_graph for the KNN Graph
-    update_graph(graph)
+    #create relation between chunks (FIRST_CHUNK and NEXT_CHUNK)
+    for query in cypher_list:
+        graph.query(query)

     distinct_nodes = set()
     relations = []
@@ -455,9 +476,10 @@ def get_documents_from_youtube(url):
                                     translation = "en",
                                     add_video_info=True)
         pages = youtube_loader.load()
-        match = re.search(r"v=([a-zA-Z0-9_-]+)", url)
-        youtube_id=match.group(1)
-        file_name=youtube_id
+        # match = re.search(r"v=([a-zA-Z0-9_-]+)", url)
+        # youtube_id=match.group(1)
+        # file_name=youtube_id
+        file_name = YouTube(url).title
         file_key=file_name
         return file_name, file_key, pages
     except Exception as e:
@@ -498,15 +520,15 @@ def get_source_list_from_graph(uri,userName,password,db_name=None):
         logging.exception(f'Exception:{error_message}')
         return create_api_response(job_status,message=message,error=error_message)

-def update_graph(graph):
+def update_graph(uri,userName,password,db_name):
    """
    Update the graph node with SIMILAR relationship where embedding scrore match
    """
    try:
        knn_min_score = os.environ.get('KNN_MIN_SCORE')

        query = "WHERE node <> c and score >= {} MERGE (c)-[rel:SIMILAR]-(node) SET rel.score = score"
-       # graph = Neo4jGraph()
+       graph = Neo4jGraph(url=uri, database=db_name, username=userName, password=password)
        result = graph.query("""MATCH (c:Chunk)
                            WHERE c.embedding IS NOT NULL AND count { (c)-[:SIMILAR]-() } < 5
                            CALL db.index.vector.queryNodes('vector', 6, c.embedding) yield node, score """+ query.format(knn_min_score))
```
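`update_graph` now builds its own `Neo4jGraph` connection from the supplied credentials before running the KNN query, which links each under-connected `Chunk` to its nearest embedding neighbours above `KNN_MIN_SCORE`. A condensed sketch of the same logic (the fallback score of 0.8 is an assumption, not from the diff):

```python
import os
from langchain_community.graphs import Neo4jGraph

def update_graph(uri, userName, password, db_name):
    # Connect with caller-supplied credentials instead of environment defaults.
    graph = Neo4jGraph(url=uri, database=db_name, username=userName, password=password)
    knn_min_score = os.environ.get('KNN_MIN_SCORE', '0.8')  # assumed fallback
    # Link each under-connected chunk to its nearest embedding neighbours.
    graph.query(
        "MATCH (c:Chunk) "
        "WHERE c.embedding IS NOT NULL AND count { (c)-[:SIMILAR]-() } < 5 "
        "CALL db.index.vector.queryNodes('vector', 6, c.embedding) YIELD node, score "
        "WHERE node <> c AND score >= " + knn_min_score + " "
        "MERGE (c)-[rel:SIMILAR]-(node) SET rel.score = score"
    )
```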

backend/src/make_relationships.py

Lines changed: 16 additions & 10 deletions

```diff
@@ -15,7 +15,9 @@ def create_source_chunk_entity_relationship(source_file_name :str,
                                             uri : str,
                                             userName : str,
                                             password : str,
-                                            isFirstChunk : bool):
+                                            isFirstChunk : bool,
+                                            current_chunk_id:uuid,
+                                            previous_chunk_id:uuid)-> list:
     """ Create relationship between source, chunk and entity nodes
     Args:
         source_file_name (str): file name of input source
@@ -26,14 +28,16 @@ def create_source_chunk_entity_relationship(source_file_name :str,
         userName: Username to use for graph creation ( if None will use username from config file )
         password: Password to use for graph creation ( if None will use password from config file )
         isFirstChunk : It's bool value to create FIRST_CHUNK AND NEXT_CHUNK relationship between chunk and document node.
+        current_chunk_id : Unique id of chunk
+        previous_chunk_id : Unique id of previous chunk
     """
     source_node = 'fileName: "{}"'
+    lst_cypher_queries_chunk_relationship = []
     # logging.info(f'Graph Document print{graph_document}')
     # openai_api_key = os.environ.get('OPENAI_API_KEY')
     embedding_model = os.environ.get('EMBEDDING_MODEL')
     isEmbedding = os.environ.get('IS_EMBEDDING')

-    chunk_uuid = str(uuid.uuid1())
     chunk_node_id_set = 'id:"{}"'
     update_chunk_node_prop = ' SET c.text = "{}"'
     if isEmbedding:
@@ -43,27 +47,29 @@ def create_source_chunk_entity_relationship(source_file_name :str,
             url=uri,
             username=userName,
             password=password,
-            ids=[chunk_uuid]
+            ids=[current_chunk_id]
         )
     else:
-        graph.query('CREATE(c:Chunk {id:"'+ chunk_uuid+'"})' + update_chunk_node_prop.format(chunk.page_content))
+        graph.query('MERGE(c:Chunk {id:"'+ current_chunk_id+'"})' + update_chunk_node_prop.format(chunk.page_content))

     logging.info("make PART_OF relationship between chunk node and document node")
-    graph.query('MATCH(d:Document {'+source_node.format(source_file_name)+'}) ,(c:Chunk {'+chunk_node_id_set.format(chunk_uuid)+'}) CREATE (c)-[:PART_OF]->(d)')
+    graph.query('MATCH(d:Document {'+source_node.format(source_file_name)+'}) ,(c:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}) MERGE (c)-[:PART_OF]->(d)')

-    logging.info("make FIRST_CHUNK, NEXT_CHUNK relationship between chunk node and document node")
+    # logging.info("make FIRST_CHUNK, NEXT_CHUNK relationship between chunk node and document node")
     if isFirstChunk:
-        graph.query('MATCH(d:Document {'+source_node.format(source_file_name)+'}) ,(c:Chunk {'+chunk_node_id_set.format(chunk_uuid)+'}) CREATE (d)-[:FIRST_CHUNK]->(c)')
+        lst_cypher_queries_chunk_relationship.append('MATCH(d:Document {'+source_node.format(source_file_name)+'}) ,(c:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}) MERGE (d)-[:FIRST_CHUNK]->(c)')
+        # graph.query('MATCH(d:Document {'+source_node.format(source_file_name)+'}) ,(c:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}) CREATE (d)-[:FIRST_CHUNK]->(c)')
     else:
-        graph.query('MATCH(d:Document {'+source_node.format(source_file_name)+'}) ,(c:Chunk {'+chunk_node_id_set.format(chunk_uuid)+'}) CREATE (d)-[:NEXT_CHUNK]->(c)')
+        lst_cypher_queries_chunk_relationship.append('MATCH(pc:Chunk {'+chunk_node_id_set.format(previous_chunk_id)+'}) ,(cc:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}) MERGE (pc)-[:NEXT_CHUNK]->(cc)')
+        # graph.query('MATCH(pc:Chunk {'+chunk_node_id_set.format(previous_chunk_id)+'}) ,(cc:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}) CREATE (pc)-[:NEXT_CHUNK]->(cc)')
     # dict = {}
     # nodes_list = []
     for node in graph_document[0].nodes:
         node_id = node.id
-        result = graph.query('MATCH(c:Chunk {'+chunk_node_id_set.format(chunk_uuid)+'}), (n:'+ node.type +'{ id: "'+node_id+'"}) CREATE (c)-[:HAS_ENTITY]->(n)')
+        result = graph.query('MATCH(c:Chunk {'+chunk_node_id_set.format(current_chunk_id)+'}), (n:'+ node.type +'{ id: "'+node_id+'"}) MERGE (c)-[:HAS_ENTITY]->(n)')
         # json_obj = {'node_id': node_id, 'node_type' : node.type, 'uuid' : chunk_uuid}
         # nodes_list.append(json_obj)
-
+    return lst_cypher_queries_chunk_relationship
     # dict['chunk_doc'] = chunk.page_content
     # dict['rel_chunk_entity_node'] = nodes_list
     # dict['nodes_created_in_chunk'] = len(graph_document[0].nodes)
```
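Switching the relationship queries from `CREATE` to `MERGE` makes re-ingestion idempotent: `MERGE` matches an existing pattern and only creates it when absent, where `CREATE` would add a duplicate edge on every run. An illustrative pair of queries (the file name and chunk id are made up):

```python
# Both queries match the same Document and Chunk (names are hypothetical).
match = 'MATCH (d:Document {fileName: "example.pdf"}), (c:Chunk {id: "abc-123"}) '

create_query = match + 'CREATE (c)-[:PART_OF]->(d)'  # adds a new edge every run
merge_query  = match + 'MERGE (c)-[:PART_OF]->(d)'   # creates the edge only if absent

# graph.query(merge_query)  # safe to repeat; create_query would duplicate edges
```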

backend/src/openAI_llm.py

Lines changed: 36 additions & 9 deletions

```diff
@@ -25,6 +25,7 @@
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
 import threading
+import uuid

 load_dotenv()
 logging.basicConfig(format='%(asctime)s - %(message)s',level='INFO')
@@ -190,6 +191,13 @@ def extract_and_store_graph(
                             model_version,
                             graph: Neo4jGraph,
                             document: Document,
+                            file_name: str,
+                            uri: str,
+                            userName:str,
+                            password:str,
+                            firstChunk:bool,
+                            current_chunk_id:uuid,
+                            previous_chunk_id:uuid,
                             nodes:Optional[List[str]] = None,
                             rels:Optional[List[str]]=None) -> None:
@@ -198,10 +206,18 @@ def extract_and_store_graph(
     store the result into a Neo4jGraph.

     Args:
-        graph: Neo4j graph to store the data into
-        document: Langchain document to extract data from
-        nodes: List of nodes to extract ( default : None )
-        rels: List of relationships to extract ( default : None )
+        model_version: LLM model version
+        graph: Neo4j graph to store the data into
+        document: Langchain document to extract data from
+        file_name (str): file name of input source
+        uri: URI of the graph to extract
+        userName: Username to use for graph creation ( if None will use username from config file )
+        password: Password to use for graph creation ( if None will use password from config file )
+        firstChunk : It's bool value to create FIRST_CHUNK AND NEXT_CHUNK relationship between chunk and document node.
+        current_chunk_id : Unique id of chunk
+        previous_chunk_id : Unique id of previous chunk
+        nodes: List of nodes to extract ( default : None )
+        rels: List of relationships to extract ( default : None )

     Returns:
         The GraphDocument that was extracted and stored into the Neo4jgraph
@@ -221,7 +237,9 @@ def extract_and_store_graph(
     )]

     graph.add_graph_documents(graph_document)
-    return graph_document
+    lst_cypher_queries_chunk_relationship = create_source_chunk_entity_relationship(file_name,graph,graph_document,document,uri,userName,password,firstChunk,current_chunk_id,
+                                                                                    previous_chunk_id)
+    return graph_document, lst_cypher_queries_chunk_relationship


 def extract_graph_from_OpenAI(model_version,
@@ -248,19 +266,28 @@ def extract_graph_from_OpenAI(model_version,
     """
     openai_api_key = os.environ.get('OPENAI_API_KEY')
     graph_document_list = []
+    relationship_cypher_list = []
     futures=[]
     logging.info(f"create relationship between source,chunk and entity nodes created from {model_version}")

     with ThreadPoolExecutor(max_workers=10) as executor:
+        current_chunk_id= ''
         for i, chunk_document in tqdm(enumerate(chunks), total=len(chunks)):
+            previous_chunk_id = current_chunk_id
+            current_chunk_id = str(uuid.uuid1())
+            position = i+1
             if i == 0:
                 firstChunk = True
             else:
                 firstChunk = False
-            futures.append(executor.submit(extract_and_store_graph,model_version,graph,chunk_document))
+            metadata = {"position": position,"length": len(chunk_document.page_content)}
+            chunk_document = Document(page_content=chunk_document.page_content,metadata = metadata)
+
+            futures.append(executor.submit(extract_and_store_graph,model_version,graph,chunk_document,file_name,uri,userName,password,firstChunk,current_chunk_id,previous_chunk_id))
         for future in concurrent.futures.as_completed(futures):
-            graph_document = future.result()
-            create_source_chunk_entity_relationship(file_name,graph,graph_document,chunk_document,uri,userName,password,firstChunk)
+            graph_document,lst_cypher_queries_chunk_relationship = future.result()
+
             graph_document_list.append(graph_document[0])
+            relationship_cypher_list.extend(lst_cypher_queries_chunk_relationship)

-    return graph_document_list
+    return graph_document_list, relationship_cypher_list
```
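Because `as_completed` yields futures in completion order rather than submission order, each submitted task now carries its own chunk context (ids, firstChunk flag) instead of relying on loop variables after the loop has moved on. A minimal sketch of the submit-then-collect pattern (the `process` stand-in and its outputs are illustrative):

```python
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

def process(chunk_id, text):
    # Stand-in for extract_and_store_graph: returns the graph document
    # plus the follow-up relationship queries for this chunk.
    return f"doc-{chunk_id}", [f"MERGE ... chunk {chunk_id}"]

chunks = ["a", "b", "c"]  # illustrative
docs, queries, futures = [], [], []
with ThreadPoolExecutor(max_workers=10) as executor:
    for i, text in enumerate(chunks):
        futures.append(executor.submit(process, i, text))
    # Futures complete in arbitrary order, so each result must be
    # self-contained rather than read from loop state afterwards.
    for future in concurrent.futures.as_completed(futures):
        doc, qs = future.result()
        docs.append(doc)
        queries.extend(qs)
```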

frontend/src/HOC/CustomModal.tsx

Lines changed: 2 additions & 1 deletion

```diff
@@ -11,6 +11,7 @@ const CustomModal: React.FC<CustomModalProps> = ({
   status,
   setStatus,
 }) => {
+  const isDisabled = status === 'danger' || status === 'info' || status === 'warning' || status === 'success';
   return (
     <Dialog
       size='small'
@@ -36,7 +37,7 @@ const CustomModal: React.FC<CustomModalProps> = ({
       <Button color='neutral' fill='outlined' onClick={onClose} size='medium'>
         Cancel
       </Button>
-      <Button onClick={submitHandler} size='medium'>
+      <Button onClick={submitHandler} size='medium' disabled={isDisabled}>
         {submitLabel}
       </Button>
     </Dialog.Actions>
```

frontend/src/components/Content.tsx

Lines changed: 11 additions & 7 deletions

```diff
@@ -9,7 +9,9 @@ import { useFileContext } from '../context/UsersFiles';
 import CustomAlert from './Alert';
 import { extractAPI } from '../utils/FileAPI';
 import { ContentProps } from '../types';
+import { updateGraphAPI } from '../services/UpdateGraph';
 const Content: React.FC<ContentProps> = ({ isExpanded, showChatBot, openChatBot }) => {
+
   const [init, setInit] = useState<boolean>(false);
   const [openConnection, setOpenConnection] = useState<boolean>(false);
   const [connectionStatus, setConnectionStatus] = useState<boolean>(false);
@@ -80,8 +82,7 @@ const Content: React.FC<ContentProps> = ({ isExpanded, showChatBot, openChatBot
       filesData[uid].max_sources,
       filesData[uid].wiki_query ?? ''
     );
-
-    if (apiResponse.data?.status === 'Failed') {
+    if (apiResponse?.data?.status === 'Failed') {
       setShowAlert(true);
       setErrorMessage(apiResponse?.data?.message);
       setFilesData((prevfiles) =>
@@ -133,23 +134,26 @@ const Content: React.FC<ContentProps> = ({ isExpanded, showChatBot, openChatBot
     }
   };

-  const handleGenerateGraph = () => {
+  const handleGenerateGraph = async () => {
+    const data = [];
     if (files.length > 0) {
       for (let i = 0; i < files.length; i++) {
         if (filesData[i]?.status === 'New') {
-          extractData(files[i], i);
+          data.push(extractData(files[i], i));
         }
       }
+      Promise.allSettled(data).then(async (_) => {
+        await updateGraphAPI(userCredentials);
+      });
     }
   };

   const handleClose = () => {
     setShowAlert(false);
   };

-  const openGraphUrl = `${process.env.BLOOM_URL}${userCredentials?.userName}@${localStorage.getItem('hostname')}%3A${
-    localStorage.getItem('port') ?? '7687'
-  }&search=Show+me+a+graph`;
+  const openGraphUrl = ` https://bloom-latest.s3.eu-west-2.amazonaws.com/assets/index.html?connectURL=${userCredentials?.userName}@${localStorage.getItem('hostname')}%3A${localStorage.getItem('port') ?? '7687'
+  }&search=Show+me+a+graph`;

   const classNameCheck =
     isExpanded && showChatBot
```