Skip to content

Commit 440d8ac

Browse files
Merge pull request #221 from neo4j-labs/binary-removal-changes
Binary removal changes to DEV
2 parents 7c07230 + d2d56bf commit 440d8ac

File tree

21 files changed

+520
-572
lines changed

21 files changed

+520
-572
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,4 +164,4 @@ google-cloud-sdk
164164
google-cloud-cli-469.0.0-linux-x86_64.tar.gz
165165
/data/llm-experiments-387609-c73d512ca3b1.json
166166
/backend/src/merged_files
167-
/backend/chunks
167+
/backend/src/chunks

backend/score.py

Lines changed: 79 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@
55
from src.main import *
66
from src.QA_integration import *
77
from src.entities.user_credential import user_credential
8+
from src.shared.common_fn import *
89
import uvicorn
910
import asyncio
1011
import base64
1112
from langserve import add_routes
1213
from langchain_google_vertexai import ChatVertexAI
1314
from src.api_response import create_api_response
15+
from src.graphDB_dataAccess import graphDBdataAccess
1416

1517
def healthy_condition():
1618
output = {"healthy": True}
@@ -36,37 +38,37 @@ def sick():
3638

3739
app.add_api_route("/health", health([healthy_condition, healthy]))
3840

39-
@app.post("/sources")
40-
async def create_source_knowledge_graph(
41-
uri=Form(None), userName=Form(None), password=Form(None), file: UploadFile = File(...), model=Form(),database=Form(None),
42-
):
43-
"""
44-
Calls 'create_source_node_graph' function in a new thread to create
45-
source node in Neo4jGraph when a new file is uploaded.
41+
# @app.post("/sources")
42+
# async def create_source_knowledge_graph(
43+
# uri=Form(None), userName=Form(None), password=Form(None), file: UploadFile = File(...), model=Form(),database=Form(None),
44+
# ):
45+
# """
46+
# Calls 'create_source_node_graph' function in a new thread to create
47+
# source node in Neo4jGraph when a new file is uploaded.
4648

47-
Args:
48-
uri: URI of Graph Service to connect to
49-
userName: Username to connect to Graph Service with ( default : None )
50-
password: Password to connect to Graph Service with ( default : None )
51-
file: File object containing the PDF file
49+
# Args:
50+
# uri: URI of Graph Service to connect to
51+
# userName: Username to connect to Graph Service with ( default : None )
52+
# password: Password to connect to Graph Service with ( default : None )
53+
# file: File object containing the PDF file
5254

53-
Returns:
54-
'Source' Node creation in Neo4j database
55-
"""
56-
try:
57-
result = await asyncio.to_thread(
58-
create_source_node_graph_local_file, uri, userName, password, file, model, database
59-
)
60-
return create_api_response("Success",message="Source Node created successfully",file_source=result.file_source, file_name=result.file_name)
61-
except Exception as e:
62-
# obj_source_node = sourceNode()
63-
job_status = "Failed"
64-
message = "Unable to create source node"
65-
error_message = str(e)
66-
logging.error(f"Error in creating document node: {error_message}")
67-
#update exception in source node
68-
# obj_source_node.update_exception_db(file.filename, error_message)
69-
return create_api_response(job_status, message=message,error=error_message,file_source='local file',file_name=file.filename)
55+
# Returns:
56+
# 'Source' Node creation in Neo4j database
57+
# """
58+
# try:
59+
# result = await asyncio.to_thread(
60+
# create_source_node_graph_local_file, uri, userName, password, file, model, database
61+
# )
62+
# return create_api_response("Success",message="Source Node created successfully",file_source=result.file_source, file_name=result.file_name)
63+
# except Exception as e:
64+
# # obj_source_node = sourceNode()
65+
# job_status = "Failed"
66+
# message = "Unable to create source node"
67+
# error_message = str(e)
68+
# logging.error(f"Error in creating document node: {error_message}")
69+
# #update exception in source node
70+
# # obj_source_node.update_exception_db(file.filename, error_message)
71+
# return create_api_response(job_status, message=message,error=error_message,file_source='local file',file_name=file.filename)
7072

7173
@app.post("/url/scan")
7274
async def create_source_knowledge_graph_url(
@@ -84,22 +86,23 @@ async def create_source_knowledge_graph_url(
8486
source_type=Form(None)
8587
):
8688
try:
89+
graph = create_graph_database_connection(uri, userName, password, database)
90+
graphDb_data_Access = graphDBdataAccess(graph)
8791
if source_type == 's3 bucket' and aws_access_key_id and aws_secret_access_key:
88-
lst_file_name,success_count,failed_count = create_source_node_graph_url_s3(
89-
uri, userName, password, database, model, source_url, aws_access_key_id, aws_secret_access_key, source_type
92+
lst_file_name,success_count,failed_count = create_source_node_graph_url_s3(graph, model, source_url, aws_access_key_id, aws_secret_access_key, source_type
9093
)
9194
elif source_type == 'gcs bucket':
92-
lst_file_name,success_count,failed_count = create_source_node_graph_url_gcs(
93-
uri, userName, password, database, model, source_url, gcs_bucket_name, gcs_bucket_folder, source_type
95+
lst_file_name,success_count,failed_count = create_source_node_graph_url_gcs(graph, model, source_url, gcs_bucket_name, gcs_bucket_folder, source_type
9496
)
9597
elif source_type == 'youtube':
96-
lst_file_name,success_count,failed_count = create_source_node_graph_url_youtube(
97-
uri, userName, password, database, model, source_url, source_type
98+
lst_file_name,success_count,failed_count = create_source_node_graph_url_youtube(graph, model, source_url, source_type
9899
)
99100
elif source_type == 'Wikipedia':
100-
lst_file_name,success_count,failed_count = create_source_node_graph_url_wikipedia(
101-
uri, userName, password, database, model, wiki_query, source_type
101+
lst_file_name,success_count,failed_count = create_source_node_graph_url_wikipedia(graph, model, wiki_query, source_type
102102
)
103+
else:
104+
return create_api_response('Failed',message='source_type is other than accepted source')
105+
103106
if source_url is not None:
104107
source = source_url
105108
else:
@@ -108,11 +111,10 @@ async def create_source_knowledge_graph_url(
108111
message = f"Source Node created successfully for source type: {source_type} and source: {source}"
109112
return create_api_response("Success",message=message,success_count=success_count,failed_count=failed_count,file_name=lst_file_name)
110113
except Exception as e:
111-
job_status = "Failed"
112-
message = f"Unable to create source node for source type: {source_type} and source: {source_url}{wiki_query}"
114+
message = f"Unable to create source node for source type: {source_type} and source: {source}"
113115
error_message = str(e)
114116
logging.exception(f'Exception Stack trace:')
115-
return create_api_response(job_status,message=message,error=error_message,file_source=source_type)
117+
return create_api_response('Failed',message=message,error=error_message,file_source=source_type)
116118

117119

118120
@app.post("/extract")
@@ -122,7 +124,6 @@ async def extract_knowledge_graph_from_file(
122124
password=Form(None),
123125
model=Form(None),
124126
database=Form(None),
125-
file: UploadFile = File(None),
126127
source_url=Form(None),
127128
aws_access_key_id=Form(None),
128129
aws_secret_access_key=Form(None),
@@ -131,7 +132,8 @@ async def extract_knowledge_graph_from_file(
131132
gcs_bucket_name=Form(None),
132133
gcs_bucket_folder=Form(None),
133134
gcs_blob_filename=Form(None),
134-
source_type=Form(None)
135+
source_type=Form(None),
136+
file_name=Form(None)
135137
):
136138
"""
137139
Calls 'extract_graph_from_file' in a new thread to create Neo4jGraph from a
@@ -148,35 +150,38 @@ async def extract_knowledge_graph_from_file(
148150
Nodes and Relations created in Neo4j databse for the pdf file
149151
"""
150152
try:
153+
graph = create_graph_database_connection(uri, userName, password, database)
154+
graphDb_data_Access = graphDBdataAccess(graph)
151155
if source_type == 'local file':
152-
return await asyncio.to_thread(
153-
extract_graph_from_file_local_file, uri, userName, password, model, database, file=file)
156+
result = await asyncio.to_thread(
157+
extract_graph_from_file_local_file, graph, model, file_name)
154158

155159
elif source_type == 's3 bucket' and source_url:
156160
result = await asyncio.to_thread(
157-
extract_graph_from_file_s3, uri, userName, password, model, database,
158-
source_url, aws_access_key_id, aws_secret_access_key)
161+
extract_graph_from_file_s3, graph, model, source_url, aws_access_key_id, aws_secret_access_key)
159162

160163
elif source_type == 'youtube' and source_url:
161164
result = await asyncio.to_thread(
162-
extract_graph_from_file_youtube, uri, userName, password, model, database, source_url)
165+
extract_graph_from_file_youtube, graph, model, source_url)
163166

164167
elif source_type == 'Wikipedia' and wiki_query:
165168
result = await asyncio.to_thread(
166-
extract_graph_from_file_Wikipedia, uri, userName, password, model, database, wiki_query, max_sources)
169+
extract_graph_from_file_Wikipedia, graph, model, wiki_query, max_sources)
167170

168171
elif source_type == 'gcs bucket' and gcs_bucket_name:
169172
result = await asyncio.to_thread(
170-
extract_graph_from_file_gcs, uri, userName, password, model, database,
171-
gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename)
173+
extract_graph_from_file_gcs, graph, model, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename)
174+
else:
175+
return create_api_response('Failed',message='source_type is other than accepted source')
172176

173-
return create_api_response('Success',data=result)
177+
return create_api_response('Success', data=result)
174178
except Exception as e:
175-
message=f"Failed To Process File or LLM Unable To Parse Content"
176-
job_status = "Failed"
177-
error_message = str(e)
178-
return create_api_response(job_status,message=message,error=error_message)
179-
179+
message=f"Failed To Process File:{file_name} or LLM Unable To Parse Content"
180+
logging.info(message)
181+
error_message = str(e)
182+
graphDb_data_Access.update_exception_db(file_name,error_message)
183+
logging.exception(f'Exception Stack trace: {error_message}')
184+
return create_api_response('Failed', message=message, error=error_message, file_name = file_name)
180185

181186
@app.get("/sources_list")
182187
async def get_source_list(uri:str, userName:str, password:str, database:str=None):
@@ -194,7 +199,7 @@ async def get_source_list(uri:str, userName:str, password:str, database:str=None
194199
message="Unable to fetch source list"
195200
error_message = str(e)
196201
logging.exception(f'Exception:{error_message}')
197-
return create_api_response(job_status,message=message,error=error_message)
202+
return create_api_response(job_status, message=message, error=error_message)
198203

199204
@app.post("/update_similarity_graph")
200205
async def update_similarity_graph(uri=Form(None), userName=Form(None), password=Form(None), database=Form(None)):
@@ -210,7 +215,7 @@ async def update_similarity_graph(uri=Form(None), userName=Form(None), password=
210215
message="Unable to update KNN Graph"
211216
error_message = str(e)
212217
logging.exception(f'Exception in update KNN graph:{error_message}')
213-
return create_api_response(job_status,message=message,error=error_message)
218+
return create_api_response(job_status, message=message, error=error_message)
214219

215220
@app.post("/chat_bot")
216221
async def chat_bot(uri=Form(None),model=Form(None),userName=Form(None), password=Form(None), question=Form(None), session_id=Form(None)):
@@ -224,7 +229,7 @@ async def chat_bot(uri=Form(None),model=Form(None),userName=Form(None), password
224229
message="Unable to get chat response"
225230
error_message = str(e)
226231
logging.exception(f'Exception in chat bot:{error_message}')
227-
return create_api_response(job_status,message=message,error=error_message)
232+
return create_api_response(job_status, message=message, error=error_message)
228233

229234
@app.post("/connect")
230235
async def connect(uri=Form(None), userName=Form(None), password=Form(None), database=Form(None)):
@@ -236,7 +241,21 @@ async def connect(uri=Form(None), userName=Form(None), password=Form(None), data
236241
message="Connection failed to connect Neo4j database"
237242
error_message = str(e)
238243
logging.exception(f'Connection failed to connect Neo4j database:{error_message}')
239-
return create_api_response(job_status,message=message,error=error_message)
244+
return create_api_response(job_status, message=message, error=error_message)
245+
246+
@app.post("/upload")
247+
async def upload_large_file_into_chunks(file:UploadFile = File(...), chunkNumber=Form(None), totalChunks=Form(None),
248+
originalname=Form(None), model=Form(None), uri=Form(None), userName=Form(None),
249+
password=Form(None), database=Form(None)):
250+
try:
251+
result = await asyncio.to_thread(upload_file,uri,userName,password,database,model,file,chunkNumber,totalChunks,originalname)
252+
return create_api_response('Success', message=result)
253+
except Exception as e:
254+
job_status = "Failed"
255+
message="Unable to upload large file into chunks or saving the chunks"
256+
error_message = str(e)
257+
logging.info(message)
258+
logging.exception(f'Exception:{error_message}')
240259

241260
def decode_password(pwd):
242261
sample_string_bytes = base64.b64decode(pwd)

backend/src/document_sources/local_file.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,25 @@
44
from tempfile import NamedTemporaryFile
55
from langchain_community.document_loaders import PyPDFLoader
66

7-
def get_documents_from_file(file):
8-
file_name = file.filename
9-
logging.info(f"get_documents_from_file called for filename = {file_name}")
10-
suffix = Path(file.filename).suffix
11-
with NamedTemporaryFile(delete=True, suffix=suffix) as tmp:
12-
shutil.copyfileobj(file.file, tmp)
13-
tmp_path = Path(tmp.name)
14-
loader = PyPDFLoader(str(tmp_path))
7+
# def get_documents_from_file_by_bytes(file):
8+
# file_name = file.filename
9+
# logging.info(f"get_documents_from_file called for filename = {file_name}")
10+
# suffix = Path(file.filename).suffix
11+
# with NamedTemporaryFile(delete=True, suffix=suffix) as tmp:
12+
# shutil.copyfileobj(file.file, tmp)
13+
# tmp_path = Path(tmp.name)
14+
# loader = PyPDFLoader(str(tmp_path))
15+
# pages = loader.load_and_split()
16+
# return file_name, pages
17+
18+
def get_documents_from_file_by_path(file_path,file_name):
19+
file_path = Path(file_path)
20+
if file_path.exists():
21+
logging.info(f'file {file_name} processing')
22+
loader = PyPDFLoader(str(file_path))
1523
pages = loader.load_and_split()
16-
return file_name, pages
24+
else:
25+
logging.info(f'File {file_name} does not exist')
26+
raise Exception(f'File {file_name} does not exist')
27+
28+
return file_name, pages

backend/src/graphDB_dataAccess.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -81,16 +81,20 @@ def update_KNN_graph(self):
8181
"""
8282
Update the graph node with SIMILAR relationship where embedding scrore match
8383
"""
84+
index = self.graph.query("""show indexes yield * where type = 'VECTOR' and name = 'vector'""")
85+
# logging.info(f'show index vector: {index}')
8486
knn_min_score = os.environ.get('KNN_MIN_SCORE')
85-
result = self.graph.query("""MATCH (c:Chunk)
86-
WHERE c.embedding IS NOT NULL AND count { (c)-[:SIMILAR]-() } < 5
87-
CALL db.index.vector.queryNodes('vector', 6, c.embedding) yield node, score
88-
WHERE node <> c and score >= $score MERGE (c)-[rel:SIMILAR]-(node) SET rel.score = score
89-
""",
90-
{"score":knn_min_score}
91-
)
87+
if index[0]['name'] == 'vector':
88+
logging.info('update KNN graph')
89+
result = self.graph.query("""MATCH (c:Chunk)
90+
WHERE c.embedding IS NOT NULL AND count { (c)-[:SIMILAR]-() } < 5
91+
CALL db.index.vector.queryNodes('vector', 6, c.embedding) yield node, score
92+
WHERE node <> c and score >= $score MERGE (c)-[rel:SIMILAR]-(node) SET rel.score = score
93+
""",
94+
{"score":knn_min_score}
95+
)
96+
logging.info(f"result : {result}")
9297

93-
9498
def connection_check(self):
9599
"""
96100
Args:

0 commit comments

Comments
 (0)