Skip to content

Commit 8a67950

Browse files
Merge branch 'Wikipedia_loader_integration' into create_source_node_youtube
2 parents 0b32da0 + 8bbbc75 commit 8a67950

File tree

4 files changed

+30
-16
lines changed

4 files changed

+30
-16
lines changed

backend/score.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ async def extract_knowledge_graph_from_file(
8888
source_url=Form(None),
8989
aws_access_key_id=Form(None),
9090
aws_secret_access_key=Form(None),
91+
wiki_query=Form(None),
92+
max_sources=Form(None),
9193
):
9294
"""
9395
Calls 'extract_graph_from_file' in a new thread to create Neo4jGraph from a
@@ -113,6 +115,8 @@ async def extract_knowledge_graph_from_file(
113115
model,
114116
file=file,
115117
source_url=None,
118+
wiki_query=wiki_query,
119+
max_sources=max_sources,
116120
)
117121
elif source_url:
118122
return await asyncio.to_thread(
@@ -124,6 +128,8 @@ async def extract_knowledge_graph_from_file(
124128
source_url=source_url,
125129
aws_access_key_id=aws_access_key_id,
126130
aws_secret_access_key=aws_secret_access_key,
131+
wiki_query=wiki_query,
132+
max_sources=max_sources,
127133
)
128134
else:
129135
return {"job_status": "Failure", "error": "No file found"}

backend/src/main.py

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
from tempfile import NamedTemporaryFile
1818
import re
1919
from langchain_community.document_loaders import YoutubeLoader
20+
from langchain.document_loaders import WikipediaLoader
21+
import warnings
22+
warnings.filterwarnings("ignore")
2023

2124
load_dotenv()
2225
logging.basicConfig(format='%(asctime)s - %(message)s',level='INFO')
@@ -126,7 +129,7 @@ def check_url_source(url):
126129
except Exception as e:
127130
raise e
128131

129-
def create_source_node_graph_url(uri, userName, password, source_url, max_limit, query_source, model, aws_access_key_id=None,aws_secret_access_key=None):
132+
def create_source_node_graph_url(uri, userName, password, source_url, max_limit, wiki_query,model, aws_access_key_id=None,aws_secret_access_key=None):
130133
"""
131134
Creates a source node in Neo4jGraph and sets properties.
132135
@@ -141,9 +144,6 @@ def create_source_node_graph_url(uri, userName, password, source_url, max_limit,
141144
Success or Failed message of node creation
142145
"""
143146
try:
144-
# if aws_access_key_id !=None and aws_secret_access_key !=None:
145-
# os.environ['AWS_ACCESS_KEY_ID']= aws_access_key_id
146-
# os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_access_key
147147
graph = Neo4jGraph(url=uri, username=userName, password=password)
148148
source_type = check_url_source(source_url)
149149
print(f"source type URL:{source_type}")
@@ -164,7 +164,7 @@ def create_source_node_graph_url(uri, userName, password, source_url, max_limit,
164164
file_size=file_info['file_size_bytes']
165165
s3_file_path=str(source_url+file_name)
166166
try:
167-
create_source_node(graph,file_name,file_size,file_type,source_type,model,s3_file_path,aws_access_key_id)
167+
create_source_node(graph,file_name.split('/')[-1],file_size,file_type,source_type,model,s3_file_path,aws_access_key_id)
168168
success_count+=1
169169
except Exception as e:
170170
err_flag=1
@@ -206,10 +206,12 @@ def file_into_chunks(pages: List[Document]):
206206
logging.info("Split file into smaller chunks")
207207
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)
208208
chunks = text_splitter.split_documents(pages)
209+
# print('Before chunks',len(chunks))
210+
chunks=chunks[:10]
209211
return chunks
210212

211213
def get_s3_pdf_content(s3_url,aws_access_key_id=None,aws_secret_access_key=None):
212-
# try:
214+
try:
213215
# Extract bucket name and directory from the S3 URL
214216
parsed_url = urlparse(s3_url)
215217
bucket_name = parsed_url.netloc
@@ -222,10 +224,22 @@ def get_s3_pdf_content(s3_url,aws_access_key_id=None,aws_secret_access_key=None)
222224
else:
223225
return None
224226

225-
# except Exception as e:
226-
# return None
227+
except Exception as e:
228+
return None
229+
230+
def wiki_loader(wiki_query, max_sources, max_wiki_pages=2):
    """Load Wikipedia pages for a comma-separated list of search queries.

    Args:
        wiki_query: Comma-separated search terms (e.g. "Apple,Banana").
            None or empty yields an empty result instead of crashing —
            the API layer forwards wiki_query=Form(None) unconditionally.
        max_sources: Maximum number of search terms to use; None means
            use all terms (slicing with None is a no-op).
        max_wiki_pages: Hard cap on the total number of pages returned
            across all queries (default 2, matching the original).

    Returns:
        List of langchain Document pages, at most max_wiki_pages long.
    """
    if not wiki_query:
        return []
    # Strip whitespace around each term so "A, B" behaves like "A,B",
    # and drop empty terms produced by stray/trailing commas.
    searches = [term.strip() for term in wiki_query.split(',') if term.strip()]
    searches = searches[:max_sources]
    pages = []
    for query in searches:
        pages.extend(WikipediaLoader(query=query, load_all_available_meta=False).load())
        if len(pages) >= max_wiki_pages:
            # Cap already reached — further loads would be discarded below,
            # so skip the remaining (network-bound) queries.
            break
    return pages[:max_wiki_pages]
227239

228-
def extract_graph_from_file(uri, userName, password, model, file=None,source_url=None,aws_access_key_id=None,aws_secret_access_key=None):
240+
241+
242+
def extract_graph_from_file(uri, userName, password, model, file=None,source_url=None,aws_access_key_id=None,aws_secret_access_key=None,wiki_query=None,max_sources=None,max_wiki_pages=2):
229243
"""
230244
Extracts a Neo4jGraph from a PDF file based on the model.
231245
@@ -350,22 +364,16 @@ def get_documents_from_file(file):
350364
return file_name,file_key,pages
351365

352366
def get_documents_from_s3(s3_url, aws_access_key_id, aws_secret_access_key):
353-
# if aws_access_key_id !=None and aws_secret_access_key !=None:
354-
# os.environ['AWS_ACCESS_KEY_ID']= aws_access_key_id
355-
# os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_access_key
356367

357368
parsed_url = urlparse(s3_url)
358369
bucket = parsed_url.netloc
359370
file_key = parsed_url.path.lstrip('/')
360371
file_name=file_key.split('/')[-1]
361372
s3=boto3.client('s3',aws_access_key_id=aws_access_key_id,aws_secret_access_key=aws_secret_access_key)
362373
response=s3.head_object(Bucket=bucket,Key=file_key)
363-
# response = s3.get_object(Bucket=bucket, Key=file_key)
364374
file_size=response['ContentLength']
365375

366376
logging.info(f'bucket : {bucket}, file key : {file_key}, file size : {file_size}')
367-
368-
# loader = S3FileLoader(bucket,file_key)
369377
pages=get_s3_pdf_content(s3_url,aws_access_key_id=aws_access_key_id,aws_secret_access_key=aws_secret_access_key)
370378
return file_name,file_key,pages
371379

backend/src/openAI_llm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ def extract_graph_from_OpenAI(model_version,
246246
openai_api_key = os.environ.get('OPENAI_API_KEY')
247247
graph_document_list = []
248248

249-
logging.info(f"create relationship between source,chunck and entity nodes created from {model_version}")
249+
logging.info(f"create relationship between source,chunk and entity nodes created from {model_version}")
250250
for i, chunk_document in tqdm(enumerate(chunks), total=len(chunks)):
251251
if i == 0:
252252
firstChunk = True

backend/temp.pdf

220 KB
Binary file not shown.

0 commit comments

Comments
 (0)