Skip to content

Commit 4cd0b4b

Browse files
committed
updated chunking
1 parent 3f054a4 commit 4cd0b4b

File tree

3 files changed

+11
-15
lines changed

3 files changed

+11
-15
lines changed

src/chunking/MPNet/local/api.py

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -4,20 +4,18 @@
44
import aiohttp
55
import pandas as pd
66
import io
7-
from PyPDF2 import PdfReader
7+
import fitz
88
import os
99

1010
def extract_text_from_txt(txt_path):
    """Return the full contents of the text file at ``txt_path``.

    The file is opened in text mode and decoded as UTF-8; the handle is
    closed by the ``with`` block before the function returns.
    """
    with open(txt_path, 'r', encoding='utf-8') as handle:
        return handle.read()
1313

1414
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of the PDF at ``pdf_path``.

    Pages are read in document order with PyMuPDF (``fitz``) and their text
    is concatenated with no separator between pages.
    """
    # Open the document in a context manager so it is closed even if
    # extraction fails part-way through the pages.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with += on every page.
        return "".join(page.get_text("text") for page in doc)
2321

@@ -49,8 +47,8 @@ async def embed():
4947
text_data = uploaded_file.stream.read().decode('utf-8')
5048
elif file_extension == '.pdf':
5149
pdf_file_stream = io.BytesIO(uploaded_file.stream.read())
52-
reader = PdfReader(pdf_file_stream)
53-
pages = [(i, page.extract_text()) for i, page in enumerate(reader.pages)] # Modified line
50+
doc = fitz.open("pdf", pdf_file_stream.getvalue())
51+
pages = [(i, page.get_text("text")) for i, page in enumerate(doc)] # Modified line
5452
text_data = pages
5553
else:
5654
return (print('Wrong format of file submitted'))

src/chunking/MPNet/local/model.py

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
from sklearn.metrics.pairwise import cosine_similarity
99
import math
1010
from scipy.signal import argrelextrema
11-
from PyPDF2 import PdfReader
11+
import fitz
1212
from request import ModelRequest
1313
import torch
1414
import nltk
@@ -127,12 +127,10 @@ def activate_similarities(self,similarities:np.array, p_size=10)->np.array:
127127

128128

129129
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of the PDF at ``pdf_path``.

    Pages are read in document order with PyMuPDF (``fitz``) and their text
    is concatenated with no separator between pages.
    """
    # Open the document in a context manager so it is closed even if
    # extraction fails part-way through the pages.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with += on every page.
        return "".join(page.get_text("text") for page in doc)
138136

src/chunking/MPNet/local/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,4 +7,4 @@ segeval
77
numpy
88
nltk
99
scipy
10-
PyPDF2
10+
PyMuPDF

0 commit comments

Comments
 (0)