Skip to content

Commit 035bb0f

Browse files
committed
feature: improve PDF processing logic and update dependencies in process.py and pyproject.toml
1 parent 77e021d commit 035bb0f

File tree

2 files changed

+5
-4
lines changed

2 files changed

+5
-4
lines changed

runtime/ops/formatter/mineru_formatter/process.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -31,15 +31,15 @@ def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
3131
start = time.time()
3232
filename = sample[self.filename_key]
3333
filename_without_ext = os.path.splitext(filename)[0]
34-
if not filename.lower().endswith(".png", ".jpeg", ".jpg", ".webp", ".gif", ".pdf"):
34+
if not filename.lower().endswith((".png", ".jpeg", ".jpg", ".webp", ".gif", ".pdf")):
3535
return sample
3636
try:
3737
filepath = sample[self.filepath_key]
3838
parse_dir = os.path.join(self.output_dir, filename_without_ext, "vlm")
3939
pdf_bytes = read_fn(filepath)
4040
total_page = len(PdfReader(filepath).pages)
4141
content = ""
42-
for page in range(total_page, 0, 10):
42+
for page in range(0, total_page, 10):
4343
do_parse(
4444
output_dir=self.output_dir,
4545
pdf_file_names=[filename_without_ext],
@@ -48,7 +48,7 @@ def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
4848
backend=self.backend,
4949
server_url=self.server_url,
5050
start_page_id=page,
51-
end_page_id=min(page + 10, total_page - 1),
51+
end_page_id=min(page + 9, total_page - 1),
5252
)
5353
if os.path.exists(parse_dir):
5454
content += get_infer_result(".md", filename_without_ext, parse_dir)

runtime/ops/pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,9 @@ dependencies = [
1111
"emoji>=2.15.0",
1212
"jieba>=0.42.1",
1313
"loguru>=0.7.3",
14-
"mineru>=2.6.8",
14+
"mineru>=2.6.5",
1515
"numpy==1.24.3",
16+
"python-multipart>=0.0.20",
1617
"opencv-contrib-python-headless==4.7.0.72",
1718
"opencv-python-headless==4.7.0.72",
1819
"openslide-python>=1.4.3",

0 commit comments

Comments
 (0)