-
Notifications
You must be signed in to change notification settings - Fork 170
Expand file tree
/
Copy pathPix2text_img2md.py
More file actions
66 lines (59 loc) · 1.63 KB
/
Pix2text_img2md.py
File metadata and controls
66 lines (59 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import os
from pathlib import Path
from pix2text import Pix2Text
# Settings for the text/formula part of the pipeline, grouped into the
# "mfd", "formula" and "text" sub-sections.
# Every "YOUR MODEL PATH" string is a placeholder to be replaced with the real
# location of the corresponding model; os.path.expanduser allows those
# locations to start with "~".
text_formula_config = {
    'languages': ('en', 'ch_sim'),
    'mfd': {
        'model_path': os.path.expanduser("YOUR MODEL PATH"),
    },
    'formula': {
        'model_name': 'mfr-pro',
        'model_backend': 'onnx',
        'model_dir': os.path.expanduser("YOUR MODEL PATH"),
    },
    'text': {
        'rec_model_name': 'doc-densenet_lite_136-gru',
        'rec_model_backend': 'pytorch',
        'rec_model_fp': os.path.expanduser("YOUR MODEL PATH"),
    },
}
# Settings for the table part of the pipeline. "YOUR MODEL PATH" is a
# placeholder to be replaced with the real model location ("~" is expanded).
table_config = dict(model_path=os.path.expanduser("YOUR MODEL PATH"))
# Settings for the layout part of the pipeline: which layout parser to use and
# the 'table_as_image' switch (off here).
layout_config = dict(model_type='DocXLayoutParser', table_as_image=False)
# Top-level configuration: one entry per pipeline section, each pointing at the
# matching settings dictionary defined above.
total_config = dict(
    layout=layout_config,
    text_formula=text_formula_config,
    table=table_config,
)
# Shared recognizer used by batch_process_pdfs. It is created once, when this
# module is executed, from total_config with both formula and table support on.
p2t = Pix2Text.from_config(total_configs=total_config, enable_formula=True, enable_table=True)
def batch_process_pdfs(input_dir, output_dir):
    """Convert every PDF directly inside ``input_dir`` to Markdown.

    Each PDF is handed to the module-level ``p2t`` recognizer via
    ``recognize_pdf`` and the result is exported with ``to_markdown`` into
    ``<output_dir>/<pdf file name without extension>``. A failure on one file
    is reported on stdout and does not stop the remaining files.

    Args:
        input_dir: Directory scanned (non-recursively) for PDF files.
        output_dir: Directory that receives one sub-directory per PDF; it is
            created when missing.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    os.makedirs(output_dir, exist_ok=True)

    source_dir = Path(input_dir)
    if not source_dir.is_dir():
        # Say so explicitly instead of silently doing nothing.
        print(f"Input directory not found: {source_dir}")
        return

    # Compare the suffix case-insensitively so "scan.PDF" is not skipped on
    # case-sensitive filesystems, ignore directories that merely end in
    # ".pdf", and sort so the processing order is reproducible.
    pdf_files = sorted(
        entry
        for entry in source_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() == '.pdf'
    )
    if not pdf_files:
        print(f"No PDF files found in: {source_dir}")
        return

    for pdf_path in pdf_files:
        try:
            doc = p2t.recognize_pdf(str(pdf_path))
            output_subdir = os.path.join(output_dir, pdf_path.stem)
            doc.to_markdown(output_subdir)
        except Exception as e:
            # Deliberate best-effort: one broken PDF must not abort the batch.
            # The exception type is included because some messages are empty.
            print(f"Process {pdf_path.name} wrong: {type(e).__name__}: {e}")
        else:
            print(f"Process successfully: {pdf_path.name}")
if __name__ == "__main__":
    # Script entry point. Both paths are placeholders: point them at the
    # folder holding the PDFs and the folder that should receive the Markdown.
    batch_process_pdfs(
        input_dir="/path/to/input/folder",
        output_dir="/path/to/output/folder",
    )