-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpdf2md.py
More file actions
62 lines (45 loc) · 1.74 KB
/
pdf2md.py
File metadata and controls
62 lines (45 loc) · 1.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
def pdf2md(pdfname):
    """Run the OCR pipeline on ``./pdfs/<pdfname>.pdf`` and write Markdown.

    Args:
        pdfname: Name of the PDF without the ``.pdf`` extension. The file is
            read from the ``./pdfs`` directory, relative to the current
            working directory.

    Side effects:
        Creates ``output/images/<pdfname>`` and passes it to the image
        writer; the Markdown text is handed to a writer rooted at
        ``output/<pdfname>``.
    """
    pdf_file_name = f"./pdfs/{pdfname}.pdf"

    # Prepare the output locations and the writers bound to them.
    local_image_dir = f"output/images/{pdfname}"
    local_md_dir = f"output/{pdfname}"
    os.makedirs(local_image_dir, exist_ok=True)
    image_writer = FileBasedDataWriter(local_image_dir)
    md_writer = FileBasedDataWriter(local_md_dir)

    # Only the last path component of the image directory is given to the
    # Markdown generator (presumably used as the prefix of image links —
    # TODO confirm against the magic_pdf documentation).
    image_dir = os.path.basename(local_image_dir)

    # Read the raw PDF bytes.
    pdf_bytes = FileBasedDataReader("").read(pdf_file_name)

    # An empty model list is passed, as in the original code; the three
    # pipeline stages are then run in order.
    pipe = OCRPipe(pdf_bytes, [], image_writer)
    pipe.pipe_classify()
    pipe.pipe_analyze()
    pipe.pipe_parse()

    md_content = pipe.pipe_mk_markdown(
        image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
    )
    # The generator may return either one string or a list of strings;
    # normalise to a single string so there is only one write call.
    if isinstance(md_content, list):
        md_content = "\n".join(md_content)

    # NOTE(review): the target name is built from the *input* path, i.e.
    # "./pdfs/<pdfname>.pdf.md". If the writer resolves relative paths against
    # its root, the file ends up in a "pdfs" sub-folder of output/<pdfname> —
    # confirm this is intended before changing it; the path is kept as-is so
    # existing consumers are not broken.
    md_writer.write_string(f"{pdf_file_name}.md", md_content)
def main():
    """Convert every PDF in ``./pdfs`` that has no output directory yet.

    A PDF named ``<name>.pdf`` counts as already converted when a directory
    called ``<name>`` exists directly under ``./output``; such files are
    skipped. For each remaining PDF, ``pdf2md(<name>)`` is called and a
    progress line is printed.

    Raises:
        FileNotFoundError: If the ``./pdfs`` directory does not exist.
    """
    pdfs_path = "./pdfs"
    output_path = "./output"

    # Collect the base names (extension removed) of all PDF files.
    pdfs = [
        os.path.splitext(filename)[0]
        for filename in os.listdir(pdfs_path)
        if filename.endswith('.pdf')
    ]

    # Names of the sub-directories already present in the output folder.
    # On a first run the folder does not exist yet; treat that as "nothing
    # converted so far" instead of crashing before any work is done.
    try:
        with os.scandir(output_path) as entries:
            havescaned = {entry.name for entry in entries if entry.is_dir()}
    except FileNotFoundError:
        havescaned = set()

    for filename in pdfs:
        if filename in havescaned:
            continue
        pdf2md(filename)
        print(f"{filename} DOWN")
# Script entry point: start the batch conversion. The call is unguarded, so it
# also runs whenever this module is imported.
main()
# Shell commands kept as operator notes for a background run.
# NOTE(review): the log path and the "main.py" process name come from the
# original notes — adjust them to the actual deployment.
#tail -f /root/autodl-tmp/output.log   (follow the log output)
#ps -ef | grep main.py                 (find the running process)
#kill pid                              (stop it, using the PID found above)