-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
105 lines (80 loc) · 3.84 KB
/
main.py
File metadata and controls
105 lines (80 loc) · 3.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import re
import PyPDF2
from PyPDF2.generic import NameObject, TextStringObject
def read_pdf(input_pdf_path):
    """Return the text of every page of the PDF at ``input_pdf_path``.

    Args:
        input_pdf_path: Path of the PDF file to read (opened in binary mode).

    Returns:
        The per-page text separated by a blank line, with leading and
        trailing whitespace stripped. Pages for which ``extract_text()``
        yields nothing contribute an empty string.
    """
    with open(input_pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extract while the file is still open; ``or ""`` covers pages whose
        # extraction returns a falsy value.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # A single join avoids repeated string concatenation; the blank line
    # between pages keeps the paragraph structure intact.
    return "\n\n".join(page_texts).strip()
def clean_tag(tag_content):
    """Normalise extracted text: drop punctuation/symbols, collapse whitespace.

    Args:
        tag_content: Raw text captured for a tag.

    Returns:
        The text with every character that is neither a letter, a digit nor
        whitespace removed, runs of whitespace replaced by a single space,
        and leading/trailing whitespace stripped.
    """
    # ``\w`` is Unicode-aware for str patterns, so accented and other
    # non-ASCII letters are kept instead of being silently deleted.
    # ``\w`` also matches "_", which is removed explicitly so underscores are
    # still treated as special characters.
    without_symbols = re.sub(r"[^\w\s]|_", "", tag_content)
    # Collapse any whitespace run (spaces, tabs, newlines) into one space.
    return re.sub(r"\s+", " ", without_symbols).strip()
def extract_info_after_tags(text, predefined_tags):
    """Extract the paragraph that follows each predefined tag in ``text``.

    For every tag, the first case-insensitive occurrence is located and the
    text after it is captured up to the first of: a blank line, the next
    occurrence of any predefined tag, or the end of ``text``.

    Args:
        text: The document text to search.
        predefined_tags: Iterable of tag strings (headings) to look for.

    Returns:
        A dict mapping each tag that was found to its captured text after
        ``clean_tag`` normalisation. Tags that are not found are omitted.
        A progress line is printed for every tag.
    """
    # Materialise once: the tags both drive the loop and build the stop
    # pattern, which would exhaust a one-shot iterable such as a generator.
    tags = list(predefined_tags)
    # Loop-invariant alternation of all tags, used as a stop condition.
    # NOTE(review): tags are matched case-insensitively anywhere in the text,
    # so a tag word appearing inside body text also matches - confirm that is
    # acceptable for the documents processed.
    next_tags = '|'.join(map(re.escape, tags))

    tag_data = {}
    for tag in tags:
        # Capture lazily after the tag; stop at a paragraph break, at any
        # predefined tag, or at the end of the text.
        pattern = re.compile(
            rf"(?i){re.escape(tag)}\s*(.*?)(?=\n\s*\n|\s*(?:{next_tags})|\Z)",
            re.DOTALL,
        )
        match = pattern.search(text)
        if match:
            tag_content = match.group(1).strip()
            tag_data[tag] = clean_tag(tag_content)
            # Debug aid: show the first 100 characters of the raw capture.
            print(f"Extracted text for '{tag}': {tag_content[:100]}...")
        else:
            print(f"Tag '{tag}' not found in the document.")
    return tag_data
def add_metadata_to_pdf(input_pdf_path, output_pdf_path, tag_data):
    """Copy a PDF to a new file with ``tag_data`` merged into its metadata.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path the resulting PDF is written to.
        tag_data: Mapping of tag name to text; each entry is stored under the
            metadata key ``/<tag>``.

    The pages are copied unchanged. Existing metadata of the input (if any)
    is kept and extended with the tag entries. A confirmation line is printed
    once the output has been written.
    """
    pdf_writer = PyPDF2.PdfWriter()
    with open(input_pdf_path, 'rb') as source:
        pdf_reader = PyPDF2.PdfReader(source)

        # Carry every page over to the writer.
        for source_page in pdf_reader.pages:
            pdf_writer.add_page(source_page)

        # Start from the metadata already in the document, if there is any.
        document_info = pdf_reader.metadata
        if not document_info:
            document_info = {}

        # NOTE(review): tags containing spaces become keys such as
        # "/The Challenge" - confirm the installed PyPDF2 version writes such
        # names correctly.
        for tag_name, tag_text in tag_data.items():
            document_info[NameObject(f"/{tag_name}")] = TextStringObject(tag_text)

        pdf_writer.add_metadata(document_info)

        # Write while the input file is still open.
        with open(output_pdf_path, 'wb') as output_file:
            pdf_writer.write(output_file)

    print(f"Metadata added to {output_pdf_path}")
# Example usage. NOTE: these statements sit at module top level, so they run
# whenever this file is executed or imported.
input_pdf = "Z:/13_Case Studies/02_CCHMC/12U – iTransition (Children's Hospital)/12SU - iTransition Case Study.pdf" # Source PDF to read
output_pdf = "Z:/13_Case Studies/02_CCHMC/12U – iTransition (Children's Hospital)/12SU - iTransition Case Study_with_metadata.pdf" # Destination for the copy with added metadata
# Headings to look for in the document; each becomes a metadata key
predefined_tags = ["The Challenge", "The Approach", "The Results", "Outcomes"]
# Read the PDF and extract the text of all pages
text = read_pdf(input_pdf)
# Extract the paragraph following each predefined tag
tag_data = extract_info_after_tags(text, predefined_tags)
# Write a copy of the PDF with the extracted tag data added to its metadata
add_metadata_to_pdf(input_pdf, output_pdf, tag_data)
def view_pdf_metadata(file_path):
    """Print the metadata entries of the PDF at ``file_path``.

    Args:
        file_path: Path of the PDF to inspect.

    Prints one ``key: value`` line per metadata entry under a header line, or
    a single notice when the document has no metadata.
    """
    with open(file_path, 'rb') as pdf_file:
        document_info = PyPDF2.PdfReader(pdf_file).metadata

        # Guard clause: nothing to list.
        if not document_info:
            print(f"No metadata found for {file_path}.")
            return

        print(f"Metadata for {file_path}:")
        for key, value in document_info.items():
            print(f"{key}: {value}")
# Print the metadata of the output PDF (runs at module top level, like the example usage)
view_pdf_metadata(output_pdf)