Skip to content

Commit f4eed15

Browse files
Merge pull request CarterPerez-dev#5 from Heritage-XioN/Implement-Office-document-metadata-removal-for-Word-(.docx)-Excel-(.xlsx)-and-PowerPoint-(.pptx)-files
Implement Office document metadata removal for Word (.docx), Excel (.xlsx), and PowerPoint (.pptx) files
2 parents ffa2bda + e98488c commit f4eed15

22 files changed

+2002
-234
lines changed

pyproject.toml

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ dependencies = [
1010
"pillow>=12.0.0",
1111
"piexif>=1.1.3",
1212
"pypdf>=6.5.0",
13+
"openpyxl>=3.1.5",
14+
"python-pptx>=1.0.2",
15+
"python-docx>=1.2.0",
1316
]
1417

1518
[project.optional-dependencies]
@@ -70,6 +73,26 @@ show_missing = true
7073
[tool.mypy]
7174
python_version = "3.10"
7275
warn_return_any = true
73-
warn_unused_ignores = true
76+
warn_unused_ignores = false
7477
ignore_missing_imports = true
7578

79+
# pptx library doesn't have type stubs
80+
[[tool.mypy.overrides]]
81+
module = "pptx.*"
82+
ignore_missing_imports = true
83+
ignore_errors = true
84+
85+
[[tool.mypy.overrides]]
86+
module = "src.services.powerpoint_handler"
87+
ignore_errors = true
88+
89+
# docx library doesn't have type stubs
90+
[[tool.mypy.overrides]]
91+
module = "docx.*"
92+
ignore_missing_imports = true
93+
ignore_errors = true
94+
95+
[[tool.mypy.overrides]]
96+
module = "src.services.worddoc_handler"
97+
ignore_errors = true
98+

src/services/excel_handler.py

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
"""
2+
Metadata handler for Excel workbooks.
3+
4+
This module provides the ExcelHandler class which implements the MetadataHandler
5+
interface for Excel files (.xlsx, .xlsm, .xltx, .xltm). Uses openpyxl for
6+
reading and writing Excel workbook properties.
7+
8+
Note:
9+
Does not support password-protected/encrypted workbooks.
10+
"""
11+
12+
import shutil
13+
from pathlib import Path
14+
from typing import Any
15+
16+
from openpyxl import load_workbook
17+
18+
from src.services.metadata_handler import MetadataHandler
19+
from src.utils.exceptions import (
20+
MetadataNotFoundError,
21+
MetadataReadingError,
22+
UnsupportedFormatError,
23+
)
24+
25+
# Supported Excel formats
26+
# Each supported extension (without the leading dot) maps to itself as the
# normalised format name.
FORMAT_MAP = {
    suffix: suffix for suffix in ("xlsx", "xlsm", "xltx", "xltm")
}

# Properties to preserve (not deleted during wipe)
PRESERVED_PROPERTIES = set(("created", "modified", "language"))
35+
36+
37+
class ExcelHandler(MetadataHandler):
    """
    Metadata handler for Excel workbooks (.xlsx, .xlsm, .xltx, .xltm).

    Handles extraction and removal of the workbook's document properties
    (author, title, subject, keywords, ...) using openpyxl. The intended
    call order is read() -> wipe() -> save().

    Attributes:
        keys_to_delete: Property names collected by read() that wipe()
            will clear.
    """

    # Formats that carry a VBA project; these must be loaded with
    # keep_vba=True or the macros are dropped when the workbook is saved.
    _MACRO_ENABLED_FORMATS = frozenset({"xlsm", "xltm"})

    def __init__(self, filepath: str):
        """
        Initialize the Excel handler.

        Args:
            filepath: Path to the Excel file to process.
        """
        super().__init__(filepath)
        self.keys_to_delete: list[str] = []

    def _detect_format(self) -> str:
        """
        Detect Excel format from file extension.

        Returns:
            Normalized format string ('xlsx', 'xlsm', 'xltx', or 'xltm').

        Raises:
            UnsupportedFormatError: If file extension is not a supported
                Excel format.
        """
        ext = Path(self.filepath).suffix.lower()
        normalised = FORMAT_MAP.get(ext[1:])  # Remove leading dot
        if normalised is None:
            raise UnsupportedFormatError(f"Unsupported format: {ext}")

        return normalised

    def read(self) -> dict[str, Any]:
        """
        Extract metadata properties from the Excel workbook.

        Stores every document property in self.metadata and records in
        self.keys_to_delete each property that is not listed in
        PRESERVED_PROPERTIES (created, modified, language).

        Returns:
            Dictionary of property names to their values.

        Raises:
            MetadataReadingError: If the workbook has a workbook password set.
            MetadataNotFoundError: If no properties are found.
        """
        self.metadata.clear()
        self.keys_to_delete.clear()
        wb = load_workbook(Path(self.filepath))
        try:
            # wb.security may be None when the file has no workbook
            # protection element, so guard before dereferencing it.
            security = getattr(wb, "security", None)
            if security is not None and security.workbookPassword is not None:
                raise MetadataReadingError("File is encrypted.")

            if wb.properties is None:
                raise MetadataNotFoundError("No metadata found in the file.")

            for attr, value in vars(wb.properties).items():
                self.metadata[attr] = value
                if attr not in PRESERVED_PROPERTIES:
                    self.keys_to_delete.append(attr)

            return self.metadata
        finally:
            wb.close()

    def wipe(self) -> None:
        """
        Remove metadata properties from the Excel workbook.

        Sets every property named in self.keys_to_delete (populated by
        read()) to None and stores the resulting properties object in
        self.processed_metadata for save() to apply.

        Raises:
            MetadataNotFoundError: If no properties are found.
        """
        # After a previous wipe() this attribute holds a properties object,
        # which has no clear(); fall back to a fresh empty dict in that case
        # so wipe() can be called more than once.
        reset = getattr(self.processed_metadata, "clear", None)
        if callable(reset):
            reset()
        else:
            self.processed_metadata = {}

        wb = load_workbook(Path(self.filepath))
        try:
            if wb.properties is None:
                raise MetadataNotFoundError("No metadata found in the file.")

            # Clear each property marked for deletion
            for attr in self.keys_to_delete:
                if hasattr(wb.properties, attr):
                    setattr(wb.properties, attr, None)

            self.processed_metadata = wb.properties
        finally:
            wb.close()

    def save(self, output_path: str | None) -> None:
        """
        Save the workbook with cleaned metadata to the output path.

        Creates a copy of the original file and applies the wiped
        metadata properties to it.

        Args:
            output_path: Path where the cleaned file should be saved.

        Raises:
            ValueError: If output_path is None or empty.
            UnsupportedFormatError: If the file extension is not supported.
            TypeError: If wipe() has not produced a properties object yet.
        """
        if not output_path:
            raise ValueError("output_path is required")

        # Validate everything BEFORE touching the destination, so a failure
        # never leaves an unscrubbed copy of the original at output_path.
        keep_vba = self._detect_format() in self._MACRO_ENABLED_FORMATS
        try:
            cleaned_properties = dict(vars(self.processed_metadata))
        except TypeError as err:
            raise TypeError(
                "No wiped metadata to save; call wipe() before save()."
            ) from err

        destination_file_path = Path(output_path)
        shutil.copy2(self.filepath, destination_file_path)

        # keep_vba=True preserves the VBA project of macro-enabled
        # workbooks and templates (.xlsm / .xltm).
        if keep_vba:
            wb = load_workbook(destination_file_path, keep_vba=True)
        else:
            wb = load_workbook(destination_file_path)

        try:
            # Apply wiped properties
            for attr, value in cleaned_properties.items():
                setattr(wb.properties, attr, value)

            wb.save(destination_file_path)
        finally:
            wb.close()

src/services/image_handler.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,10 @@ def read(self):
9090
9191
Uses actual format detection to select the appropriate processor.
9292
"""
93+
self.metadata.clear()
94+
self.text_keys_to_delete.clear()
95+
self.tags_to_delete.clear()
96+
9397
self.detected_format = self._detect_format()
9498
processor = self.processors.get(self.detected_format)
9599

@@ -111,6 +115,8 @@ def wipe(self) -> None:
111115
112116
Uses actual format detection to select the appropriate processor.
113117
"""
118+
self.processed_metadata.clear()
119+
self.clean_pnginfo = None
114120
# Use cached format if available, otherwise detect
115121
if not self.detected_format:
116122
self.detected_format = self._detect_format()

src/services/metadata_factory.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,11 @@
88

99
from pathlib import Path
1010

11+
from src.services.excel_handler import ExcelHandler
1112
from src.services.image_handler import ImageHandler
1213
from src.services.pdf_handler import PDFHandler
14+
from src.services.powerpoint_handler import PowerpointHandler
15+
from src.services.worddoc_handler import WorddocHandler
1316
from src.utils.exceptions import UnsupportedFormatError
1417

1518

@@ -41,19 +44,19 @@ def get_handler(filepath: str):
4144
UnsupportedFormatError: If no handler is defined for the file type.
4245
ValueError: If the path is not a valid file.
4346
"""
44-
supported_extensions = ".jpg, .jpeg, .png"
47+
supported_extensions = ".jpg, .jpeg, .png, .pdf, .docx, .xlsx, .xlsm, .xltx, .xltm, .pptx, .pptm, .potx, .potm"
4548
ext = Path(filepath).suffix.lower()
4649
if Path(filepath).is_file():
4750
if ext in [".jpg", ".jpeg", ".png"]:
4851
return ImageHandler(filepath)
4952
elif ext == ".pdf":
5053
return PDFHandler(filepath)
51-
52-
# TODO: implement other handlers
53-
# elif ext == ".xlsx":
54-
# return ExcelHandler(filepath)
55-
# elif ext == ".pptx":
56-
# return PowerPointHandler(filepath)
54+
elif ext in [".xlsx", ".xlsm", ".xltx", ".xltm"]:
55+
return ExcelHandler(filepath)
56+
elif ext in [".pptx", ".pptm", ".potx", ".potm"]:
57+
return PowerpointHandler(filepath)
58+
elif ext == ".docx":
59+
return WorddocHandler(filepath)
5760
else:
5861
raise UnsupportedFormatError(
5962
f"No handler defined for {ext} files. we curently only support {supported_extensions} files."

0 commit comments

Comments
 (0)