Skip to content

Commit 19212aa

Browse files
committed
Add CLI support for specifying PDFs and directories
The script now accepts file and directory arguments via the command line, allowing users to specify which PDFs to process. If no arguments are provided, it defaults to scanning the current working directory. The extraction function was refactored to accept a list of PDF files instead of a directory.
1 parent 118c588 commit 19212aa

File tree

1 file changed

+36
-15
lines changed

1 file changed

+36
-15
lines changed

Programming/pdfextract.py

Lines changed: 36 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -2,28 +2,24 @@
22

33
import pathlib
44
import os
5+
import sys
56

6-
def extract_images_from_pdfs(working_dir):
7+
def extract_images_from_pdfs(pdf_files):
78
"""
8-
Finds all PDFs in the working directory, extracts embedded images from each,
9-
and saves them into dedicated subfolders.
9+
Extracts embedded images from each PDF in the provided list
10+
and saves them into dedicated subfolders relative to the PDF's location.
1011
1112
Args:
12-
working_dir (pathlib.Path): The directory where the script is run and PDFs are located.
13+
pdf_files (list of pathlib.Path): List of PDF files to process.
1314
"""
1415
pdf_count = 0
1516
total_images_extracted = 0
1617

17-
print(f"Scanning for PDF files in: {working_dir}")
18-
19-
# Use glob to find all PDF files in the working directory
20-
pdf_files = list(working_dir.glob('*.pdf'))
21-
2218
if not pdf_files:
23-
print("No PDF files found in the working directory.")
19+
print("No PDF files to process.")
2420
return
2521

26-
print(f"Found {len(pdf_files)} PDF file(s).")
22+
print(f"Found {len(pdf_files)} PDF file(s) to process.")
2723

2824
for pdf_path in pdf_files:
2925
pdf_count += 1
@@ -32,7 +28,7 @@ def extract_images_from_pdfs(working_dir):
3228

3329
# Create a subdirectory for the images from this PDF
3430
output_folder_name = f"{pdf_path.stem}_images"
35-
output_path = working_dir / output_folder_name
31+
output_path = pdf_path.parent / output_folder_name
3632
output_path.mkdir(exist_ok=True) # Create folder, ignore if already exists
3733

3834
try:
@@ -89,6 +85,31 @@ def extract_images_from_pdfs(working_dir):
8985

9086

9187
if __name__ == "__main__":
92-
# Get the current working directory (where the script is *run* from)
93-
current_working_directory = pathlib.Path.cwd()
94-
extract_images_from_pdfs(current_working_directory)
88+
args = sys.argv[1:]
89+
pdf_to_process = []
90+
91+
if not args:
92+
# Default behavior: scan current working directory
93+
cwd = pathlib.Path.cwd()
94+
print(f"No arguments provided. Scanning for PDF files in: {cwd}")
95+
pdf_to_process.extend(cwd.glob('*.pdf'))
96+
else:
97+
for arg in args:
98+
path = pathlib.Path(arg)
99+
if path.is_file():
100+
if path.suffix.lower() == '.pdf':
101+
pdf_to_process.append(path)
102+
else:
103+
print(f"Skipping non-PDF file: {path}")
104+
elif path.is_dir():
105+
print(f"Scanning directory: {path}")
106+
pdf_to_process.extend(path.glob('*.pdf'))
107+
else:
108+
print(f"Argument not found (skipping): {arg}")
109+
110+
if pdf_to_process:
111+
# Sort to ensure consistent processing order
112+
pdf_to_process.sort()
113+
extract_images_from_pdfs(pdf_to_process)
114+
else:
115+
print("No valid PDF files found to process.")

0 commit comments

Comments
 (0)