22
33import pathlib
44import os
5+ import sys
56
6- def extract_images_from_pdfs (working_dir ):
7+ def extract_images_from_pdfs (pdf_files ):
78 """
8- Finds all PDFs in the working directory, extracts embedded images from each,
9- and saves them into dedicated subfolders.
9+ Extracts embedded images from each PDF in the provided list
10+ and saves them into dedicated subfolders relative to the PDF's location .
1011
1112 Args:
12- working_dir ( pathlib.Path): The directory where the script is run and PDFs are located .
13+ pdf_files (list of pathlib.Path): List of PDF files to process .
1314 """
1415 pdf_count = 0
1516 total_images_extracted = 0
1617
17- print (f"Scanning for PDF files in: { working_dir } " )
18-
19- # Use glob to find all PDF files in the working directory
20- pdf_files = list (working_dir .glob ('*.pdf' ))
21-
2218 if not pdf_files :
23- print ("No PDF files found in the working directory ." )
19+ print ("No PDF files to process ." )
2420 return
2521
26- print (f"Found { len (pdf_files )} PDF file(s)." )
22+ print (f"Found { len (pdf_files )} PDF file(s) to process ." )
2723
2824 for pdf_path in pdf_files :
2925 pdf_count += 1
@@ -32,7 +28,7 @@ def extract_images_from_pdfs(working_dir):
3228
3329 # Create a subdirectory for the images from this PDF
3430 output_folder_name = f"{ pdf_path .stem } _images"
35- output_path = working_dir / output_folder_name
31+ output_path = pdf_path . parent / output_folder_name
3632 output_path .mkdir (exist_ok = True ) # Create folder, ignore if already exists
3733
3834 try :
@@ -89,6 +85,31 @@ def extract_images_from_pdfs(working_dir):
8985
9086
def collect_pdf_files(args):
    """
    Resolve command-line arguments into a sorted, de-duplicated list of PDFs.

    Each argument may be a PDF file or a directory. Directories are scanned
    (non-recursively) for PDF files. With no arguments at all, the current
    working directory is scanned instead.

    Args:
        args (list of str): Command-line arguments, without the script name.

    Returns:
        list of pathlib.Path: PDF files to process, sorted for a consistent
        processing order. Empty if nothing usable was found.
    """
    def is_pdf(candidate):
        # One rule for both explicit files and directory scans: a regular
        # file whose extension is ".pdf" in any letter case. A plain
        # glob('*.pdf') would miss "REPORT.PDF" on case-sensitive
        # filesystems and would also match directories named "*.pdf".
        return candidate.is_file() and candidate.suffix.lower() == '.pdf'

    def scan(directory):
        return [entry for entry in directory.iterdir() if is_pdf(entry)]

    found = []

    if not args:
        # Default behavior: scan current working directory
        cwd = pathlib.Path.cwd()
        print(f"No arguments provided. Scanning for PDF files in: {cwd}")
        found.extend(scan(cwd))
    else:
        for arg in args:
            path = pathlib.Path(arg)
            if path.is_file():
                if path.suffix.lower() == '.pdf':
                    found.append(path)
                else:
                    print(f"Skipping non-PDF file: {path}")
            elif path.is_dir():
                print(f"Scanning directory: {path}")
                found.extend(scan(path))
            else:
                print(f"Argument not found (skipping): {arg}")

    # Overlapping arguments (a file plus its parent directory, or the same
    # path given twice) would otherwise queue the same PDF more than once and
    # extract it repeatedly into the same "<stem>_images" folder. Keep the
    # first spelling of each distinct file.
    unique = {}
    for pdf in found:
        unique.setdefault(pdf.resolve(), pdf)

    # Sort to ensure consistent processing order
    return sorted(unique.values())


if __name__ == "__main__":
    pdf_to_process = collect_pdf_files(sys.argv[1:])

    if pdf_to_process:
        extract_images_from_pdfs(pdf_to_process)
    else:
        print("No valid PDF files found to process.")
0 commit comments