55import re
66import cv2
77import numpy as np
8+ import fitz # PyMuPDF
89
910# We'll use OpenCV's built-in Haar Cascade for face detection as it's lightweight and usually pre-installed with opencv-python
1011def download_image (url , output_path ):
@@ -24,8 +25,26 @@ def download_image(url, output_path):
2425 return False
2526
2627def smart_crop_face (image_path , output_path , target_size = (400 , 400 )):
27- # Load image
28- img = cv2 .imread (image_path )
28+ # Check if this is a PDF
29+ try :
30+ if image_path .lower ().endswith ('.pdf' ):
31+ doc = fitz .open (image_path )
32+ for page in doc :
33+ pix = page .get_pixmap ()
34+ img = np .frombuffer (pix .samples , dtype = np .uint8 ).reshape (pix .h , pix .w , pix .n )
35+ if pix .n == 4 :
36+ img = cv2 .cvtColor (img , cv2 .COLOR_RGBA2BGR )
37+ elif pix .n == 1 :
38+ img = cv2 .cvtColor (img , cv2 .COLOR_GRAY2BGR )
39+ else :
40+ img = cv2 .cvtColor (img , cv2 .COLOR_RGB2BGR )
41+ break # Just get first page image
42+ else :
43+ img = cv2 .imread (image_path )
44+ except Exception as e :
45+ print (f"Error loading image or PDF { image_path } : { e } " )
46+ return False
47+
2948 if img is None :
3049 print (f"Could not read image { image_path } " )
3150 return False
@@ -145,17 +164,34 @@ def main():
145164 filename = filename + ".jpg"
146165
147166 output_file = os .path .join (target_dir , filename )
167+
168+ # The download may be a PDF rather than an image, so fetch it to a temp file first and inspect its leading bytes afterwards
148169 temp_file = os .path .join (target_dir , f"temp_{ filename } " )
149170
150171 print (f"Processing { name } ..." )
151172
152173 if download_image (img_url , temp_file ):
153- if smart_crop_face (temp_file , output_file ):
174+ # Try to detect if it's a PDF by reading the first few bytes
175+ is_pdf = False
176+ with open (temp_file , 'rb' ) as tf :
177+ header = tf .read (4 )
178+ if header == b'%PDF' :
179+ is_pdf = True
180+
181+ # Give a PDF temp file a ".pdf" suffix, because smart_crop_face only takes the PyMuPDF path when the path ends in ".pdf"
182+ proc_file = temp_file
183+ if is_pdf :
184+ proc_file = temp_file + ".pdf"
185+ os .rename (temp_file , proc_file )
186+
187+ if smart_crop_face (proc_file , output_file ):
154188 print (f" -> Saved smart cropped image to { output_file } " )
155189 else :
156190 print (f" -> Failed to process image" )
157191
158192 # Cleanup temp file
193+ if os .path .exists (proc_file ):
194+ os .remove (proc_file )
159195 if os .path .exists (temp_file ):
160196 os .remove (temp_file )
161197 else :
0 commit comments