55
66import os
77import cv2
8+ import fastdup .definitions
89import numpy as np
910import base64
1011import io
12+
13+ import pandas as pd
1114from fastdup .definitions import *
1215from fastdup .sentry import fastdup_capture_exception
1316import tarfile
1417import platform
18+ import pathlib
19+ from PIL import Image
20+ from pillow_heif import register_heif_opener
21+
22+ register_heif_opener ()
23+
24+
1525
def safe_replace(path):
    """
    Flatten a path into a string that is safe to use as a single file name.

    Every forward slash, backslash and colon in path is replaced by an underscore.
    All other characters are kept as they are.

    Args:
        path (str): the path (or any string) to flatten

    Returns:
        str: path with '/', backslash and ':' each replaced by '_'
    """
    # A single translate pass handles all three characters; the backslash entry is a
    # lone backslash so that Windows separators are actually replaced.
    return path.translate(str.maketrans({'/': '_', '\\': '_', ':': '_'}))
@@ -98,6 +108,32 @@ def truncate_folder_name(path):
98108 return None
99109
100110
111+
def inner_read(img1_path):
    """
    Read an image from a local file into a numpy array.

    Files ending with .heic / .heif (case-insensitive) are decoded with PIL and
    converted to 3 channel BGR. Every other file is read with
    cv2.imread(..., cv2.IMREAD_UNCHANGED); uint16 images are min-max normalized to
    8 bit, grayscale images are expanded to 3 channels and the 4th channel of
    4 channel images is dropped.

    Args:
        img1_path (str): local path of the image file

    Returns:
        img (np.array): the decoded image

    Raises:
        AssertionError: when cv2.imread cannot decode a non HEIC/HEIF file.
        Failures on the HEIC/HEIF path surface as the exception raised by PIL.
    """
    if img1_path.lower().endswith(('.heic', '.heif')):
        # Image.open is lazy and keeps the file handle open; np.array() forces the
        # decode inside the with block so the handle is released right after.
        with Image.open(img1_path) as pil_img:
            img = np.array(pil_img)
        channels = img.shape[-1] if img.ndim == 3 else 1
        # PIL yields RGB(A) ordering, the rest of the code works with OpenCV's BGR.
        if channels == 1:
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
        elif channels == 4:
            img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
        else:
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        return img

    img = cv2.imread(img1_path, cv2.IMREAD_UNCHANGED)
    assert img is not None, f"Failed to open image from {img1_path} "
    if img.dtype == 'uint16':
        # Stretch the 16 bit range to 0..255 and store the result as 8 bit.
        img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX, cv2.CV_8U)
    channels = img.shape[-1] if img.ndim == 3 else 1
    if channels == 1:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    elif channels == 4:
        # Only removes the 4th (alpha) channel, the order of the colour channels is kept.
        img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
    return img
136+
101137def fastdup_imread (img1_path , input_dir , kwargs ):
102138 """
103139 Read an image from local file, or from a tar file, or from s3/minio path using minio client mc
@@ -108,22 +144,22 @@ def fastdup_imread(img1_path, input_dir, kwargs):
108144 Returns:
109145 img1 (np.array): the image
110146 """
111- assert img1_path is not None , f"img1_path should not be None { input_dir } , { kwargs } "
112-
147+ assert not pd .isnull (img1_path ), f"img1_path should not be None { img1_path } { input_dir } , { kwargs } "
113148 is_minio_or_s3 = False
114- if input_dir is not None :
149+ if input_dir is not None and (isinstance (input_dir , str ) or isinstance (input_dir , pathlib .Path )):
150+ if input_dir .startswith ('~/' ):
151+ input_dir = os .path .expanduser (input_dir )
115152 if not input_dir .startswith ("s3://" ) and not input_dir .startswith ("minio://" ):
116153 assert os .path .exists (input_dir ), "Failed to find input_dir: " + input_dir
117154 else :
118155 is_minio_or_s3 = True
119156
120-
157+ if img1_path .startswith ('~/' ):
158+ img1_path = os .path .expanduser (img1_path )
121159 if os .path .exists (img1_path ):
122- img = cv2 .imread (img1_path , cv2 .IMREAD_UNCHANGED )
123- if img is not None :
124- if img .dtype == 'uint16' :
125- img = cv2 .normalize (img , None , 0 , 255 , cv2 .NORM_MINMAX , cv2 .CV_8U )
126- img = cv2 .cvtColor (img , cv2 .COLOR_GRAY2RGB )
160+ img = inner_read (img1_path )
161+
162+
127163 return img
128164 elif ('/' + S3_TEMP_FOLDER + '/' in img1_path or '/' + S3_TEST_TEMP_FOLDER + '/' in img1_path ) and \
129165 '.tar/' in img1_path :
@@ -150,38 +186,68 @@ def fastdup_imread(img1_path, input_dir, kwargs):
150186 minio_prefix = "/" .join (input_dir .replace ("minio://" , "" ).split ('/' )[:2 ])
151187 #print('minio_prefix', minio_prefix)
152188 download_minio (minio_prefix + '/' + local_dir_no_temp + '/' + os .path .basename (img1_path ), S3_TEMP_FOLDER )
153- ret = cv2 . imread (os .path .join (S3_TEMP_FOLDER , os .path .basename (img1_path )))
189+ ret = inner_read (os .path .join (S3_TEMP_FOLDER , os .path .basename (img1_path )))
154190 assert ret is not None , f"Failed to read image { os .path .join (S3_TEMP_FOLDER , os .path .basename (img1_path ))} "
155191 return ret
156192 elif input_dir .startswith ("s3://" ):
157193 local_dir_no_temp = truncate_folder_name (os .path .dirname (img1_path ))
158194 s3_prefix = 's3://' + "/" .join (input_dir .replace ("s3://" , "" ).split ('/' )[:1 ])
159195 #print('s3_prefix', s3_prefix)
160196 download_s3 (s3_prefix + '/' + local_dir_no_temp + '/' + os .path .basename (img1_path ), S3_TEMP_FOLDER )
161- ret = cv2 .imread (os .path .join (S3_TEMP_FOLDER , os .path .basename (img1_path )))
162- assert ret is not None , f"Failed to read image { os .path .join (S3_TEMP_FOLDER , os .path .basename (img1_path ))} "
197+ ret = inner_read (os .path .join (S3_TEMP_FOLDER , os .path .basename (img1_path )))
163198 return ret
164199 #Failed to read image1 ..\milvus_vector_db\data\images\..\milvus_vector_db\data\images\Egyptian_Mau_210.jpg
165200 elif img1_path .startswith (input_dir ) and len (img1_path ) >= len (input_dir ) + 2 :
166201 suffix = img1_path [len (input_dir ):]
167202 if input_dir in suffix and os .path .exists (suffix ):
168- img = cv2 .imread (suffix , cv2 .IMREAD_UNCHANGED )
169- if img is not None :
170- if img .dtype == 'uint16' :
171- img = cv2 .normalize (img , None , 0 , 255 , cv2 .NORM_MINMAX , cv2 .CV_8U )
172- img = cv2 .cvtColor (img , cv2 .COLOR_GRAY2RGB )
203+ img = inner_read (suffix )
173204 return img
174205 elif "''" in img1_path : # try to handle french and other languages where c side doubles the '' otherwise pandas can't read it
175206 new_img1_path = img1_path .replace ("''" ,"" )
176207 if os .path .exists (new_img1_path ):
177- img = cv2 . imread (new_img1_path , cv2 . IMREAD_UNCHANGED )
208+ img = inner_read (new_img1_path )
178209 return img
179210
180211
181212 print ('Failed to read image from img_path' , img1_path )
182213 return None
183214
184215
def check_valid_image_extension(filename):
    """
    Check whether a file name ends with an image extension (required by OpenCV imwrite).

    Args:
        filename (str): file name or path to check; it is lower-cased before the comparison

    Returns:
        bool: True when the lower-cased filename ends with one of SUPPORTED_IMG_FORMATS
    """
    lowered = filename.lower()
    for extension in SUPPORTED_IMG_FORMATS:
        if lowered.endswith(extension):
            return True
    return False
220+
221+
def fastdup_imwrite(local_file, im):
    """
    Save an image to local_file with cv2.imwrite.

    When local_file does not end with a supported image extension, the image is
    written to local_file + '.jpg' and then renamed back to local_file.
    When the write fails and local_file is 254 characters or longer, the image is
    written under a random uuid name (relative path, so in the current working
    directory) and then moved to local_file.

    Args:
        local_file (str): destination path of the image
        im (np.array): the image to save

    Raises:
        AssertionError: when the image could not be written, or when local_file
        does not exist after the write.
    """
    too_long_msg = f"Failed to save img to {local_file} most likely filename is too long for the OS"

    if check_valid_image_extension(local_file):
        ret = cv2.imwrite(local_file, im)
    else:
        local_file_wext = local_file + '.jpg'
        ret = cv2.imwrite(local_file_wext, im)
        assert ret, too_long_msg

        # Rename back if extension was added. os.replace overwrites an existing
        # destination on every platform, os.rename raises on Windows in that case.
        os.replace(local_file_wext, local_file)
        assert os.path.isfile(local_file), "Failed to save img to " + local_file

    if not ret and len(local_file) >= 254:
        import uuid
        import shutil
        _, ext = os.path.splitext(local_file)
        tmp_filename = str(uuid.uuid4()) + ext
        ret = cv2.imwrite(tmp_filename, im)
        if ret:
            # Only replace the destination once the temporary image really exists.
            if os.path.exists(local_file):
                os.unlink(local_file)
            shutil.move(tmp_filename, local_file)
        elif os.path.exists(tmp_filename):
            # Do not leave a partial temporary file behind.
            os.unlink(tmp_filename)
        assert ret, too_long_msg

    assert ret, f"Failed to save img to {local_file} "
    assert os.path.isfile(local_file), "Failed to save img to " + local_file
250+
185251def get_type (str ):
186252 if 'train' in str :
187253 return 'train'
@@ -282,17 +348,7 @@ def draw_text(img, text,
282348
283349 return text_size , img
284350
285- def create_triplet_img (row , work_dir , save_path , extract_filenames , get_bounding_box_func = None , input_dir = None , kwargs = None ):
286- #v1 = 'id_to_filename_func' in kwargs
287- id_from , id_to = row ['from' ], row ['to' ]
288- #if v1:
289- # assert not isinstance(id_from, str), f"Wrong type {row}"
290-
291- #suffix_from, suffix_to = (f'_{id_from}', f'_{id_to}') if v1 else ('', '')
292- #if v1:
293- # id_to_filename_func = kwargs['id_to_filename_func']
294- # row[['from','to']] = [id_to_filename_func(row['from']), id_to_filename_func(row['to'])]
295-
351+ def create_triplet_img (index , row , work_dir , save_path , extract_filenames , get_bounding_box_func = None , input_dir = None , kwargs = None ):
296352 img1_path , img2_path , distance , ptype = extract_filenames (row , work_dir , save_path , kwargs )
297353
298354 img1 = fastdup_imread (img1_path , input_dir , kwargs )
@@ -301,6 +357,10 @@ def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding
301357 assert img1 is not None , f"Failed to read image1 { img1_path } { str (input_dir )} "
302358 assert img2 is not None , f"Failed to read image2 { img2_path } { str (input_dir )} "
303359
360+ if 'crop_filename_from' in row and 'crop_filename_to' in row :
361+ id_from , id_to = row ['crop_filename_from' ], row ['crop_filename_to' ]
362+ else :
363+ id_from , id_to = row ['from' ], row ['to' ]
304364 img1 = plot_bounding_box (img1 , get_bounding_box_func , id_from )
305365 img2 = plot_bounding_box (img2 , get_bounding_box_func , id_to )
306366
@@ -317,9 +377,20 @@ def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding
317377 if rimg1 .shape != rimg2 .shape : # combination of grayscale and color
318378 if len (rimg1 .shape ) == 2 :
319379 rimg1 = cv2 .cvtColor (rimg1 , cv2 .COLOR_GRAY2RGB )
380+ elif len (rimg1 .shape ) == 3 and rimg1 .shape [2 ] == 4 :
381+ rimg1 = cv2 .cvtColor (rimg1 , cv2 .COLOR_RGBA2RGB )
320382 if len (rimg2 .shape ) == 2 :
321383 rimg2 = cv2 .cvtColor (rimg2 , cv2 .COLOR_GRAY2RGB )
322- cimage = cv2 .addWeighted (rimg1 ,alpha ,rimg2 ,1 - alpha ,0 )
384+ elif len (rimg1 .shape ) == 3 and rimg2 .shape [2 ] == 4 :
385+ rimg2 = cv2 .cvtColor (rimg2 , cv2 .COLOR_RGBA2RGB )
386+
387+ error_weighted = False
388+ try :
389+ cimage = cv2 .addWeighted (rimg1 ,alpha ,rimg2 ,1 - alpha ,0 )
390+ except Exception as ex :
391+ error_weighted = True
392+ fastdup_capture_exception ("create_triplet_image" , ex , True , f"Dimes are { rimg1 .shape } { rimg2 .shape } " )
393+
323394
324395 hierarchical_run = kwargs is not None and 'hierarchical_run' in kwargs and kwargs ['hierarchical_run' ]
325396 text1 = os .path .splitext (os .path .basename (img1_path ))[0 ]
@@ -330,11 +401,11 @@ def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding
330401
331402 (w , h ),nimg1 = draw_text (rimg1 , text1 , font_scale = 1 , pos = (10 , 10 ))
332403 (w , h ),nimg2 = draw_text (rimg2 , text2 , font_scale = 1 , pos = (10 , 10 ))
333- (w , h ),cimage = draw_text (cimage , 'blended image' , font_scale = 1 , pos = (10 , 10 ))
404+ if not error_weighted :
405+ (w , h ),cimage = draw_text (cimage , 'blended image' , font_scale = 1 , pos = (10 , 10 ))
406+ assert cimage .shape [0 ] > 0 and cimage .shape [1 ] > 0
334407
335- assert cimage .shape [0 ] > 0 and cimage .shape [1 ] > 0
336-
337- if hierarchical_run :
408+ if hierarchical_run or error_weighted :
338409 hcon_img = hconcat_resize_min ([nimg1 , nimg2 ])
339410 else :
340411 hcon_img = hconcat_resize_min ([nimg1 , nimg2 , cimage ])
@@ -355,11 +426,9 @@ def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding
355426 lazy_load = 'lazy_load' in kwargs and kwargs ['lazy_load' ]
356427 if lazy_load :
357428 os .makedirs (os .path .join (save_path , 'images' ), exist_ok = True )
358- hcon_img_path = f'{ save_path } /images/{ pid } .jpg'
429+ hcon_img_path = f'{ save_path } /images/{ pid } _ { index } .jpg'
359430 else :
360- hcon_img_path = f'{ save_path } /{ pid } .jpg'
361- cv2 .imwrite (hcon_img_path , hcon_img )
362- assert os .path .exists (hcon_img_path ), f"Failed to write image to { hcon_img_path } "
363-
431+ hcon_img_path = f'{ save_path } /{ pid } _{ index } .jpg'
432+ fastdup_imwrite (hcon_img_path , hcon_img )
364433 return hcon_img , hcon_img_path
365434
0 commit comments