11import logging
2+ import re
23from collections import Counter
34from itertools import groupby
45from os import path , remove
78from cloudinary import api
89
910from cloudinary_cli .utils .api_utils import query_cld_folder , upload_file , download_file , get_folder_mode , \
10- get_default_upload_options , get_destination_folder_options
11- from cloudinary_cli .utils .file_utils import walk_dir , delete_empty_dirs , normalize_file_extension , posix_rel_path
11+ get_default_upload_options , get_destination_folder_options , cld_folder_exists
12+ from cloudinary_cli .utils .file_utils import (walk_dir , delete_empty_dirs , normalize_file_extension , posix_rel_path ,
13+ populate_duplicate_name )
1214from cloudinary_cli .utils .json_utils import print_json , read_json_from_file , write_json_to_file
1315from cloudinary_cli .utils .utils import logger , run_tasks_concurrently , get_user_action , invert_dict , chunker , \
14- group_params , parse_option_value
16+ group_params , parse_option_value , duplicate_values
1517
1618_DEFAULT_DELETION_BATCH_SIZE = 30
1719_DEFAULT_CONCURRENT_WORKERS = 30
@@ -43,6 +45,10 @@ def sync(local_folder, cloudinary_folder, push, pull, include_hidden, concurrent
4345 if push == pull :
4446 raise UsageError ("Please use either the '--push' OR '--pull' options" )
4547
48+ if pull and not cld_folder_exists (cloudinary_folder ):
49+ logger .error (f"Cloudinary folder '{ cloudinary_folder } ' does not exist. Aborting..." )
50+ return False
51+
4652 sync_dir = SyncDir (local_folder , cloudinary_folder , include_hidden , concurrent_workers , force , keep_unique ,
4753 deletion_batch_size , folder_mode , optional_parameter , optional_parameter_parsed )
4854
@@ -81,9 +87,12 @@ def __init__(self, local_dir, remote_dir, include_hidden, concurrent_workers, fo
8187 self .local_files = walk_dir (path .abspath (self .local_dir ), include_hidden )
8288 logger .info (f"Found { len (self .local_files )} items in local folder '{ local_dir } '" )
8389
84- self . remote_files = query_cld_folder (self .remote_dir , self .folder_mode )
85- logger .info (f"Found { len (self . remote_files )} items in Cloudinary folder '{ self .user_friendly_remote_dir } ' "
90+ raw_remote_files = query_cld_folder (self .remote_dir , self .folder_mode )
91+ logger .info (f"Found { len (raw_remote_files )} items in Cloudinary folder '{ self .user_friendly_remote_dir } ' "
8692 f"({ self .folder_mode } folder mode)" )
93+ self .remote_files = self ._normalize_remote_file_names (raw_remote_files , self .local_files )
94+ self .remote_duplicate_names = duplicate_values (self .remote_files , "normalized_path" , "asset_id" )
95+ self ._print_duplicate_file_names ()
8796
8897 local_file_names = self .local_files .keys ()
8998 remote_file_names = self .remote_files .keys ()
@@ -94,10 +103,14 @@ def __init__(self, local_dir, remote_dir, include_hidden, concurrent_workers, fo
94103 Usually Cloudinary sanitizes those file names and strips invalid characters. Although it is a good best effort
95104 for a general use case, when syncing local folder with Cloudinary, it is not the best option, since directories
96105 will be always out-of-sync.
106+
107+ In addition, in dynamic folder mode, Cloudinary allows having identical display names for different files.
97108
98109 To overcome this limitation, cloudinary-cli keeps .cld-sync hidden file in the sync directory that contains a
99110 mapping of the diverse file names. This file keeps tracking of the files and allows syncing in both directions.
100111 """
112+
113+ # handle fixed folder mode public_id differences
101114 diverse_file_names = read_json_from_file (self .sync_meta_file , does_not_exist_ok = True )
102115 self .diverse_file_names = dict (
103116 (normalize_file_extension (k ), normalize_file_extension (v )) for k , v in diverse_file_names .items ())
@@ -189,6 +202,70 @@ def pull(self):
189202 if download_errors :
190203 raise Exception ("Sync did not finish successfully" )
191204
205+ def _normalize_remote_file_names (self , remote_files , local_files ):
206+ """
207+ When multiple remote files have duplicate display name, we save them locally by appending index at the end
208+ of the base name, e.g. Image (1).jpg, Image (2).jpg, etc.
209+
210+ For consistency, we sort files by `created_at` date.
211+
212+ For partially synced files, when a remote file in the middle was deleted, we want to avoid resync
213+ of the remaining files.
214+
215+ For example, if we had: Image (1), Image (2),..., Image(5), Image (10) on Cloudinary.
216+ If we delete "Image (2)" and resync - that would cause all files from Image (3) to Image (10) to be resynced.
217+ (Image (3) would become Image (2), ... Image (10) -> Image (9))
218+
219+ Instead, since those indexes are arbitrary, we map local files to the remote files by etag (md5sum).
220+ Synced files will keep their indexes, out-of-sync files will be synced.
221+
222+ :param remote_files: Remote files.
223+ :param local_files: Local files.
224+ :return:
225+ """
226+ duplicate_ids = duplicate_values (remote_files , "normalized_path" )
227+ for duplicate_name , asset_ids in duplicate_ids .items ():
228+ duplicate_dts = sorted ([remote_files [asset_id ] for asset_id in asset_ids ], key = lambda f : f ['created_at' ])
229+ local_candidates = self ._local_candidates (duplicate_name )
230+ remainng_duplicate_dts = []
231+ for duplicate_dt in duplicate_dts :
232+ matched_name = next ((f for f in local_candidates .keys () if local_candidates [f ] == duplicate_dt ["etag" ]),
233+ None )
234+ if matched_name is None :
235+ remainng_duplicate_dts .append (duplicate_dt )
236+ continue
237+ # found local synced file.
238+ remote_files [duplicate_dt ["asset_id" ]]["normalized_unique_path" ] = matched_name
239+ local_candidates .pop (matched_name )
240+
241+ unique_paths = {v ["normalized_unique_path" ] for v in remote_files .values ()}
242+ curr_index = 0
243+ for dup in remainng_duplicate_dts :
244+ # here we check for collisions with other existing files.
245+ # remote file can have both "Image.jpg" and "Image (1).jpg", which are valid names, skip those.
246+ candidate_path = populate_duplicate_name (dup ['normalized_path' ], curr_index )
247+ while candidate_path in unique_paths :
248+ curr_index += 1
249+ candidate_path = populate_duplicate_name (dup ['normalized_path' ], curr_index )
250+ remote_files [dup ["asset_id" ]]["normalized_unique_path" ] = candidate_path
251+ curr_index += 1
252+
253+ return {dt ["normalized_unique_path" ]: dt for dt in remote_files .values ()}
254+
255+ def _local_candidates (self , candidate_path ):
256+ filename , extension = path .splitext (candidate_path )
257+ r = re .compile (f"({ candidate_path } |{ filename } \(\d+\){ extension } )" )
258+ # sort local files by base name (without ext) for accurate results.
259+ return dict (sorted ({f : self .local_files [f ]["etag" ] for f in filter (r .match , self .local_files .keys ())}.items (),
260+ key = lambda f : path .splitext (f [0 ])[0 ]))
261+
262+ def _print_duplicate_file_names (self ):
263+ if (len (self .remote_duplicate_names ) > 0 ):
264+ logger .warning (f"Cloudinary folder '{ self .user_friendly_remote_dir } ' "
265+ f"contains { len (self .remote_duplicate_names )} duplicate asset names" )
266+ for normalized_path , asset_ids in self .remote_duplicate_names .items ():
267+ logger .debug (f"Duplicate name: '{ normalized_path } ', asset ids: { ', ' .join (asset_ids )} " )
268+
192269 def _print_sync_status (self , success , errors ):
193270 logger .info ("==Sync Status==" )
194271 logger .info ("===============" )
0 commit comments