Commit 6fd7447

Improve sync of files with special characters
1 parent 0473eab

6 files changed: 100 additions & 35 deletions

cloudinary_cli/modules/sync.py

Lines changed: 70 additions & 21 deletions
@@ -1,20 +1,21 @@
 import logging
 from functools import reduce
 from itertools import product
-from os import remove
-from os.path import join as path_join, abspath
+from os import path, remove

 from click import command, argument, option, style
 from cloudinary import api

 from cloudinary_cli.utils.api_utils import query_cld_folder, upload_file, download_file
 from cloudinary_cli.utils.file_utils import walk_dir, delete_empty_dirs, get_destination_folder
-from cloudinary_cli.utils.json_utils import print_json
-from cloudinary_cli.utils.utils import logger, run_tasks_concurrently, get_user_action
+from cloudinary_cli.utils.json_utils import print_json, read_json_from_file, write_json_to_file
+from cloudinary_cli.utils.utils import logger, run_tasks_concurrently, get_user_action, invert_dict

 _DEFAULT_DELETION_BATCH_SIZE = 30
 _DEFAULT_CONCURRENT_WORKERS = 30

+_SYNC_META_FILE = '.cld-sync'
+

 @command("sync",
          short_help="Synchronize between a local directory and a Cloudinary folder.",
@@ -50,31 +51,52 @@ class SyncDir:
     def __init__(self, local_dir, remote_dir, include_hidden, concurrent_workers, force, keep_deleted,
                  deletion_batch_size):
         self.local_dir = local_dir
-        self.remote_dir = remote_dir
+        self.remote_dir = remote_dir.strip('/')
         self.include_hidden = include_hidden
         self.concurrent_workers = concurrent_workers
         self.force = force
         self.keep_unique = keep_deleted
         self.deletion_batch_size = deletion_batch_size

+        self.sync_meta_file = path.join(self.local_dir, _SYNC_META_FILE)
+
         self.verbose = logger.getEffectiveLevel() < logging.INFO

-        self.local_files = walk_dir(abspath(self.local_dir), include_hidden)
+        self.local_files = walk_dir(path.abspath(self.local_dir), include_hidden)
         logger.info(f"Found {len(self.local_files)} items in local folder '{local_dir}'")

         self.remote_files = query_cld_folder(self.remote_dir)
         logger.info(f"Found {len(self.remote_files)} items in Cloudinary folder '{self.remote_dir}'")

         local_file_names = self.local_files.keys()
         remote_file_names = self.remote_files.keys()
+        """
+        Cloudinary is a very permissive service. When uploaded files contain invalid or
+        unicode characters, Cloudinary makes a best effort to store them.
+
+        Usually Cloudinary sanitizes those file names and strips the invalid characters. Although that
+        is a reasonable best effort for the general use case, it is not the best option when syncing
+        a local folder with Cloudinary, since the directories would always be out of sync.
+
+        To overcome this limitation, cloudinary-cli keeps a hidden .cld-sync file in the sync directory
+        that contains a mapping of the diverse file names. This file keeps track of those files and
+        allows syncing in both directions.
+        """
+        self.diverse_file_names = read_json_from_file(self.sync_meta_file, does_not_exist_ok=True)
+        inverted_diverse_file_names = invert_dict(self.diverse_file_names)
+
+        cloudinarized_local_file_names = [self.diverse_file_names.get(f, f) for f in local_file_names]
+        self.recovered_remote_files = {inverted_diverse_file_names.get(f, f): dt for f, dt in self.remote_files.items()}
+
+        self.unique_remote_file_names = remote_file_names - cloudinarized_local_file_names
+        self.unique_local_file_names = local_file_names - self.recovered_remote_files.keys()

-        self.unique_remote_file_names = remote_file_names - local_file_names
-        self.unique_local_file_names = local_file_names - remote_file_names
         common_file_names = local_file_names - self.unique_local_file_names

-        self.out_of_sync_file_names = self._get_out_of_sync_file_names(common_file_names)
+        self.out_of_sync_local_file_names = self._get_out_of_sync_file_names(common_file_names)
+        self.out_of_sync_remote_file_names = set(self.diverse_file_names.get(f, f) for f in
+                                                 self.out_of_sync_local_file_names)

-        skipping = len(common_file_names) - len(self.out_of_sync_file_names)
+        skipping = len(common_file_names) - len(self.out_of_sync_local_file_names)

         if skipping:
             logger.info(f"Skipping {skipping} items")
@@ -83,12 +105,16 @@ def _get_out_of_sync_file_names(self, common_file_names):
         logger.debug("\nCalculating differences...\n")
         out_of_sync_file_names = set()
         for f in common_file_names:
-            if self.local_files[f]['etag'] != self.remote_files[f]['etag']:
-                logger.warning(f"{f} is out of sync")
-                logger.debug(f"Local etag: {self.local_files[f]['etag']}. Remote etag: {self.remote_files[f]['etag']}")
+            local_etag = self.local_files[f]['etag']
+            remote_etag = self.recovered_remote_files[f]['etag']
+            if local_etag != remote_etag:
+                logger.warning(f"{f} is out of sync" +
+                               (f" with '{self.diverse_file_names[f]}'" if f in self.diverse_file_names else ""))
+                logger.debug(f"Local etag: {local_etag}. Remote etag: {remote_etag}")
                 out_of_sync_file_names.add(f)
                 continue
-            logger.debug(f"{f} is in sync")
+            logger.debug(f"'{f}' is in sync" +
+                         (f" with '{self.diverse_file_names[f]}'" if f in self.diverse_file_names else ""))

         return out_of_sync_file_names

@@ -97,7 +123,7 @@ def push(self):
             logger.info("Aborting...")
             return False

-        files_to_push = self.unique_local_file_names | self.out_of_sync_file_names
+        files_to_push = self.unique_local_file_names | self.out_of_sync_local_file_names
         if not files_to_push:
             return True

@@ -109,14 +135,37 @@ def push(self):
             'invalidate': True,
             'resource_type': 'auto'
         }
+        upload_results = {}
         uploads = []
         for file in files_to_push:
             folder = get_destination_folder(self.remote_dir, file)

-            uploads.append((self.local_files[file]['path'], {**options, 'folder': folder}))
+            uploads.append((self.local_files[file]['path'], {**options, 'folder': folder}, upload_results))

         run_tasks_concurrently(upload_file, uploads, self.concurrent_workers)

+        self.save_sync_meta_file(upload_results)
+
+    def save_sync_meta_file(self, upload_results):
+        diverse_filenames = {}
+        for local_path, remote_path in upload_results.items():
+            local = path.relpath(local_path, self.local_dir)
+            remote = path.relpath(remote_path, self.remote_dir)
+            if local != remote:
+                diverse_filenames[local] = remote
+
+        # Filter out outdated meta file entries.
+        current_diverse_files = {k: v for k, v in self.diverse_file_names.items() if k in self.local_files.keys()}
+
+        if diverse_filenames or current_diverse_files != self.diverse_file_names:
+            current_diverse_files.update(diverse_filenames)
+            try:
+                write_json_to_file(current_diverse_files, self.sync_meta_file)
+                logger.debug(f"Updated '{self.sync_meta_file}' file")
+            except Exception as e:
+                # The meta file is not critical for the sync itself; if we cannot write it, just log a warning.
+                logger.warning(f"Failed updating '{self.sync_meta_file}' file: {e}")
+
     def _handle_unique_remote_files(self):
         handled = self._handle_files_deletion(len(self.unique_remote_file_names), "remote")
         if handled is not None:
@@ -155,7 +204,7 @@ def pull(self):
         if not self._handle_unique_local_files():
             return False

-        files_to_pull = self.unique_remote_file_names | self.out_of_sync_file_names
+        files_to_pull = self.unique_remote_file_names | self.out_of_sync_remote_file_names

         if not files_to_pull:
             return True
@@ -164,7 +213,7 @@ def pull(self):
         downloads = []
         for file in files_to_pull:
             remote_file = self.remote_files[file]
-            local_path = abspath(path_join(self.local_dir, file))
+            local_path = path.abspath(path.join(self.local_dir, file))

             downloads.append((remote_file, local_path))

@@ -177,9 +226,9 @@ def _handle_unique_local_files(self):

         logger.info(f"Deleting {len(self.unique_local_file_names)} local files...")
         for file in self.unique_local_file_names:
-            path = abspath(self.local_files[file]['path'])
-            remove(path)
-            logger.info(f"Deleted '{path}'")
+            full_path = path.abspath(self.local_files[file]['path'])
+            remove(full_path)
+            logger.info(f"Deleted '{full_path}'")

         logger.info("Deleting empty folders...")
         delete_empty_dirs(self.local_dir)
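
To make the mapping concrete, here is a minimal sketch of how the .cld-sync data resolves names in both directions. The file names and mapping below are hypothetical examples, not taken from the commit:

# Hypothetical .cld-sync contents: local relative path -> sanitized Cloudinary path.
diverse_file_names = {'photos/café ☕.jpg': 'photos/café_.jpg'}

# Push direction: translate local names to their Cloudinary form before comparing.
local_file_names = {'photos/café ☕.jpg', 'photos/plain.jpg'}
cloudinarized = {diverse_file_names.get(f, f) for f in local_file_names}

# Pull direction: invert the mapping to recover local names from remote ones.
inverted = {v: k for k, v in diverse_file_names.items()}
remote_file_names = {'photos/café_.jpg', 'photos/plain.jpg'}
recovered = {inverted.get(f, f) for f in remote_file_names}

assert cloudinarized == remote_file_names  # the folders now compare as in sync
assert recovered == local_file_names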

cloudinary_cli/utils/api_utils.py

Lines changed: 2 additions & 2 deletions
@@ -42,7 +42,7 @@ def query_cld_folder(folder):


 def upload_file(file_path, options, uploaded=None, skipped=None):
-    uploaded = uploaded if uploaded is not None else []
+    uploaded = uploaded if uploaded is not None else {}
     skipped = skipped if skipped is not None else []
     verbose = logger.getEffectiveLevel() < logging.INFO

@@ -55,7 +55,7 @@ def upload_file(file_path, options, uploaded=None, skipped=None):
         logger.info(style(f"Successfully uploaded {file_path} as {result['public_id']}", fg="green"))
         if verbose:
             print_json(result)
-        uploaded.append(result['public_id'])
+        uploaded[file_path] = asset_source(result)
     except Exception as e:
         log_exception(e, f"Failed uploading {file_path}")
         skipped.append(file_path)
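
With this change, upload_file aggregates results into a shared dict instead of a list, which is what lets sync.py build the local-to-remote mapping across concurrent uploads (asset_source is the existing helper the diff calls to derive the remote path from the upload response). A usage sketch, assuming a configured Cloudinary environment; the paths and resulting mapping are made-up examples:

from cloudinary_cli.utils.api_utils import upload_file
from cloudinary_cli.utils.utils import run_tasks_concurrently

upload_results = {}  # shared dict, filled by each upload task
uploads = [
    ('/tmp/photos/café ☕.jpg', {'folder': 'photos', 'resource_type': 'auto'}, upload_results),
]
run_tasks_concurrently(upload_file, uploads, 30)
# upload_results now maps each local path to its remote counterpart, e.g.
# {'/tmp/photos/café ☕.jpg': 'photos/café_.jpg'} (illustrative values only).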

cloudinary_cli/utils/config_utils.py

Lines changed: 2 additions & 7 deletions
@@ -10,10 +10,7 @@


 def load_config():
-    if not os.path.exists(CLOUDINARY_CLI_CONFIG_FILE) or os.path.getsize(CLOUDINARY_CLI_CONFIG_FILE) < 1:
-        return {}
-
-    return read_json_from_file(CLOUDINARY_CLI_CONFIG_FILE)
+    return read_json_from_file(CLOUDINARY_CLI_CONFIG_FILE, does_not_exist_ok=True)


 def save_config(config):
@@ -68,9 +65,7 @@ def migrate_old_config():
                 f"please fix or remove it")
         raise

-    new_config = load_config()
-    new_config.update(old_config)
-    save_config(new_config)
+    update_config(old_config)

     os.remove(OLD_CLOUDINARY_CLI_CONFIG_FILE)
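
update_config itself is not shown in this diff; presumably it merges the given values into the CLI config file, which could now be a thin wrapper over the new update_json_file helper. A sketch under that assumption (CLOUDINARY_CLI_CONFIG_FILE is the module's existing constant):

from cloudinary_cli.utils.json_utils import update_json_file

def update_config(new_config):
    # Assumed implementation: merge new_config into the config file on disk.
    update_json_file(new_config, CLOUDINARY_CLI_CONFIG_FILE)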
cloudinary_cli/utils/file_utils.py

Lines changed: 5 additions & 1 deletion
@@ -35,7 +35,11 @@ def is_hidden_path(filepath):


 def has_hidden_attribute(filepath):
-    st = os.stat(filepath)
+    try:
+        st = os.stat(filepath)
+    except OSError as e:
+        logger.debug(f"Failed getting os.stat for file '{filepath}': {e}")
+        return False

     if not hasattr(st, 'st_file_attributes'):  # not a pythonic way, but it's relevant only for windows, no need to try
         return False
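
The try/except guard matters because os.stat raises OSError for paths it cannot stat; a dangling symlink is a typical (assumed) trigger, and previously it would have aborted the whole directory walk. A small POSIX demonstration (creating symlinks on Windows may require privileges):

import os
import tempfile

# A dangling symlink makes os.stat raise OSError (FileNotFoundError),
# which the new guard turns into a plain "not hidden" answer.
work_dir = tempfile.mkdtemp()
link = os.path.join(work_dir, 'dangling')
os.symlink(os.path.join(work_dir, 'missing-target'), link)

assert not os.path.exists(link)  # the link's target does not exist
# has_hidden_attribute(link) now returns False instead of raising.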

cloudinary_cli/utils/json_utils.py

Lines changed: 13 additions & 4 deletions
@@ -1,18 +1,27 @@
 import json
 from platform import system
-
+from os import path
 import click
 from pygments import highlight, lexers, formatters


+def read_json_from_file(filename, does_not_exist_ok=False):
+    if does_not_exist_ok and (not path.exists(filename) or path.getsize(filename) < 1):
+        return {}
+
+    with open(filename, 'r') as file:
+        return json.loads(file.read() or "{}")
+
+
 def write_json_to_file(json_obj, filename, indent=2, sort_keys=False):
     with open(filename, 'w') as file:
         json.dump(json_obj, file, indent=indent, sort_keys=sort_keys)


-def read_json_from_file(filename):
-    with open(filename, 'r') as file:
-        return json.loads(file.read() or "{}")
+def update_json_file(json_obj, filename, indent=2, sort_keys=False):
+    curr_obj = read_json_from_file(filename, True)
+    curr_obj.update(json_obj)
+    write_json_to_file(curr_obj, filename, indent, sort_keys)


 def print_json(res):
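
Taken together, these helpers give a read-merge-write cycle for JSON files, with does_not_exist_ok=True treating a missing or empty file as an empty object. A short usage sketch, assuming the package is importable:

import os
import tempfile

from cloudinary_cli.utils.json_utils import read_json_from_file, update_json_file

config_path = os.path.join(tempfile.mkdtemp(), 'settings.json')

assert read_json_from_file(config_path, does_not_exist_ok=True) == {}  # missing file -> {}
update_json_file({'a': 1}, config_path)  # creates the file
update_json_file({'b': 2}, config_path)  # merges into the existing contents
assert read_json_from_file(config_path) == {'a': 1, 'b': 2}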

cloudinary_cli/utils/utils.py

Lines changed: 8 additions & 0 deletions
@@ -102,6 +102,14 @@ def remove_string_prefix(string, prefix):
     return string[string.startswith(prefix) and len(prefix):]


+def invert_dict(d):
+    inv_dict = {}
+    for k, v in d.items():
+        inv_dict[v] = k
+
+    return inv_dict
+
+
 def write_json_list_to_csv(json_list, filename, fields_to_keep=()):
     with open(f'{filename}.csv', 'w') as f:
         if not fields_to_keep:
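
invert_dict is equivalent to a dict comprehension; the caveat worth knowing for the .cld-sync mapping is that duplicate values collapse, with the last key seen winning (the sample names below are hypothetical):

from cloudinary_cli.utils.utils import invert_dict

assert invert_dict({'local name.jpg': 'remote_name.jpg'}) == {'remote_name.jpg': 'local name.jpg'}

# Equivalent one-liner; duplicate values collapse to the last key seen:
assert {v: k for k, v in {'a': 1, 'b': 1}.items()} == {1: 'b'}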
