Skip to content

Commit c756f56

Browse files
Fix sync with duplicate display_name in dynamic folders mode
1 parent 7b3f392 commit c756f56

File tree

6 files changed

+186
-18
lines changed

6 files changed

+186
-18
lines changed

cloudinary_cli/modules/sync.py

Lines changed: 82 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
import re
23
from collections import Counter
34
from itertools import groupby
45
from os import path, remove
@@ -7,11 +8,12 @@
78
from cloudinary import api
89

910
from cloudinary_cli.utils.api_utils import query_cld_folder, upload_file, download_file, get_folder_mode, \
10-
get_default_upload_options, get_destination_folder_options
11-
from cloudinary_cli.utils.file_utils import walk_dir, delete_empty_dirs, normalize_file_extension, posix_rel_path
11+
get_default_upload_options, get_destination_folder_options, cld_folder_exists
12+
from cloudinary_cli.utils.file_utils import (walk_dir, delete_empty_dirs, normalize_file_extension, posix_rel_path,
13+
populate_duplicate_name)
1214
from cloudinary_cli.utils.json_utils import print_json, read_json_from_file, write_json_to_file
1315
from cloudinary_cli.utils.utils import logger, run_tasks_concurrently, get_user_action, invert_dict, chunker, \
14-
group_params, parse_option_value
16+
group_params, parse_option_value, duplicate_values
1517

1618
_DEFAULT_DELETION_BATCH_SIZE = 30
1719
_DEFAULT_CONCURRENT_WORKERS = 30
@@ -43,6 +45,10 @@ def sync(local_folder, cloudinary_folder, push, pull, include_hidden, concurrent
4345
if push == pull:
4446
raise UsageError("Please use either the '--push' OR '--pull' options")
4547

48+
if pull and not cld_folder_exists(cloudinary_folder):
49+
logger.error(f"Cloudinary folder '{cloudinary_folder}' does not exist. Aborting...")
50+
return False
51+
4652
sync_dir = SyncDir(local_folder, cloudinary_folder, include_hidden, concurrent_workers, force, keep_unique,
4753
deletion_batch_size, folder_mode, optional_parameter, optional_parameter_parsed)
4854

@@ -81,9 +87,12 @@ def __init__(self, local_dir, remote_dir, include_hidden, concurrent_workers, fo
8187
self.local_files = walk_dir(path.abspath(self.local_dir), include_hidden)
8288
logger.info(f"Found {len(self.local_files)} items in local folder '{local_dir}'")
8389

84-
self.remote_files = query_cld_folder(self.remote_dir, self.folder_mode)
85-
logger.info(f"Found {len(self.remote_files)} items in Cloudinary folder '{self.user_friendly_remote_dir}' "
90+
raw_remote_files = query_cld_folder(self.remote_dir, self.folder_mode)
91+
logger.info(f"Found {len(raw_remote_files)} items in Cloudinary folder '{self.user_friendly_remote_dir}' "
8692
f"({self.folder_mode} folder mode)")
93+
self.remote_files = self._normalize_remote_file_names(raw_remote_files, self.local_files)
94+
self.remote_duplicate_names = duplicate_values(self.remote_files, "normalized_path", "asset_id")
95+
self._print_duplicate_file_names()
8796

8897
local_file_names = self.local_files.keys()
8998
remote_file_names = self.remote_files.keys()
@@ -94,10 +103,14 @@ def __init__(self, local_dir, remote_dir, include_hidden, concurrent_workers, fo
94103
Usually Cloudinary sanitizes those file names and strips invalid characters. Although it is a good best effort
95104
for a general use case, when syncing local folder with Cloudinary, it is not the best option, since directories
96105
will be always out-of-sync.
106+
107+
In addition in dynamic folder mode Cloudinary allows having identical display names for differrent files.
97108
98109
To overcome this limitation, cloudinary-cli keeps .cld-sync hidden file in the sync directory that contains a
99110
mapping of the diverse file names. This file keeps tracking of the files and allows syncing in both directions.
100111
"""
112+
113+
# handle fixed folder mode public_id differences
101114
diverse_file_names = read_json_from_file(self.sync_meta_file, does_not_exist_ok=True)
102115
self.diverse_file_names = dict(
103116
(normalize_file_extension(k), normalize_file_extension(v)) for k, v in diverse_file_names.items())
@@ -189,6 +202,70 @@ def pull(self):
189202
if download_errors:
190203
raise Exception("Sync did not finish successfully")
191204

205+
def _normalize_remote_file_names(self, remote_files, local_files):
206+
"""
207+
When multiple remote files have duplicate display name, we save them locally by appending index at the end
208+
of the base name, e.g. Image (1).jpg, Image (2).jpg, etc.
209+
210+
For consistency, we sort files by `created_at` date.
211+
212+
For partially synced files, when a remote file in the middle was deleted, we want to avoid resync
213+
of the remaining files.
214+
215+
For example, if we had: Image (1), Image (2),..., Image(5), Image (10) on Cloudinary.
216+
If we delete "Image (2)" and resync - that would cause all files from Image (3) to Image (10) to be resynced.
217+
(Image (3) would become Image (2), ... Image (10) -> Image (9))
218+
219+
Instead, since those indexes are arbitrary, we map local files to the remote files by etag (md5sum).
220+
Synced files will keep their indexes, out-of-sync files will be synced.
221+
222+
:param remote_files: Remote files.
223+
:param local_files: Local files.
224+
:return:
225+
"""
226+
duplicate_ids = duplicate_values(remote_files, "normalized_path")
227+
for duplicate_name, asset_ids in duplicate_ids.items():
228+
duplicate_dts = sorted([remote_files[asset_id] for asset_id in asset_ids], key=lambda f: f['created_at'])
229+
local_candidates = self._local_candidates(duplicate_name)
230+
remainng_duplicate_dts = []
231+
for duplicate_dt in duplicate_dts:
232+
matched_name = next((f for f in local_candidates.keys() if local_candidates[f] == duplicate_dt["etag"]),
233+
None)
234+
if matched_name is None:
235+
remainng_duplicate_dts.append(duplicate_dt)
236+
continue
237+
# found local synced file.
238+
remote_files[duplicate_dt["asset_id"]]["normalized_unique_path"] = matched_name
239+
local_candidates.pop(matched_name)
240+
241+
unique_paths = {v["normalized_unique_path"] for v in remote_files.values()}
242+
curr_index = 0
243+
for dup in remainng_duplicate_dts:
244+
# here we check for collisions with other existing files.
245+
# remote file can have both "Image.jpg" and "Image (1).jpg", which are valid names, skip those.
246+
candidate_path = populate_duplicate_name(dup['normalized_path'], curr_index)
247+
while candidate_path in unique_paths:
248+
curr_index += 1
249+
candidate_path = populate_duplicate_name(dup['normalized_path'], curr_index)
250+
remote_files[dup["asset_id"]]["normalized_unique_path"] = candidate_path
251+
curr_index += 1
252+
253+
return {dt["normalized_unique_path"]: dt for dt in remote_files.values()}
254+
255+
def _local_candidates(self, candidate_path):
256+
filename, extension = path.splitext(candidate_path)
257+
r = re.compile(f"({candidate_path}|{filename} \(\d+\){extension})")
258+
# sort local files by base name (without ext) for accurate results.
259+
return dict(sorted({f: self.local_files[f]["etag"] for f in filter(r.match, self.local_files.keys())}.items(),
260+
key=lambda f: path.splitext(f[0])[0]))
261+
262+
def _print_duplicate_file_names(self):
263+
if (len(self.remote_duplicate_names) > 0):
264+
logger.warning(f"Cloudinary folder '{self.user_friendly_remote_dir}' "
265+
f"contains {len(self.remote_duplicate_names)} duplicate asset names")
266+
for normalized_path, asset_ids in self.remote_duplicate_names.items():
267+
logger.debug(f"Duplicate name: '{normalized_path}', asset ids: {', '.join(asset_ids)}")
268+
192269
def _print_sync_status(self, success, errors):
193270
logger.info("==Sync Status==")
194271
logger.info("===============")

cloudinary_cli/utils/api_utils.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,16 @@
33

44
import requests
55
from click import style, launch
6-
from cloudinary import Search, uploader, api
6+
from cloudinary import Search, SearchFolders, uploader, api
77
from cloudinary.utils import cloudinary_url
88

99
from cloudinary_cli.defaults import logger
1010
from cloudinary_cli.utils.config_utils import is_valid_cloudinary_config
11-
from cloudinary_cli.utils.file_utils import normalize_file_extension, posix_rel_path, get_destination_folder
11+
from cloudinary_cli.utils.file_utils import (normalize_file_extension, posix_rel_path, get_destination_folder,
12+
populate_duplicate_name)
1213
from cloudinary_cli.utils.json_utils import print_json, write_json_to_file
1314
from cloudinary_cli.utils.utils import log_exception, confirm_action, get_command_params, merge_responses, \
14-
normalize_list_params, ConfigurationError, print_api_help
15+
normalize_list_params, ConfigurationError, print_api_help, duplicate_values
1516

1617
PAGINATION_MAX_RESULTS = 500
1718

@@ -34,14 +35,19 @@ def query_cld_folder(folder, folder_mode):
3435
rel_path = _relative_path(asset, folder)
3536
rel_display_path = _relative_display_path(asset, folder)
3637
path_key = rel_display_path if folder_mode == "dynamic" else rel_path
37-
files[normalize_file_extension(path_key)] = {
38+
normalized_path_key = normalize_file_extension(path_key)
39+
files[asset["asset_id"]] = {
40+
"asset_id": asset['asset_id'],
41+
"normalized_path": normalized_path_key,
42+
"normalized_unique_path": normalized_path_key,
3843
"type": asset['type'],
3944
"resource_type": asset['resource_type'],
4045
"public_id": asset['public_id'],
4146
"format": asset['format'],
4247
"etag": asset.get('etag', '0'),
4348
"relative_path": rel_path, # save for inner use
4449
"access_mode": asset.get('access_mode', 'public'),
50+
"created_at": asset.get('created_at'),
4551
# dynamic folder mode fields
4652
"asset_folder": asset.get('asset_folder'),
4753
"display_name": asset.get('display_name'),
@@ -53,6 +59,15 @@ def query_cld_folder(folder, folder_mode):
5359

5460
return files
5561

62+
def cld_folder_exists(folder):
63+
folder = folder.strip('/') # omit redundant leading slash and duplicate trailing slashes in query
64+
65+
if not folder:
66+
return True # root folder
67+
68+
res = SearchFolders().expression(f"name=\"{folder}\"").execute()
69+
70+
return res.get("total_count", 0) > 0
5671

5772
def _display_path(asset):
5873
if asset.get("display_name") is None:
@@ -80,9 +95,9 @@ def regen_derived_version(public_id, delivery_type, res_type,
8095
eager_trans, eager_async,
8196
eager_notification_url):
8297
options = {"type": delivery_type, "resource_type": res_type,
83-
"eager": eager_trans, "eager_async": eager_async,
84-
"eager_notification_url": eager_notification_url,
85-
"overwrite": True, "invalidate": True}
98+
"eager": eager_trans, "eager_async": eager_async,
99+
"eager_notification_url": eager_notification_url,
100+
"overwrite": True, "invalidate": True}
86101
try:
87102
exp_res = uploader.explicit(public_id, **options)
88103
derived_url = f'{exp_res.get("eager")[0].get("secure_url")}'

cloudinary_cli/utils/file_utils.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,19 @@ def normalize_file_extension(filename: str) -> str:
129129

130130
return ".".join([p for p in [filename, extension_alias] if p])
131131

132+
def populate_duplicate_name(filename, index=0):
133+
"""
134+
Adds index to the filename in order to avoid duplicates.
135+
136+
:param filename: The file name to modify.
137+
:param index: The desired index.
138+
:return: Modified file name.
139+
"""
140+
filename, extension = os.path.splitext(filename)
141+
if index != 0:
142+
filename = f"{filename} ({index})"
143+
144+
return ".".join([p for p in [filename, extension[1:]] if p])
132145

133146
def posix_rel_path(end, start) -> str:
134147
"""

cloudinary_cli/utils/utils.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,3 +333,19 @@ def chunker(seq, size):
333333
:return: a single chunk
334334
"""
335335
return (seq[pos:pos + size] for pos in range(0, len(seq), size))
336+
337+
338+
def duplicate_values(items, value_key, key_of_interest=None):
339+
"""
340+
Finds duplicate values in a dictionary of objects.
341+
342+
:param items: All items.
343+
:param value_key: The duplicate value key to search.
344+
:param key_of_interest: The key to add to the resulting list.
345+
:return:
346+
"""
347+
rev_multidict = {}
348+
for key, value in items.items():
349+
rev_multidict.setdefault(value[value_key], set()).add(value[key_of_interest] if key_of_interest is not None else key)
350+
351+
return {key: values for key, values in rev_multidict.items() if len(values) > 1}

test/helper_test.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import cloudinary.api
99
from cloudinary import logger
10+
from cloudinary_cli.utils.api_utils import query_cld_folder
1011
from urllib3 import HTTPResponse, disable_warnings
1112
from urllib3._collections import HTTPHeaderDict
1213

@@ -116,8 +117,14 @@ def retry_func(*args, **kwargs):
116117
return retry_decorator
117118

118119

119-
def delete_cld_folder_if_exists(folder):
120-
cloudinary.api.delete_resources_by_prefix(folder)
120+
def delete_cld_folder_if_exists(folder, folder_mode = "fixed"):
121+
if folder_mode == "fixed":
122+
cloudinary.api.delete_resources_by_prefix(folder)
123+
else:
124+
assets = query_cld_folder(folder, folder_mode)
125+
if (len(assets)):
126+
cloudinary.api.delete_resources([f["public_id"] for f in assets.values()])
127+
121128
try:
122129
cloudinary.api.delete_folder(folder)
123130
except cloudinary.exceptions.NotFound:

test/test_modules/test_cli_sync.py

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from test.helper_test import unique_suffix, RESOURCES_DIR, TEST_FILES_DIR, delete_cld_folder_if_exists, retry_assertion, \
1111
get_request_url, get_params, URLLIB3_REQUEST
1212
from test.test_modules.test_cli_upload_dir import UPLOAD_MOCK_RESPONSE
13+
from cloudinary_cli.utils.api_utils import get_folder_mode
1314

1415

1516
class TestCLISync(unittest.TestCase):
@@ -19,14 +20,19 @@ class TestCLISync(unittest.TestCase):
1920
LOCAL_SYNC_PULL_DIR = str(Path.joinpath(RESOURCES_DIR, unique_suffix("test_sync_pull")))
2021
CLD_SYNC_DIR = unique_suffix("test_sync")
2122

23+
DUPLICATE_NAME = unique_suffix("duplicate_name")
24+
2225
GRACE_PERIOD = 3 # seconds
2326

27+
folder_mode = "fixed"
28+
2429
def setUp(self) -> None:
25-
delete_cld_folder_if_exists(self.CLD_SYNC_DIR)
30+
self.folder_mode = get_folder_mode()
31+
delete_cld_folder_if_exists(self.CLD_SYNC_DIR, self.folder_mode)
2632
time.sleep(1)
2733

2834
def tearDown(self) -> None:
29-
delete_cld_folder_if_exists(self.CLD_SYNC_DIR)
35+
delete_cld_folder_if_exists(self.CLD_SYNC_DIR, self.folder_mode)
3036
time.sleep(1)
3137
shutil.rmtree(self.LOCAL_SYNC_PULL_DIR, ignore_errors=True)
3238

@@ -81,6 +87,14 @@ def test_cli_sync_pull(self):
8187
self.assertIn("Synced | 12", result.output)
8288
self.assertIn("Done!", result.output)
8389

90+
91+
def test_cli_sync_pull_non_existing_folder(self):
92+
non_existing_dir = self.CLD_SYNC_DIR + "non_existing"
93+
result = self.runner.invoke(cli, ['sync', '--pull', self.LOCAL_SYNC_PULL_DIR, non_existing_dir])
94+
95+
self.assertIn(f"error: Cloudinary folder '{non_existing_dir}' does not exist.", result.output)
96+
self.assertIn("Aborting...", result.output)
97+
8498
@retry_assertion
8599
def test_cli_sync_pull_twice(self):
86100
self._upload_sync_files(TEST_FILES_DIR)
@@ -119,8 +133,10 @@ def test_cli_sync_pull_out_of_sync(self):
119133
self.assertIn("Synced | 11", result.output)
120134
self.assertIn("Done!", result.output)
121135

122-
def _upload_sync_files(self, dir):
123-
result = self.runner.invoke(cli, ['sync', '--push', '-F', dir, self.CLD_SYNC_DIR])
136+
def _upload_sync_files(self, dir, optional_params=None):
137+
if optional_params is None:
138+
optional_params = []
139+
result = self.runner.invoke(cli, ['sync', '--push', '-F', dir, self.CLD_SYNC_DIR] + optional_params)
124140

125141
self.assertEqual(0, result.exit_code)
126142
self.assertIn("Synced | 12", result.output)
@@ -137,3 +153,27 @@ def test_sync_override_defaults(self, mocker):
137153

138154
self.assertIn("raw/upload", get_request_url(mocker))
139155
self.assertTrue(get_params(mocker)['unique_filename'])
156+
157+
158+
@unittest.skipUnless(get_folder_mode() == "dynamic", "requires dynamic folder mode")
159+
@retry_assertion
160+
def test_cli_sync_duplicate_file_names_dynamic_folder_mode(self):
161+
self._upload_sync_files(TEST_FILES_DIR, ['-o', 'display_name', self.DUPLICATE_NAME])
162+
163+
# wait for indexing to be updated
164+
time.sleep(self.GRACE_PERIOD)
165+
166+
result = self.runner.invoke(cli, ['sync', '--pull', '-F', self.LOCAL_SYNC_PULL_DIR, self.CLD_SYNC_DIR])
167+
168+
self.assertEqual(0, result.exit_code)
169+
self.assertIn("Found 0 items in local folder", result.output)
170+
self.assertIn("Downloading 12 files", result.output)
171+
for index in range(1, 6):
172+
self.assertIn(f"{self.DUPLICATE_NAME} ({index})", result.output)
173+
self.assertIn("Done!", result.output)
174+
175+
result = self.runner.invoke(cli, ['sync', '--push', '-F', self.LOCAL_SYNC_PULL_DIR, self.CLD_SYNC_DIR])
176+
177+
self.assertEqual(0, result.exit_code)
178+
self.assertIn("Skipping 12 items", result.output)
179+
self.assertIn("Done!", result.output)

0 commit comments

Comments
 (0)