5252from kili .utils .tqdm import tqdm
5353
5454FILTER_EXISTING_BATCH_SIZE = 1000  # Chunk size used when splitting the list of external ids into batches.
55+ MAX_ASSET_EXTERNAL_IDS_TO_LOG = 20  # Warnings list the offending external ids only when there are at most this many.
5556
5657if TYPE_CHECKING :
5758 from kili .client import Kili
@@ -588,10 +589,34 @@ def filter_duplicate_external_ids(self, assets):
588589 assets_external_ids = [
589590 asset .get ("external_id" ) for asset in assets if asset .get ("external_id" )
590591 ]
591- # split assets_external_ids into chunks of 1000
592+ unique_externals_ids = []
593+ duplicated_externals_ids = []
594+
595+ for external_id in assets_external_ids :
596+ if assets_external_ids .count (external_id ) == 1 :
597+ unique_externals_ids .append (external_id )
598+ elif external_id not in duplicated_externals_ids :
599+ duplicated_externals_ids .append (external_id )
600+
601+ nb_duplicated_externals_ids = len (duplicated_externals_ids )
602+
603+ if nb_duplicated_externals_ids > 0 :
604+ if nb_duplicated_externals_ids <= MAX_ASSET_EXTERNAL_IDS_TO_LOG :
605+ warnings .warn (
606+ "The following input assets have been ignored because of duplicated external_id: "
607+ + ", " .join (duplicated_externals_ids ),
608+ stacklevel = 2 ,
609+ )
610+ else :
611+ warnings .warn (
612+ "Some input assets have been ignored because of duplicated external_id" ,
613+ stacklevel = 2 ,
614+ )
615+
616+ # split unique_externals_ids into chunks of 1000
592617 assets_external_ids_chunks = [
593- assets_external_ids [x : x + FILTER_EXISTING_BATCH_SIZE ]
594- for x in range (0 , len (assets_external_ids ), FILTER_EXISTING_BATCH_SIZE )
618+ unique_externals_ids [x : x + FILTER_EXISTING_BATCH_SIZE ]
619+ for x in range (0 , len (unique_externals_ids ), FILTER_EXISTING_BATCH_SIZE )
595620 ]
596621 external_ids_in_project = []
597622 for assets_external_ids_chunk in assets_external_ids_chunks :
@@ -606,13 +631,23 @@ def filter_duplicate_external_ids(self, assets):
606631 )
607632 nb_duplicate_assets = len (external_ids_in_project )
608633 if nb_duplicate_assets > 0 :
609- warnings .warn (
610- f"{ nb_duplicate_assets } assets were not imported because their external_id are"
611- " already in the project" ,
612- stacklevel = 2 ,
613- )
634+ if nb_duplicate_assets <= MAX_ASSET_EXTERNAL_IDS_TO_LOG :
635+ warnings .warn (
636+ "The following assets were not imported because their external_id are"
637+ f" already in the project: { ', ' .join (external_ids_in_project )} " ,
638+ stacklevel = 2 ,
639+ )
640+ else :
641+ warnings .warn (
642+ f"{ nb_duplicate_assets } assets were not imported because their external_id are"
643+ " already in the project" ,
644+ stacklevel = 2 ,
645+ )
614646 filtered_assets = [
615- asset for asset in assets if asset .get ("external_id" ) not in external_ids_in_project
647+ asset
648+ for asset in assets
649+ if asset .get ("external_id" ) not in duplicated_externals_ids
650+ and asset .get ("external_id" ) not in external_ids_in_project
616651 ]
617652 return filtered_assets
618653
0 commit comments