@@ -211,14 +211,33 @@ def _validate_pii(column_name, **kwargs):
211211
212212 def __init__ (self ):
213213 self .columns = {}
214- self .primary_key = None
214+ self ._primary_key = None
215215 self .alternate_keys = []
216216 self .sequence_key = None
217217 self .sequence_index = None
218218 self .column_relationships = []
219219 self ._version = self .METADATA_SPEC_VERSION
220220 self ._updated = False
221221
222+ @property
223+ def _primary_key_is_composite (self ):
224+ if self .primary_key and isinstance (self .primary_key , list ) and len (self .primary_key ) > 1 :
225+ return True
226+
227+ return False
228+
@property
def primary_key(self):
    """Return the primary key, unwrapping a one-element list.

    A list holding exactly one column name is returned as that bare
    column name; every other stored value is returned unchanged.
    """
    stored_key = self._primary_key
    is_singleton_list = isinstance(stored_key, list) and len(stored_key) == 1
    return stored_key[0] if is_singleton_list else stored_key


@primary_key.setter
def primary_key(self, primary_key):
    """Store the given primary key as-is; no validation happens here."""
    self._primary_key = primary_key
240+
222241 def _get_unexpected_kwargs (self , sdtype , ** kwargs ):
223242 expected_kwargs = self ._SDTYPE_KWARGS .get (sdtype , ['pii' ])
224243 unexpected_kwargs = set (kwargs ) - set (expected_kwargs )
@@ -774,29 +793,40 @@ def detect_from_csv(self, filepath, read_csv_parameters=None):
774793 self .detect_from_dataframe (data )
775794
776795 @staticmethod
777- def _validate_key_datatype (column_name ):
796+ def _validate_key_datatype (column_name , key_type ):
778797 """Check whether column_name is a string."""
779- return isinstance (column_name , str )
798+ is_string = isinstance (column_name , str )
799+ is_list_of_strings = isinstance (column_name , list ) and all (
800+ isinstance (i , str ) for i in column_name
801+ )
802+ return is_string or (key_type == 'primary' and is_list_of_strings )
780803
def _validate_keys_sdtype(self, keys, key_type):
    """Validate that every key has a column of type 'id' or a Faker function.

    Each entry of ``keys`` is passed through ``_cast_to_iterable`` and is
    accepted as soon as one of the resulting columns has sdtype ``'id'``
    or an sdtype for which ``is_faker_function`` returns true.

    Args:
        keys (iterable):
            The keys to check.
        key_type (str):
            Name of the key kind, used only to build the error message.

    Raises:
        InvalidMetadataError:
            If at least one key has no column with an accepted sdtype.
    """

    def _has_accepted_column(key):
        for column in _cast_to_iterable(key):
            sdtype = self.columns[column]['sdtype']
            if sdtype == 'id' or is_faker_function(sdtype):
                return True

        return False

    bad_keys = [key for key in keys if not _has_accepted_column(key)]
    if not bad_keys:
        return

    raise InvalidMetadataError(
        f'The {key_type}_keys {bad_keys} must have a column of '
        "type 'id' or another PII type."
    )
794820
795821 def _validate_key (self , column_name , key_type ):
796822 """Validate the primary and sequence keys."""
797823 if column_name is not None :
798- if not self ._validate_key_datatype (column_name ):
799- raise InvalidMetadataError (f"'{ key_type } _key' must be a string." )
824+ if not self ._validate_key_datatype (column_name , key_type ):
825+ err_msg = f"'{ key_type } _key' must be a string"
826+ if key_type == 'primary' :
827+ err_msg += ' or a list of strings'
828+
829+ raise InvalidMetadataError (err_msg + '.' )
800830
801831 keys = {column_name } if isinstance (column_name , str ) else set (column_name )
802832 setting_sequence_as_primary = key_type == 'primary' and column_name == self .sequence_key
@@ -814,7 +844,7 @@ def _validate_key(self, column_name, key_type):
814844 ' Keys should be columns that exist in the table.'
815845 )
816846
817- self ._validate_keys_sdtype (keys , key_type )
847+ self ._validate_keys_sdtype ([ column_name ] , key_type )
818848
819849 def set_primary_key (self , column_name ):
820850 """Set the metadata primary key.
@@ -866,7 +896,8 @@ def set_sequence_key(self, column_name):
866896
867897 def _validate_alternate_keys (self , column_names ):
868898 if not isinstance (column_names , list ) or not all (
869- self ._validate_key_datatype (column_name ) for column_name in column_names
899+ self ._validate_key_datatype (column_name , 'alternate_keys' )
900+ for column_name in column_names
870901 ):
871902 raise InvalidMetadataError ("'alternate_keys' must be a list of strings." )
872903
@@ -1158,7 +1189,10 @@ def _get_primary_and_alternate_keys(self):
11581189 """
11591190 keys = set (self .alternate_keys )
11601191 if self .primary_key :
1161- keys .update ({self .primary_key })
1192+ primary_key = (
1193+ tuple (self .primary_key ) if isinstance (self .primary_key , list ) else self .primary_key
1194+ )
1195+ keys .add (primary_key )
11621196
11631197 return keys
11641198
@@ -1181,31 +1215,45 @@ def _validate_keys_dont_have_missing_values(self, data):
11811215 errors = []
11821216 keys = self ._get_primary_and_alternate_keys ()
11831217 keys .update (self ._get_set_of_sequence_keys ())
1184- for key in sorted (keys ):
1185- if pd .isna (data [key ]).any ():
1186- errors .append (f"Key column '{ key } ' contains missing values." )
1218+ for key in sorted (keys , key = lambda key : key if isinstance (key , str ) else key [0 ]):
1219+ key_list = [key ] if isinstance (key , str ) else list (key )
1220+ if pd .isna (data [key_list ]).all (axis = 1 ).any ():
1221+ key = f"'{ key } '" if isinstance (key , str ) else f'{ key } '
1222+ errors .append (f'Key column { key } contains missing values.' )
11871223
11881224 return errors
11891225
11901226 def _validate_key_values_are_unique (self , data ):
11911227 errors = []
11921228 keys = self ._get_primary_and_alternate_keys ()
1193- for key in sorted (keys ):
1194- repeated_values = set (data [key ][data [key ].duplicated ()])
1195- if repeated_values :
1196- repeated_values = _format_invalid_values_string (repeated_values , 3 )
1197- errors .append (f"Key column '{ key } ' contains repeating values: " + repeated_values )
1229+ for key in sorted (keys , key = lambda key : key if isinstance (key , str ) else key [0 ]):
1230+ key_list = [key ] if isinstance (key , str ) else list (key )
1231+ repeated_values = data [key_list ][data [key_list ].duplicated ()]
1232+ if not repeated_values .empty :
1233+ if len (repeated_values .columns ) == 1 :
1234+ repeated_values = ' ' + _format_invalid_values_string (
1235+ set (repeated_values [key ]), 3
1236+ )
1237+ else :
1238+ repeated_values = '\n ' + _format_invalid_values_string (
1239+ repeated_values .drop_duplicates (), 3
1240+ )
1241+
1242+ key = f"'{ key } '" if isinstance (key , str ) else f'{ key } '
1243+ errors .append (f'Key column { key } contains repeating values:' + repeated_values )
11981244
11991245 return errors
12001246
12011247 def _validate_primary_key (self , data ):
12021248 error = []
1203- is_int = self .primary_key and pd .api .types .is_integer_dtype (data [self .primary_key ])
1204- regex = self .columns .get (self .primary_key , {}).get ('regex_format' )
1205- if is_int and regex :
1206- possible_characters = get_possible_chars (regex , 1 )
1207- if '0' in possible_characters :
1208- error .append (f'Primary key "{ self .primary_key } " { INT_REGEX_ZERO_ERROR_MESSAGE } ' )
1249+ primary_key_list = _cast_to_iterable (self .primary_key ) if self .primary_key else []
1250+ for key in primary_key_list :
1251+ is_int = pd .api .types .is_integer_dtype (data [key ])
1252+ regex = self .columns .get (key , {}).get ('regex_format' )
1253+ if is_int and regex :
1254+ possible_characters = get_possible_chars (regex , 1 )
1255+ if '0' in possible_characters :
1256+ error .append (f'Primary key column "{ key } " { INT_REGEX_ZERO_ERROR_MESSAGE } ' )
12091257
12101258 return error
12111259
0 commit comments