11import json
2+ import os
23from dataclasses import dataclass
4+
35import pandas as pd
4- import os
6+
57
68@dataclass
79class AnonymizationConfig :
810 """
911 Configuration object for the anonymization workflow.
1012
1113 Attributes:
12- data (str):
14+ data (str):
1315 Path to the input dataset. Supported formats include CSV, Excel,
1416 JSON, and SQLite (.db) files.
1517
16- identifiers (list[str]):
18+ identifiers (list[str]):
1719 List of direct identifiers (e.g., name, SSN, phone number).
18-
20+
1921 quasi_identifiers (list[str]):
2022 List of quasi-identifying attributes requiring generalization
2123 (e.g., age, zipcode, occupation).
22-
24+
2325 sensitive_attributes (list[str]):
2426 Attributes considered sensitive (e.g., disease, salary).
2527 If not empty, either l-diversity or t-closeness must be specified.
2628
2729 insensitive_attributes (list[str]):
2830 Attributes that are neither identifiers nor sensitive and are carried through unchanged.
2931
30-
31- hierarchies (dict[str, str]):
32+
33+ hierarchies (dict[str, str]):
3234 Mapping from quasi-identifiers to CSV hierarchy files.
3335
3436 k (int, optional):
@@ -50,6 +52,7 @@ class AnonymizationConfig:
5052 Anonymization backend to use, either 'arx' or 'anjana'.
5153 Defaults to 'arx'.
5254 """
55+
5356 data : str
5457 identifiers : list [str ]
5558 quasi_identifiers : list [str ]
@@ -70,7 +73,11 @@ def from_json(cls, json_path: str):
7073 with open (json_path , "r" ) as file :
7174 config_json = json .load (file )
7275
73- attributes = {key : config_json [key ] for key in cls .__annotations__ if key in config_json }
76+ attributes = {
77+ key : config_json [key ]
78+ for key in cls .__annotations__
79+ if key in config_json
80+ }
7481 return cls (** attributes )
7582
7683 def _validate (self ) -> None :
@@ -93,7 +100,7 @@ def _validate(self) -> None:
93100 def _validate_parameters (self ) -> None :
94101 """
95102 Validates the anonymization parameters.
96-
103+
97104 Checks:
98105 - k is a positive integer if provided
99106 - l is a positive integer if provided
@@ -109,38 +116,46 @@ def _validate_parameters(self) -> None:
109116 # --- Checks if k is correct ---
110117 if self .k is not None :
111118 if not isinstance (self .k , int ):
112- raise TypeError (f"k must be an integer, but got { self .k !r} instead" )
113-
119+ raise TypeError (
120+ f"k must be an integer, but got { self .k !r} instead"
121+ )
122+
114123 if self .k <= 0 :
115124 raise ValueError (
116125 f"k must be positive, but got { self .k !r} instead"
117126 )
118-
127+
119128 # --- Checks if l is correct ---
120129 if self .l is not None :
121130 if not isinstance (self .l , int ):
122- raise TypeError (f"l must be an integer, but got { self .l !r} instead" )
123-
131+ raise TypeError (
132+ f"l must be an integer, but got { self .l !r} instead"
133+ )
134+
124135 if self .l <= 0 :
125136 raise ValueError (
126137 f"l must be positive, but got { self .l !r} instead"
127138 )
128-
139+
129140 # --- Checks if t is correct ---
130141 if self .t is not None :
131142 if not isinstance (self .t , (float , int )):
132- raise TypeError (f"t must be a float, but got { self .t !r} instead" )
133-
143+ raise TypeError (
144+ f"t must be a float, but got { self .t !r} instead"
145+ )
146+
134147 if not 0.0 <= self .t <= 1.0 :
135148 raise ValueError (
136149 f"t must be in [0,1], but got { self .t !r} instead"
137150 )
138-
151+
139152 # --- Checks if the suppression limit is correct ---
140153 if self .suppression_limit is not None :
141154 if not isinstance (self .suppression_limit , int ):
142- raise TypeError (f"suppression_limit must be an integer, but got { self .suppression_limit !r} instead" )
143-
155+ raise TypeError (
156+ f"suppression_limit must be an integer, but got { self .suppression_limit !r} instead"
157+ )
158+
144159 if not 0 <= self .suppression_limit <= 100 :
145160 raise ValueError (
146161 f"t must be in [0,100], but got { self .suppression_limit !r} instead"
@@ -150,13 +165,13 @@ def _validate_parameters(self) -> None:
150165 if not isinstance (self .backend , str ):
151166 raise TypeError (
152167 f"backed must be a string, but got { self .backend !r} instead!"
153- )
154-
168+ )
169+
155170 if self .backend not in ["arx" , "anjana" ]:
156171 raise ValueError (
157172 f"The backend must be either 'arx' or 'anjana', but got { self .backend !r} instead!"
158173 )
159-
174+
160175 def _validate_attributes (self ) -> None :
161176 """
162177 Validates all the attribute lists.
@@ -175,7 +190,7 @@ def _validate_attributes(self) -> None:
175190 "identifiers" : self .identifiers ,
176191 "quasi_identifiers" : self .quasi_identifiers ,
177192 "sensitive_attributes" : self .sensitive_attributes ,
178- "insensitive_attributes" : self .insensitive_attributes
193+ "insensitive_attributes" : self .insensitive_attributes ,
179194 }
180195
181196 # Checks that the attributes are provided using lists.
@@ -185,10 +200,8 @@ def _validate_attributes(self) -> None:
185200 f"{ name } must be a list, but got { attrs !r} instead!"
186201 )
187202 if not all (isinstance (x , str ) for x in attrs ):
188- raise TypeError (
189- f"All entries in { name } must be strings!"
190- )
191-
203+ raise TypeError (f"All entries in { name } must be strings!" )
204+
192205 # --- Checks that the attribute names do not overlap.
193206 all_attrs = sum (attr_list .values (), [])
194207 if len (all_attrs ) != len (set (all_attrs )):
@@ -208,19 +221,19 @@ def _validate_dataset(self) -> None:
208221 TypeError: If the dataset path is not a string.
209222 FileNotFoundError: If the file does not exist at the given path.
210223 """
211-
224+
212225 # --- Checks that the dataset path is a string ---
213226 if not isinstance (self .data , str ):
214227 raise TypeError (
215228 f"The dataset path must be provided as a string, but got { self .data !r} instead!"
216229 )
217-
230+
218231 # --- Checks that the dataset file exists.
219232 if not os .path .exists (self .data ):
220233 raise FileNotFoundError (
221234 f"The dataset could not be located at { self .data !r} !"
222235 )
223-
236+
224237 def _validate_hierarchies (self ) -> None :
225238 """
226239 Validates the hierarchies provided for the quasi-identifiers.
@@ -251,19 +264,19 @@ def _validate_hierarchies(self) -> None:
251264 raise TypeError (
252265 f"Hierarchy quasi-identifier keys must be strings, but got { qid !r} instead!"
253266 )
254-
267+
255268 # --- Checks that the quasi-identifier exists ---
256269 if qid not in self .quasi_identifiers :
257270 raise TypeError (
258271 f"Cannot create hierarchy for { qid !r} , since it is not a quasi-identifier!"
259272 )
260-
273+
261274 # --- Checks that the hierarchy path is a string ---
262275 if not isinstance (hierarchy_path , str ):
263276 raise TypeError (
264277 f"The hierarchy path for { qid !r} must be a string, but got { hierarchy_path !r} instead!"
265278 )
266-
279+
267280 # --- Checks that the hierarchy path exists.
268281 if not os .path .exists (hierarchy_path ):
269282 raise FileNotFoundError (
@@ -276,11 +289,11 @@ def _validate_privacy_models(self) -> None:
276289 If sensitive attributes are present, requires that either:
277290 - l-diversity ('l') is specified, or
278291 - t-closeness ('t') is specified
279-
292+
280293 Raises:
281294 ValueError: If sensitive attributes exist but neither 'l' nor 't' is provided.
282295 """
283296 if self .sensitive_attributes and self .t is None and self .l is None :
284297 raise ValueError (
285298 f"sensitive-attributes={ self .sensitive_attributes } , l-Diversity or t-Closeness must be used when anonymizing with sensitive attributes!"
286- )
299+ )
0 commit comments