AI-team-UoA
diff --git a/src/anonymization_manager/__init__.py b/src/anonymization_manager/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/anonymization_manager/adapters/arx/arx.py b/src/anonymization_manager/adapters/arx/arx.py
Lines changed: 5 additions & 7 deletions
diff --git a/src/anonymization_manager/config.py b/src/anonymization_manager/config.py
Lines changed: 49 additions & 36 deletions
diff --git a/tests/common.py b/tests/common.py
Lines changed: 14 additions & 12 deletions
diff --git a/tests/test_anonymization/test_k_anonymity.py b/tests/test_anonymization/test_k_anonymity.py
Lines changed: 35 additions & 38 deletions
@@ -1,4 +1,4 @@
-from .core import AnonymizationManager
 from .config import AnonymizationConfig
+from .core import AnonymizationManager
 
 __all__ = ["AnonymizationManager", "AnonymizationConfig"]
@@ -41,7 +41,7 @@ def _data_handle_to_dataframe(data_handle: JClass) -> pd.DataFrame:
         Args:
             data_handle (jpype._jclass.org.deidentifier.arx.DataHandle):
                 The ARX DataHandle Object.
-        
+
         Returns:
             pd.DataFrame: The dataset as a pandas DataFrame.
         """
@@ -314,7 +314,7 @@ def get_non_uniform_entropy_metric(self, attribute: str) -> float:
     def get_generalization_intensity_metric(self, attribute: str) -> float:
         """
         Returns the generalization intensity metric for a specific attribute.
-        
+
         Args:
             attribute (str): The attribute name.
 
@@ -361,9 +361,7 @@ def _load_arx_library(cls) -> None:
         libarx = os.path.join(os.path.dirname(__file__), "libarx-3.9.2.jar")
 
         if not os.path.exists(libarx):
-            raise FileNotFoundError(
-                f"Could not locate libarx at {libarx}"
-            )
+            raise FileNotFoundError(f"Could not locate libarx at {libarx}")
 
         if not jpype.isJVMStarted():
             jpype.startJVM(classpath=[libarx])
@@ -374,7 +372,7 @@ def _define_attribute_types(
     ) -> None:
         """
         Sets the attribute types for identifiers, quasi-identifiers, and sensitive/insensitive attributes.
-        
+
         Args:
             data (JClass): The ARX Data object.
             config (AnonymizationConfig): The anonymization configuration.
@@ -433,7 +431,7 @@ def _create_arx_configuration(cls, config: AnonymizationConfig) -> JClass:
 
         Args:
             config (AnonymizationConfig): The anonymization configuration.
-        
+
         Returns:
             JClass: The ARXConfiguration object ready for the anonymization.
         """
 
@@ -1,34 +1,36 @@
 import json
+import os
 from dataclasses import dataclass
+
 import pandas as pd
-import os
+
 
 @dataclass
 class AnonymizationConfig:
     """
     Configuration object for the anonymization workflow.
 
     Attributes:
-        data (str): 
+        data (str):
             Path to the input dataset. Supported formats include CSV, Excel,
             JSON, and SQLite (.db) files.
 
-        identifiers (list[str]): 
+        identifiers (list[str]):
             List of direct identifiers (e.g., name, SSN, phone number).
-        
+
         quasi_identifiers (list[str]):
             List of quasi-identifying attributes requiring generalization
             (e.g., age, zipcode, occupation)
-        
+
         sensitive_attributes (list[str]):
             Attributes considered sensitive (e.g., disease, salary)
             If not empty, either l-diversity or t-closeness must be specified.
 
         insensitive_attributes (list[str]):
             Attributes that are neither identifiers nor sensitive and are carried through unchanged.
 
-        
-        hierarchies (dict[str, str]): 
+
+        hierarchies (dict[str, str]):
             Mapping from quasi-identifiers to CSV hierarchy files.
 
         k (int, optional):
@@ -50,6 +52,7 @@ class AnonymizationConfig:
             Anonymization backend to use, either 'arx' or 'anjana'.
             Defaults to 'arx'
     """
+
     data: str
     identifiers: list[str]
     quasi_identifiers: list[str]
@@ -70,7 +73,11 @@ def from_json(cls, json_path: str):
         with open(json_path, "r") as file:
             config_json = json.load(file)
 
-        attributes = {key: config_json[key] for key in cls.__annotations__ if key in config_json}
+        attributes = {
+            key: config_json[key]
+            for key in cls.__annotations__
+            if key in config_json
+        }
         return cls(**attributes)
 
     def _validate(self) -> None:
@@ -93,7 +100,7 @@ def _validate(self) -> None:
     def _validate_parameters(self) -> None:
         """
         Validates the anonymization parameters.
-        
+
         Checks:
             - k is a positive integer if provided
             - l is a positive integer if provided
@@ -109,38 +116,46 @@ def _validate_parameters(self) -> None:
         # --- Checks if k is correct ---
         if self.k is not None:
             if not isinstance(self.k, int):
-                raise TypeError(f"k must be an integer, but got {self.k!r} instead")
-            
+                raise TypeError(
+                    f"k must be an integer, but got {self.k!r} instead"
+                )
+
             if self.k <= 0:
                 raise ValueError(
                     f"k must be positive, but got {self.k!r} instead"
                 )
-        
+
         # --- Checks if l is correct ---
         if self.l is not None:
             if not isinstance(self.l, int):
-                raise TypeError(f"l must be an integer, but got {self.l!r} instead")
-            
+                raise TypeError(
+                    f"l must be an integer, but got {self.l!r} instead"
+                )
+
             if self.l <= 0:
                 raise ValueError(
                     f"l must be positive, but got {self.l!r} instead"
                 )
-        
+
         # --- Checks if t is correct ---
         if self.t is not None:
             if not isinstance(self.t, (float, int)):
-                raise TypeError(f"t must be a float, but got {self.t!r} instead")
-            
+                raise TypeError(
+                    f"t must be a float, but got {self.t!r} instead"
+                )
+
             if not 0.0 <= self.t <= 1.0:
                 raise ValueError(
                     f"t must be in [0,1], but got {self.t!r} instead"
                 )
-        
+
         # --- Checks if the suppression limit is correct ---
         if self.suppression_limit is not None:
             if not isinstance(self.suppression_limit, int):
-                raise TypeError(f"suppression_limit must be an integer, but got {self.suppression_limit!r} instead")
-            
+                raise TypeError(
+                    f"suppression_limit must be an integer, but got {self.suppression_limit!r} instead"
+                )
+
             if not 0 <= self.suppression_limit <= 100:
                 raise ValueError(
                     f"t must be in [0,100], but got {self.suppression_limit!r} instead"
@@ -150,13 +165,13 @@ def _validate_parameters(self) -> None:
         if not isinstance(self.backend, str):
             raise TypeError(
                 f"backed must be a string, but got {self.backend!r} instead!"
-            )  
-          
+            )
+
         if self.backend not in ["arx", "anjana"]:
             raise ValueError(
                 f"The backend must be either 'arx' or 'anjana', but got {self.backend!r} instead!"
             )
-    
+
     def _validate_attributes(self) -> None:
         """
         Validates all the attribute lists.
@@ -175,7 +190,7 @@ def _validate_attributes(self) -> None:
             "identifiers": self.identifiers,
             "quasi_identifiers": self.quasi_identifiers,
             "sensitive_attributes": self.sensitive_attributes,
-            "insensitive_attributes": self.insensitive_attributes
+            "insensitive_attributes": self.insensitive_attributes,
         }
 
         # Checks that the attributes are provided using lists.
@@ -185,10 +200,8 @@ def _validate_attributes(self) -> None:
                     f"{name} must be a list, but got {attrs!r} instead!"
                 )
             if not all(isinstance(x, str) for x in attrs):
-                raise TypeError(
-                    f"All entries in {name} must be strings!"
-                )
-        
+                raise TypeError(f"All entries in {name} must be strings!")
+
         # --- Checks that the attribute names do not overlap.
         all_attrs = sum(attr_list.values(), [])
         if len(all_attrs) != len(set(all_attrs)):
@@ -208,19 +221,19 @@ def _validate_dataset(self) -> None:
             TypeError: If the dataset path is not a string.
             FileNotFoundError: If the file does not exist at the given path.
         """
-        
+
         # --- Checks that the dataset path is a string ---
         if not isinstance(self.data, str):
             raise TypeError(
                 f"The dataset path must be provided as a string, but got {self.data!r} instead!"
             )
-        
+
         # --- Checks that the dataset file exists.
         if not os.path.exists(self.data):
             raise FileNotFoundError(
                 f"The dataset could not be located at {self.data!r}!"
             )
-    
+
     def _validate_hierarchies(self) -> None:
         """
         Validates the hierarchies provided for the quasi-identifiers.
@@ -251,19 +264,19 @@ def _validate_hierarchies(self) -> None:
                 raise TypeError(
                     f"Hierarchy quasi-identifier keys must be strings, but got {qid!r} instead!"
                 )
-            
+
             # --- Checks that the quasi-identifier exists ---
             if qid not in self.quasi_identifiers:
                 raise TypeError(
                     f"Cannot create hierarchy for {qid!r}, since it is not a quasi-identifier!"
                 )
-            
+
             # --- Checks that the hierarchy path is a string ---
             if not isinstance(hierarchy_path, str):
                 raise TypeError(
                     f"The hierarchy path for {qid!r} must be a string, but got {hierarchy_path!r} instead!"
                 )
-            
+
             # --- Checks that the hierarchy path exists.
             if not os.path.exists(hierarchy_path):
                 raise FileNotFoundError(
@@ -276,11 +289,11 @@ def _validate_privacy_models(self) -> None:
         If sensitive attributes are present, requires that either:
             - l-diversity ('l') is specified, or
             - t-closeness ('t') is specified
-        
+
         Raises:
             ValueError: If sensitive attributes exist but neither 'l' nor 't' is provided.
         """
         if self.sensitive_attributes and self.t is None and self.l is None:
             raise ValueError(
                 f"sensitive-attributes={self.sensitive_attributes}, l-Diversity or t-Closeness must be used when anonymizing with sensitive attributes!"
-            )
+            )
@@ -1,16 +1,18 @@
-import pytest 
 import contextlib
-from anonymization_manager import *
 from pathlib import Path
 
+import pytest
+
+from anonymization_manager import *
+
 TEST_DIR = Path(__file__).parent
-PATH=str(TEST_DIR/"test_dataset/data/adult.csv")
-HIERARCHY_PATH=TEST_DIR/"test_dataset/hierarchies"
-AGE_PATH=str(HIERARCHY_PATH/"age.csv")
-COUNTRY_PATH=str(HIERARCHY_PATH/"country.csv")
-RACE_PATH=str(HIERARCHY_PATH/"race.csv")
-SEX_PATH=str(HIERARCHY_PATH/"sex.csv")
-MARITAL_PATH=str(HIERARCHY_PATH/"marital.csv")
-OCCUPATION_PATH=str(HIERARCHY_PATH/"occupation.csv")
-WORK_CLASS_PATH=str(HIERARCHY_PATH/"workclass.csv")
-EDUCATION_PATH=str(HIERARCHY_PATH/"education.csv")
+PATH = str(TEST_DIR / "test_dataset/data/adult.csv")
+HIERARCHY_PATH = TEST_DIR / "test_dataset/hierarchies"
+AGE_PATH = str(HIERARCHY_PATH / "age.csv")
+COUNTRY_PATH = str(HIERARCHY_PATH / "country.csv")
+RACE_PATH = str(HIERARCHY_PATH / "race.csv")
+SEX_PATH = str(HIERARCHY_PATH / "sex.csv")
+MARITAL_PATH = str(HIERARCHY_PATH / "marital.csv")
+OCCUPATION_PATH = str(HIERARCHY_PATH / "occupation.csv")
+WORK_CLASS_PATH = str(HIERARCHY_PATH / "workclass.csv")
+EDUCATION_PATH = str(HIERARCHY_PATH / "education.csv")
@@ -1,45 +1,42 @@
 from tests.common import *
 
+
 class TestKAnonymity:
-    @pytest.mark.parametrize("k", [
-        (1),
-        (10),
-        (40)                   
-    ])
+    @pytest.mark.parametrize("k", [(1), (10), (40)])
     def test_k_anonymity(self, k) -> None:
         for backend in ["arx", "anjana"]:
-                config = AnonymizationConfig(
-                    data=PATH,
-                    identifiers=["education-num"],
-                    quasi_identifiers=[
-                        "age",
-                        "native-country",
-                        "race",
-                        "sex",
-                        "marital-status",
-                        "occupation",
-                        "workclass",
-                        "education",
-                    ],
-                    sensitive_attributes=[],
-                    insensitive_attributes=[],
-                    hierarchies={
-                        "age": AGE_PATH,
-                        "native-country": COUNTRY_PATH,
-                        "race": RACE_PATH,
-                        "sex": SEX_PATH,
-                        "marital-status": MARITAL_PATH,
-                        "occupation": OCCUPATION_PATH,
-                        "workclass": WORK_CLASS_PATH,
-                        "education": EDUCATION_PATH,
-                    },
-                    k=k,
-                    backend=backend,
-                )
+            config = AnonymizationConfig(
+                data=PATH,
+                identifiers=["education-num"],
+                quasi_identifiers=[
+                    "age",
+                    "native-country",
+                    "race",
+                    "sex",
+                    "marital-status",
+                    "occupation",
+                    "workclass",
+                    "education",
+                ],
+                sensitive_attributes=[],
+                insensitive_attributes=[],
+                hierarchies={
+                    "age": AGE_PATH,
+                    "native-country": COUNTRY_PATH,
+                    "race": RACE_PATH,
+                    "sex": SEX_PATH,
+                    "marital-status": MARITAL_PATH,
+                    "occupation": OCCUPATION_PATH,
+                    "workclass": WORK_CLASS_PATH,
+                    "education": EDUCATION_PATH,
+                },
+                k=k,
+                backend=backend,
+            )
 
-                data = AnonymizationManager.anonymize(config)
-                df = data.get_anonymized_data_as_dataframe()
+            data = AnonymizationManager.anonymize(config)
+            df = data.get_anonymized_data_as_dataframe()
 
-                # Checks k-anonymity.
-                group_sizes = df.groupby(config.quasi_identifiers).size()
-                assert (group_sizes >= k).all()
+            # Checks k-anonymity.
+            group_sizes = df.groupby(config.quasi_identifiers).size()
+            assert (group_sizes >= k).all()