Skip to content

Commit 11eb9c0

Browse files
committed
tests formatting
1 parent 739f6ee commit 11eb9c0

File tree

11 files changed

+393
-367
lines changed

11 files changed

+393
-367
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from .core import AnonymizationManager
21
from .config import AnonymizationConfig
2+
from .core import AnonymizationManager
33

44
__all__ = ["AnonymizationManager", "AnonymizationConfig"]

src/anonymization_manager/adapters/arx/arx.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def _data_handle_to_dataframe(data_handle: JClass) -> pd.DataFrame:
4141
Args:
4242
data_handle (jpype._jclass.org.deidentifier.arx.DataHandle):
4343
The ARX DataHandle Object.
44-
44+
4545
Returns:
4646
pd.DataFrame: The dataset as a pandas DataFrame.
4747
"""
@@ -314,7 +314,7 @@ def get_non_uniform_entropy_metric(self, attribute: str) -> float:
314314
def get_generalization_intensity_metric(self, attribute: str) -> float:
315315
"""
316316
Returns the generalization intensity metric for a specific attribute.
317-
317+
318318
Args:
319319
attribute (str): The attribute name.
320320
@@ -361,9 +361,7 @@ def _load_arx_library(cls) -> None:
361361
libarx = os.path.join(os.path.dirname(__file__), "libarx-3.9.2.jar")
362362

363363
if not os.path.exists(libarx):
364-
raise FileNotFoundError(
365-
f"Could not locate libarx at {libarx}"
366-
)
364+
raise FileNotFoundError(f"Could not locate libarx at {libarx}")
367365

368366
if not jpype.isJVMStarted():
369367
jpype.startJVM(classpath=[libarx])
@@ -374,7 +372,7 @@ def _define_attribute_types(
374372
) -> None:
375373
"""
376374
Sets the attribute types for identifiers, quasi-identifiers, and sensitive/insensitive attributes.
377-
375+
378376
Args:
379377
data (JClass): The ARX Data object.
380378
config (AnonymizationConfig): The anonymization configuration.
@@ -433,7 +431,7 @@ def _create_arx_configuration(cls, config: AnonymizationConfig) -> JClass:
433431
434432
Args:
435433
config (AnonymizationConfig): The anonymization configuration.
436-
434+
437435
Returns:
438436
JClass: The ARXConfiguration object ready for the anonymization.
439437
"""

src/anonymization_manager/config.py

Lines changed: 49 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,36 @@
11
import json
2+
import os
23
from dataclasses import dataclass
4+
35
import pandas as pd
4-
import os
6+
57

68
@dataclass
79
class AnonymizationConfig:
810
"""
911
Configuration object for the anonymization workflow.
1012
1113
Attributes:
12-
data (str):
14+
data (str):
1315
Path to the input dataset. Supported formats include CSV, Excel,
1416
JSON, and SQLite (.db) files.
1517
16-
identifiers (list[str]):
18+
identifiers (list[str]):
1719
List of direct identifiers (e.g., name, SSN, phone number).
18-
20+
1921
quasi_identifiers (list[str]):
2022
List of quasi-identifying attributes requiring generalization
2123
(e.g., age, zipcode, occupation).
22-
24+
2325
sensitive_attributes (list[str]):
2426
Attributes considered sensitive (e.g., disease, salary)
2527
If not empty, either l-diversity or t-closeness must be specified.
2628
2729
insensitive_attributes (list[str]):
2830
Attributes that are neither identifiers nor sensitive and are carried through unchanged.
2931
30-
31-
hierarchies (dict[str, str]):
32+
33+
hierarchies (dict[str, str]):
3234
Mapping from quasi-identifiers to CSV hierarchy files.
3335
3436
k (int, optional):
@@ -50,6 +52,7 @@ class AnonymizationConfig:
5052
Anonymization backend to use, either 'arx' or 'anjana'.
5153
Defaults to 'arx'.
5254
"""
55+
5356
data: str
5457
identifiers: list[str]
5558
quasi_identifiers: list[str]
@@ -70,7 +73,11 @@ def from_json(cls, json_path: str):
7073
with open(json_path, "r") as file:
7174
config_json = json.load(file)
7275

73-
attributes = {key: config_json[key] for key in cls.__annotations__ if key in config_json}
76+
attributes = {
77+
key: config_json[key]
78+
for key in cls.__annotations__
79+
if key in config_json
80+
}
7481
return cls(**attributes)
7582

7683
def _validate(self) -> None:
@@ -93,7 +100,7 @@ def _validate(self) -> None:
93100
def _validate_parameters(self) -> None:
94101
"""
95102
Validates the anonymization parameters.
96-
103+
97104
Checks:
98105
- k is a positive integer if provided
99106
- l is a positive integer if provided
@@ -109,38 +116,46 @@ def _validate_parameters(self) -> None:
109116
# --- Checks if k is correct ---
110117
if self.k is not None:
111118
if not isinstance(self.k, int):
112-
raise TypeError(f"k must be an integer, but got {self.k!r} instead")
113-
119+
raise TypeError(
120+
f"k must be an integer, but got {self.k!r} instead"
121+
)
122+
114123
if self.k <= 0:
115124
raise ValueError(
116125
f"k must be positive, but got {self.k!r} instead"
117126
)
118-
127+
119128
# --- Checks if l is correct ---
120129
if self.l is not None:
121130
if not isinstance(self.l, int):
122-
raise TypeError(f"l must be an integer, but got {self.l!r} instead")
123-
131+
raise TypeError(
132+
f"l must be an integer, but got {self.l!r} instead"
133+
)
134+
124135
if self.l <= 0:
125136
raise ValueError(
126137
f"l must be positive, but got {self.l!r} instead"
127138
)
128-
139+
129140
# --- Checks if t is correct ---
130141
if self.t is not None:
131142
if not isinstance(self.t, (float, int)):
132-
raise TypeError(f"t must be a float, but got {self.t!r} instead")
133-
143+
raise TypeError(
144+
f"t must be a float, but got {self.t!r} instead"
145+
)
146+
134147
if not 0.0 <= self.t <= 1.0:
135148
raise ValueError(
136149
f"t must be in [0,1], but got {self.t!r} instead"
137150
)
138-
151+
139152
# --- Checks if the suppression limit is correct ---
140153
if self.suppression_limit is not None:
141154
if not isinstance(self.suppression_limit, int):
142-
raise TypeError(f"suppression_limit must be an integer, but got {self.suppression_limit!r} instead")
143-
155+
raise TypeError(
156+
f"suppression_limit must be an integer, but got {self.suppression_limit!r} instead"
157+
)
158+
144159
if not 0 <= self.suppression_limit <= 100:
145160
raise ValueError(
146161
f"suppression_limit must be in [0,100], but got {self.suppression_limit!r} instead"
@@ -150,13 +165,13 @@ def _validate_parameters(self) -> None:
150165
if not isinstance(self.backend, str):
151166
raise TypeError(
152167
f"backend must be a string, but got {self.backend!r} instead!"
153-
)
154-
168+
)
169+
155170
if self.backend not in ["arx", "anjana"]:
156171
raise ValueError(
157172
f"The backend must be either 'arx' or 'anjana', but got {self.backend!r} instead!"
158173
)
159-
174+
160175
def _validate_attributes(self) -> None:
161176
"""
162177
Validates all the attribute lists.
@@ -175,7 +190,7 @@ def _validate_attributes(self) -> None:
175190
"identifiers": self.identifiers,
176191
"quasi_identifiers": self.quasi_identifiers,
177192
"sensitive_attributes": self.sensitive_attributes,
178-
"insensitive_attributes": self.insensitive_attributes
193+
"insensitive_attributes": self.insensitive_attributes,
179194
}
180195

181196
# Checks that the attributes are provided using lists.
@@ -185,10 +200,8 @@ def _validate_attributes(self) -> None:
185200
f"{name} must be a list, but got {attrs!r} instead!"
186201
)
187202
if not all(isinstance(x, str) for x in attrs):
188-
raise TypeError(
189-
f"All entries in {name} must be strings!"
190-
)
191-
203+
raise TypeError(f"All entries in {name} must be strings!")
204+
192205
# --- Checks that the attribute names do not overlap ---
193206
all_attrs = sum(attr_list.values(), [])
194207
if len(all_attrs) != len(set(all_attrs)):
@@ -208,19 +221,19 @@ def _validate_dataset(self) -> None:
208221
TypeError: If the dataset path is not a string.
209222
FileNotFoundError: If the file does not exist at the given path.
210223
"""
211-
224+
212225
# --- Checks that the dataset path is a string ---
213226
if not isinstance(self.data, str):
214227
raise TypeError(
215228
f"The dataset path must be provided as a string, but got {self.data!r} instead!"
216229
)
217-
230+
218231
# --- Checks that the dataset file exists ---
219232
if not os.path.exists(self.data):
220233
raise FileNotFoundError(
221234
f"The dataset could not be located at {self.data!r}!"
222235
)
223-
236+
224237
def _validate_hierarchies(self) -> None:
225238
"""
226239
Validates the hierarchies provided for the quasi-identifiers.
@@ -251,19 +264,19 @@ def _validate_hierarchies(self) -> None:
251264
raise TypeError(
252265
f"Hierarchy quasi-identifier keys must be strings, but got {qid!r} instead!"
253266
)
254-
267+
255268
# --- Checks that the quasi-identifier exists ---
256269
if qid not in self.quasi_identifiers:
257270
raise TypeError(
258271
f"Cannot create hierarchy for {qid!r}, since it is not a quasi-identifier!"
259272
)
260-
273+
261274
# --- Checks that the hierarchy path is a string ---
262275
if not isinstance(hierarchy_path, str):
263276
raise TypeError(
264277
f"The hierarchy path for {qid!r} must be a string, but got {hierarchy_path!r} instead!"
265278
)
266-
279+
267280
# --- Checks that the hierarchy path exists ---
268281
if not os.path.exists(hierarchy_path):
269282
raise FileNotFoundError(
@@ -276,11 +289,11 @@ def _validate_privacy_models(self) -> None:
276289
If sensitive attributes are present, requires that either:
277290
- l-diversity ('l') is specified, or
278291
- t-closeness ('t') is specified
279-
292+
280293
Raises:
281294
ValueError: If sensitive attributes exist but neither 'l' nor 't' is provided.
282295
"""
283296
if self.sensitive_attributes and self.t is None and self.l is None:
284297
raise ValueError(
285298
f"sensitive-attributes={self.sensitive_attributes}, l-Diversity or t-Closeness must be used when anonymizing with sensitive attributes!"
286-
)
299+
)

tests/common.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,18 @@
1-
import pytest
21
import contextlib
3-
from anonymization_manager import *
42
from pathlib import Path
53

4+
import pytest
5+
6+
from anonymization_manager import *
7+
68
TEST_DIR = Path(__file__).parent
7-
PATH=str(TEST_DIR/"test_dataset/data/adult.csv")
8-
HIERARCHY_PATH=TEST_DIR/"test_dataset/hierarchies"
9-
AGE_PATH=str(HIERARCHY_PATH/"age.csv")
10-
COUNTRY_PATH=str(HIERARCHY_PATH/"country.csv")
11-
RACE_PATH=str(HIERARCHY_PATH/"race.csv")
12-
SEX_PATH=str(HIERARCHY_PATH/"sex.csv")
13-
MARITAL_PATH=str(HIERARCHY_PATH/"marital.csv")
14-
OCCUPATION_PATH=str(HIERARCHY_PATH/"occupation.csv")
15-
WORK_CLASS_PATH=str(HIERARCHY_PATH/"workclass.csv")
16-
EDUCATION_PATH=str(HIERARCHY_PATH/"education.csv")
9+
PATH = str(TEST_DIR / "test_dataset/data/adult.csv")
10+
HIERARCHY_PATH = TEST_DIR / "test_dataset/hierarchies"
11+
AGE_PATH = str(HIERARCHY_PATH / "age.csv")
12+
COUNTRY_PATH = str(HIERARCHY_PATH / "country.csv")
13+
RACE_PATH = str(HIERARCHY_PATH / "race.csv")
14+
SEX_PATH = str(HIERARCHY_PATH / "sex.csv")
15+
MARITAL_PATH = str(HIERARCHY_PATH / "marital.csv")
16+
OCCUPATION_PATH = str(HIERARCHY_PATH / "occupation.csv")
17+
WORK_CLASS_PATH = str(HIERARCHY_PATH / "workclass.csv")
18+
EDUCATION_PATH = str(HIERARCHY_PATH / "education.csv")
Lines changed: 35 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,42 @@
11
from tests.common import *
22

3+
34
class TestKAnonymity:
4-
@pytest.mark.parametrize("k", [
5-
(1),
6-
(10),
7-
(40)
8-
])
5+
@pytest.mark.parametrize("k", [(1), (10), (40)])
96
def test_k_anonymity(self, k) -> None:
107
for backend in ["arx", "anjana"]:
11-
config = AnonymizationConfig(
12-
data=PATH,
13-
identifiers=["education-num"],
14-
quasi_identifiers=[
15-
"age",
16-
"native-country",
17-
"race",
18-
"sex",
19-
"marital-status",
20-
"occupation",
21-
"workclass",
22-
"education",
23-
],
24-
sensitive_attributes=[],
25-
insensitive_attributes=[],
26-
hierarchies={
27-
"age": AGE_PATH,
28-
"native-country": COUNTRY_PATH,
29-
"race": RACE_PATH,
30-
"sex": SEX_PATH,
31-
"marital-status": MARITAL_PATH,
32-
"occupation": OCCUPATION_PATH,
33-
"workclass": WORK_CLASS_PATH,
34-
"education": EDUCATION_PATH,
35-
},
36-
k=k,
37-
backend=backend,
38-
)
8+
config = AnonymizationConfig(
9+
data=PATH,
10+
identifiers=["education-num"],
11+
quasi_identifiers=[
12+
"age",
13+
"native-country",
14+
"race",
15+
"sex",
16+
"marital-status",
17+
"occupation",
18+
"workclass",
19+
"education",
20+
],
21+
sensitive_attributes=[],
22+
insensitive_attributes=[],
23+
hierarchies={
24+
"age": AGE_PATH,
25+
"native-country": COUNTRY_PATH,
26+
"race": RACE_PATH,
27+
"sex": SEX_PATH,
28+
"marital-status": MARITAL_PATH,
29+
"occupation": OCCUPATION_PATH,
30+
"workclass": WORK_CLASS_PATH,
31+
"education": EDUCATION_PATH,
32+
},
33+
k=k,
34+
backend=backend,
35+
)
3936

40-
data = AnonymizationManager.anonymize(config)
41-
df = data.get_anonymized_data_as_dataframe()
37+
data = AnonymizationManager.anonymize(config)
38+
df = data.get_anonymized_data_as_dataframe()
4239

43-
# Checks k-anonymity.
44-
group_sizes = df.groupby(config.quasi_identifiers).size()
45-
assert (group_sizes >= k).all()
40+
# Checks k-anonymity.
41+
group_sizes = df.groupby(config.quasi_identifiers).size()
42+
assert (group_sizes >= k).all()

0 commit comments

Comments
 (0)