Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ convert-notebooks:
@echo "Converting Python tutorials to notebooks and executing..."
@mkdir -p docs/notebooks
uv run --group notebooks python -m ipykernel install --user --name anonymizer-venv
uv run --group notebooks --group docs jupytext --to ipynb --execute docs/notebook_source/*.py
uv run --group notebooks --group docs jupytext --to ipynb --set-kernel anonymizer-venv --execute docs/notebook_source/*.py
mv docs/notebook_source/*.ipynb docs/notebooks/
@echo "Notebooks created in docs/notebooks/"

Expand Down
3 changes: 1 addition & 2 deletions docs/notebook_source/04_rewriting_biographies.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,7 @@
os.environ["NVIDIA_API_KEY"] = key

# %%
from anonymizer import Anonymizer, AnonymizerConfig, AnonymizerInput, Rewrite, configure_logging
from anonymizer.config.rewrite import PrivacyGoal
from anonymizer import Anonymizer, AnonymizerConfig, AnonymizerInput, PrivacyGoal, Rewrite, configure_logging

configure_logging(enabled=False)

Expand Down
3 changes: 1 addition & 2 deletions docs/notebook_source/05_rewriting_legal_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,7 @@
os.environ["NVIDIA_API_KEY"] = key

# %%
from anonymizer import Anonymizer, AnonymizerConfig, AnonymizerInput, Detect, Rewrite, configure_logging
from anonymizer.config.rewrite import PrivacyGoal
from anonymizer import Anonymizer, AnonymizerConfig, AnonymizerInput, Detect, PrivacyGoal, Rewrite, configure_logging

configure_logging(enabled=False)

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ requires-python = ">=3.11"
license = "Apache-2.0"

dependencies = [
"data-designer==0.5.7",
"data-designer==0.6.0",
"pydantic>=2.9,<3",
"cyclopts>=3",
"pygments>=2.20.0",
Expand Down
2 changes: 1 addition & 1 deletion src/anonymizer/config/default_model_configs/models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ model_configs:
model: openai/gpt-oss-120b
provider: nvidia
inference_parameters:
max_parallel_requests: 2
max_parallel_requests: 16
max_tokens: 16384
temperature: 0.3
top_p: 0.95
Expand Down
25 changes: 17 additions & 8 deletions src/anonymizer/engine/replace/llm_replace_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import json
import logging
from collections import Counter
from dataclasses import dataclass

import pandas as pd
Expand Down Expand Up @@ -169,19 +170,27 @@ def _filter_replacement_map_to_input_entities(
filtered.append(replacement.model_dump())

if logger.isEnabledFor(logging.DEBUG):
Comment thread
lipikaramaswamy marked this conversation as resolved.
raw_pairs = {(r.original, r.label) for r in parsed_map.replacements}
filtered_pairs = {(f["original"], f["label"]) for f in filtered}
unrequested_labels = Counter(label for _, label in (raw_pairs - allowed_pairs))
unfilled_labels = Counter(label for _, label in (allowed_pairs - filtered_pairs))
logger.debug(
"Replacement map record %s: requested=%s raw=%s filtered=%s",
"Replacement map record %s: requested=%d raw=%d filtered=%d%s%s",
record_id or "<unknown>",
sorted(allowed_pairs),
[entry.model_dump() for entry in parsed_map.replacements],
filtered,
len(allowed_pairs),
len(parsed_map.replacements),
len(filtered),
f" unrequested_by_label={dict(unrequested_labels)}" if unrequested_labels else "",
f" unfilled_by_label={dict(unfilled_labels)}" if unfilled_labels else "",
)
elif not filtered and allowed_pairs:
if not filtered and allowed_pairs:
requested_labels = Counter(label for _, label in allowed_pairs)
logger.warning(
"Replacement map empty after filtering for record %s; requested=%s raw=%s",
"Replacement map empty after filtering for record %s; requested=%d raw=%d (requested_by_label=%s)",
record_id or "<unknown>",
sorted(allowed_pairs),
[entry.model_dump() for entry in parsed_map.replacements],
len(allowed_pairs),
len(parsed_map.replacements),
dict(requested_labels),
)
return {"replacements": filtered}

Expand Down
116 changes: 115 additions & 1 deletion tests/engine/test_llm_replace_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,22 @@

from __future__ import annotations

import logging
from unittest.mock import Mock

import pandas as pd
import pytest
from data_designer.config.models import ModelConfig

from anonymizer.config.models import ReplaceModelSelection
from anonymizer.engine.constants import COL_ENTITIES_BY_VALUE, COL_REPLACEMENT_MAP, COL_TEXT
from anonymizer.engine.ndd.adapter import WorkflowRunResult
from anonymizer.engine.replace.llm_replace_workflow import _INTERNAL_COLUMNS, LlmReplaceWorkflow
from anonymizer.engine.replace.llm_replace_workflow import (
_INTERNAL_COLUMNS,
LlmReplaceWorkflow,
_filter_replacement_map_to_input_entities,
)
from anonymizer.engine.schemas.detection import EntitiesByValueSchema


def test_generate_map_only_returns_replacement_map(
Expand Down Expand Up @@ -242,3 +249,110 @@ def test_generate_map_only_strips_internal_prompt_columns_when_no_entities(
adapter.run_workflow.assert_not_called()
for col in _INTERNAL_COLUMNS:
assert col not in result.dataframe.columns, f"workflow-internal column {col!r} leaked into result"


# ---------------------------------------------------------------------------
# PII-free logging regression tests for _filter_replacement_map_to_input_entities
#
# The replacement map filter runs after the LLM proposes substitutions and
# emits a DEBUG summary plus a WARNING when the filter empties out. Both
# log paths must report counts and labels only, never raw entity values.
# ---------------------------------------------------------------------------

_ORIGINAL_PII = ("Jane Doe", "jane.doe@example.com", "+1-555-867-5309")
_SYNTHETIC_PII = ("Maya Chen", "maya.chen@example.com", "+1-555-000-1234")


def _assert_no_pii_in_logs(caplog: pytest.LogCaptureFixture, extra_secrets: tuple[str, ...] = ()) -> None:
    """Assert that no known sensitive value appears in the captured log text.

    Checks every entry of ``_ORIGINAL_PII`` and ``_SYNTHETIC_PII``, followed by
    any caller-supplied ``extra_secrets``, against ``caplog.text`` and fails on
    the first value found there.
    """
    captured_text = caplog.text
    forbidden_values = _ORIGINAL_PII + _SYNTHETIC_PII + tuple(extra_secrets)
    for secret in forbidden_values:
        leaked = secret in captured_text
        assert not leaked, f"PII leak in logs: {secret!r} appeared in:\n{captured_text}"


def test_filter_replacement_map_debug_log_does_not_leak_pii(
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Happy path: the DEBUG summary carries counts but no raw or synthetic entity values."""
    # Three detected entities, each of which the proposed map replaces.
    detected_entities = [
        {"value": "Jane Doe", "labels": ["first_name"]},
        {"value": "jane.doe@example.com", "labels": ["email"]},
        {"value": "+1-555-867-5309", "labels": ["phone_number"]},
    ]
    proposed_replacements = [
        {"original": "Jane Doe", "label": "first_name", "synthetic": "Maya Chen"},
        {"original": "jane.doe@example.com", "label": "email", "synthetic": "maya.chen@example.com"},
        {"original": "+1-555-867-5309", "label": "phone_number", "synthetic": "+1-555-000-1234"},
    ]
    parsed_entities = EntitiesByValueSchema.model_validate({"entities_by_value": detected_entities})
    raw_map = {"replacements": proposed_replacements}

    with caplog.at_level(logging.DEBUG, logger="anonymizer"):
        result = _filter_replacement_map_to_input_entities(
            raw_map=raw_map,
            parsed_entities=parsed_entities,
            record_id="row-abc123",
        )

    # The summary line must be present and report the requested count ...
    for expected_fragment in ("Replacement map record", "requested=3"):
        assert expected_fragment in caplog.text
    # ... without echoing any original or synthetic value.
    _assert_no_pii_in_logs(caplog)
    assert len(result["replacements"]) == 3


def test_filter_replacement_map_anomaly_summaries_do_not_leak_pii(
    caplog: pytest.LogCaptureFixture,
) -> None:
    """The unrequested/unfilled extras in the DEBUG log must name labels, never entity values."""
    # Two entities are requested, but the proposed map covers only the first
    # and adds an "organization_name" entry that was never requested.
    detected_entities = [
        {"value": "Jane Doe", "labels": ["first_name"]},
        {"value": "+1-555-867-5309", "labels": ["phone_number"]},
    ]
    proposed_replacements = [
        {"original": "Jane Doe", "label": "first_name", "synthetic": "Maya Chen"},
        {"original": "Acme Corp", "label": "organization_name", "synthetic": "NovaCorp"},
    ]
    parsed_entities = EntitiesByValueSchema.model_validate({"entities_by_value": detected_entities})
    raw_map = {"replacements": proposed_replacements}

    with caplog.at_level(logging.DEBUG, logger="anonymizer"):
        _filter_replacement_map_to_input_entities(raw_map=raw_map, parsed_entities=parsed_entities, record_id="row-xyz")

    # Both anomaly summaries and the labels they refer to must be logged.
    expected_fragments = (
        "unrequested_by_label",
        "unfilled_by_label",
        "organization_name",
        "phone_number",
    )
    for expected_fragment in expected_fragments:
        assert expected_fragment in caplog.text

    # The unrequested entry's original and synthetic values are secrets too.
    _assert_no_pii_in_logs(caplog, extra_secrets=("Acme Corp", "NovaCorp"))


def test_filter_replacement_map_empty_warning_does_not_leak_pii(
    caplog: pytest.LogCaptureFixture,
) -> None:
    """The empty-after-filtering WARNING must mention counts and labels only."""
    # The single proposed replacement targets an entity that was not requested,
    # so nothing survives the filter.
    detected_entities = [{"value": "Jane Doe", "labels": ["first_name"]}]
    proposed_replacements = [
        {"original": "Acme Corp", "label": "organization_name", "synthetic": "NovaCorp"},
    ]
    parsed_entities = EntitiesByValueSchema.model_validate({"entities_by_value": detected_entities})
    raw_map = {"replacements": proposed_replacements}

    with caplog.at_level(logging.WARNING, logger="anonymizer"):
        result = _filter_replacement_map_to_input_entities(
            raw_map=raw_map,
            parsed_entities=parsed_entities,
            record_id="row-empty",
        )

    for expected_fragment in ("Replacement map empty after filtering", "first_name"):
        assert expected_fragment in caplog.text
    _assert_no_pii_in_logs(caplog, extra_secrets=("Acme Corp", "NovaCorp"))
    assert result == {"replacements": []}
Loading
Loading