
Commit 3e6da2c

docs: Example on PII obfuscation (#2459)
* added example on PII obfuscation
* reformatting code
* add in index and fix heading formatting
* add GLINER to PII
* final commit

---------

Signed-off-by: Peter Staar <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
1 parent cd7f7ba commit 3e6da2c

File tree

4 files changed: +472 -3 lines changed


docs/examples/pii_obfuscate.py

Lines changed: 390 additions & 0 deletions
@@ -0,0 +1,390 @@
# %% [markdown]
# Detect and obfuscate PII using a Hugging Face NER model.
#
# What this example does
# - Converts a PDF and saves the original Markdown with embedded images.
# - Runs a HF token-classification (NER) pipeline to detect PII-like entities.
# - Obfuscates occurrences in TextItem and TableItem, replacing each entity
#   with a stable, type-based ID.
#
# Prerequisites
# - Install Docling. Install Transformers: `pip install transformers`.
# - Optional (advanced): install GLiNER for richer PII labels:
#   `pip install gliner`
#   If needed for CPU-only environments:
#   `pip install torch --extra-index-url https://download.pytorch.org/whl/cpu`
# - Optionally, set `HF_MODEL` to a different NER/PII model.
#
# How to run
# - From the repo root: `python docs/examples/pii_obfuscate.py`.
# - To use GLiNER instead of the HF pipeline:
#   `python docs/examples/pii_obfuscate.py --engine gliner`
#   or set the env var `PII_ENGINE=gliner`.
# - The script writes the original and the obfuscated Markdown to `scratch/`.
#
# Notes
# - This is a simple demonstration. For production PII detection, consider
#   specialized models/pipelines and thorough evaluation.
# %%

import argparse
import logging
import os
import re
from pathlib import Path
from typing import Dict, List, Tuple

from docling_core.types.doc import ImageRefMode, TableItem, TextItem
from tabulate import tabulate

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

_log = logging.getLogger(__name__)

IMAGE_RESOLUTION_SCALE = 2.0
# Swap in another HF NER/PII model if desired. For richer PII labels, the
# GLiNER model below (https://huggingface.co/urchade/gliner_multi_pii-v1)
# looks very promising too; see the `--engine gliner` path.
HF_MODEL = "dslim/bert-base-NER"
GLINER_MODEL = "urchade/gliner_multi_pii-v1"

def _build_simple_ner_pipeline():
    """Create a Hugging Face token-classification pipeline for NER.

    Returns a callable like: ner(text) -> List[dict]
    """
    try:
        from transformers import (
            AutoModelForTokenClassification,
            AutoTokenizer,
            pipeline,
        )
    except Exception:
        _log.error("Transformers not installed. Please run: pip install transformers")
        raise

    tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
    model = AutoModelForTokenClassification.from_pretrained(HF_MODEL)
    ner = pipeline(
        "token-classification",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",  # groups subwords into complete entities
        # Note: modern Transformers returns `start`/`end` when possible with aggregation
    )
    return ner

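# A sketch of the output shape from the pipeline above (illustrative values,
# not actual model output): with `aggregation_strategy="simple"`, each result
# is a dict with an aggregated label and, when available, character offsets:
#   ner("Angela Merkel visited Paris.")
#   # -> [{"entity_group": "PER", "word": "Angela Merkel", "start": 0, "end": 13, ...},
#   #     {"entity_group": "LOC", "word": "Paris", "start": 22, "end": 27, ...}]
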
class SimplePiiObfuscator:
    """Tracks PII strings and replaces them with stable IDs per entity type."""

    def __init__(self, ner_callable):
        self.ner = ner_callable
        self.entity_map: Dict[str, str] = {}
        self.counters: Dict[str, int] = {
            "person": 0,
            "org": 0,
            "location": 0,
            "misc": 0,
        }
        # Map model labels to our coarse types
        self.label_map = {
            "PER": "person",
            "PERSON": "person",
            "ORG": "org",
            "ORGANIZATION": "org",
            "LOC": "location",
            "LOCATION": "location",
            "GPE": "location",
            # Fallbacks
            "MISC": "misc",
            "O": "misc",
        }
        # Only obfuscate these by default. Adjust as needed.
        self.allowed_types = {"person", "org", "location"}

    def _next_id(self, typ: str) -> str:
        self.counters[typ] += 1
        return f"{typ}-{self.counters[typ]}"

    def _normalize(self, s: str) -> str:
        return re.sub(r"\s+", " ", s).strip()

    def _extract_entities(self, text: str) -> List[Tuple[str, str]]:
        """Run NER and return a list of (surface_text, type) to obfuscate."""
        if not text:
            return []
        results = self.ner(text)
        # Collect normalized items with optional span info
        items = []
        for r in results:
            raw_label = r.get("entity_group") or r.get("entity") or "MISC"
            label = self.label_map.get(raw_label, "misc")
            if label not in self.allowed_types:
                continue
            start = r.get("start")
            end = r.get("end")
            word = self._normalize(r.get("word") or r.get("text") or "")
            items.append({"label": label, "start": start, "end": end, "word": word})

        found: List[Tuple[str, str]] = []
        # If the pipeline provides character spans, merge consecutive/overlapping
        # entities of the same type into a single span, then take the substring
        # from the original text. This handles cases like subword tokenization
        # where multiple adjacent pieces belong to the same named entity.
        have_spans = any(i["start"] is not None and i["end"] is not None for i in items)
        if have_spans:
            spans = [
                i for i in items if i["start"] is not None and i["end"] is not None
            ]
            # Ensure processing order by start (then end)
            spans.sort(key=lambda x: (x["start"], x["end"]))

            merged = []
            for s in spans:
                if not merged:
                    merged.append(dict(s))
                    continue
                last = merged[-1]
                if s["label"] == last["label"] and s["start"] <= last["end"]:
                    # Merge identical, overlapping, or touching spans of same type
                    last["start"] = min(last["start"], s["start"])
                    last["end"] = max(last["end"], s["end"])
                else:
                    merged.append(dict(s))

            for m in merged:
                surface = self._normalize(text[m["start"] : m["end"]])
                if surface:
                    found.append((surface, m["label"]))

            # Include any items lacking spans as-is (fallback)
            for i in items:
                if i["start"] is None or i["end"] is None:
                    if i["word"]:
                        found.append((i["word"], i["label"]))
        else:
            # Fallback when spans aren't provided: return normalized words
            for i in items:
                if i["word"]:
                    found.append((i["word"], i["label"]))
        return found

    def obfuscate_text(self, text: str) -> str:
        if not text:
            return text

        entities = self._extract_entities(text)
        if not entities:
            return text

        # Deduplicate per text, keep stable global mapping
        unique_words: Dict[str, str] = {}
        for word, label in entities:
            if word not in self.entity_map:
                replacement = self._next_id(label)
                self.entity_map[word] = replacement
            unique_words[word] = self.entity_map[word]

        # Replace longer matches first to avoid partial overlaps
        sorted_pairs = sorted(
            unique_words.items(), key=lambda x: len(x[0]), reverse=True
        )

        def replace_all(s: str, old: str, new: str) -> str:
            # Replaces every occurrence via simple substring matching; for
            # stricter matching, add word boundaries where appropriate
            # (e.g., names). This is a demo, so keep it simple.
            pattern = re.escape(old)
            return re.sub(pattern, new, s)

        obfuscated = text
        for old, new in sorted_pairs:
            obfuscated = replace_all(obfuscated, old, new)
        return obfuscated


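# A minimal usage sketch (illustrative; assumes the model tags both names
# as PER):
#   obf = SimplePiiObfuscator(_build_simple_ner_pipeline())
#   obf.obfuscate_text("Alice met Bob.")  # -> "person-1 met person-2"
#   obf.entity_map  # -> {"Alice": "person-1", "Bob": "person-2"}
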
def _build_gliner_model():
    """Create a GLiNER model for PII-like entity extraction.

    Returns a tuple (model, labels) where model.predict_entities(text, labels)
    yields entities with "text" and "label" fields.
    """
    try:
        from gliner import GLiNER  # type: ignore
    except Exception:
        _log.error(
            "GLiNER not installed. Please run: pip install gliner torch --extra-index-url https://download.pytorch.org/whl/cpu"
        )
        raise

    model = GLiNER.from_pretrained(GLINER_MODEL)
    # Curated set of labels for PII detection. Adjust as needed.
    labels = [
        # "work",
        "booking number",
        "personally identifiable information",
        "driver licence",
        "person",
        "full address",
        "company",
        # "actor",
        # "character",
        "email",
        "passport number",
        "Social Security Number",
        "phone number",
    ]
    return model, labels

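# GLiNER is prompted with the label strings themselves, so the list above
# doubles as the detection vocabulary. A sketch of the expected output shape
# (illustrative values, not actual model output):
#   model.predict_entities("Contact [email protected].", labels)
#   # -> [{"text": "[email protected]", "label": "email", "start": 8, "end": 25, ...}]
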
class AdvancedPIIObfuscator:
    """PII obfuscator powered by GLiNER with fine-grained labels.

    - Uses GLiNER's `predict_entities(text, labels)` to detect entities.
    - Obfuscates with stable IDs per fine-grained label, e.g. `email-1`.
    """

    def __init__(self, gliner_model, labels: List[str]):
        self.model = gliner_model
        self.labels = labels
        self.entity_map: Dict[str, str] = {}
        self.counters: Dict[str, int] = {}

    def _normalize(self, s: str) -> str:
        return re.sub(r"\s+", " ", s).strip()

    def _norm_label(self, label: str) -> str:
        return (
            re.sub(
                r"[^a-z0-9_]+", "_", label.lower().replace(" ", "_").replace("-", "_")
            ).strip("_")
            or "pii"
        )

    def _next_id(self, typ: str) -> str:
        self._ensure_counter(typ)
        self.counters[typ] += 1
        return f"{typ}-{self.counters[typ]}"

    def _ensure_counter(self, typ: str) -> None:
        if typ not in self.counters:
            self.counters[typ] = 0

    def _extract_entities(self, text: str) -> List[Tuple[str, str]]:
        if not text:
            return []
        results = self.model.predict_entities(
            text, self.labels
        )  # expects dicts with text/label
        found: List[Tuple[str, str]] = []
        for r in results:
            label = self._norm_label(str(r.get("label", "pii")))
            surface = self._normalize(str(r.get("text", "")))
            if surface:
                found.append((surface, label))
        return found

    def obfuscate_text(self, text: str) -> str:
        if not text:
            return text
        entities = self._extract_entities(text)
        if not entities:
            return text

        unique_words: Dict[str, str] = {}
        for word, label in entities:
            if word not in self.entity_map:
                replacement = self._next_id(label)
                self.entity_map[word] = replacement
            unique_words[word] = self.entity_map[word]

        # Replace longer matches first to avoid partial overlaps
        sorted_pairs = sorted(
            unique_words.items(), key=lambda x: len(x[0]), reverse=True
        )

        def replace_all(s: str, old: str, new: str) -> str:
            # Replaces every occurrence via simple substring matching
            pattern = re.escape(old)
            return re.sub(pattern, new, s)

        obfuscated = text
        for old, new in sorted_pairs:
            obfuscated = replace_all(obfuscated, old, new)
        return obfuscated

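# Note on the resulting IDs (derived from `_norm_label` above): fine-grained
# labels are lower-cased and underscored, so "Social Security Number" yields
# replacements like "social_security_number-1", and "phone number" yields
# "phone_number-1".
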
def main():
    logging.basicConfig(level=logging.INFO)

    data_folder = Path(__file__).parent / "../../tests/data"
    input_doc_path = data_folder / "pdf/2206.01062.pdf"
    output_dir = Path("scratch")  # created below before saving

    # Choose engine via CLI flag or env var (default: hf)
    parser = argparse.ArgumentParser(description="PII obfuscation example")
    parser.add_argument(
        "--engine",
        choices=["hf", "gliner"],
        default=os.getenv("PII_ENGINE", "hf"),
        help="NER engine: 'hf' (Transformers) or 'gliner' (GLiNER)",
    )
    args = parser.parse_args()

    # Ensure output dir exists
    output_dir.mkdir(parents=True, exist_ok=True)

    # Keep and generate images so Markdown can embed them
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    conv_res = doc_converter.convert(input_doc_path)
    conv_doc = conv_res.document
    doc_filename = conv_res.input.file.name

    # Save markdown with embedded pictures and original text
    md_filename = output_dir / f"{doc_filename}-with-images-orig.md"
    conv_doc.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

    # Build NER pipeline and obfuscator
    if args.engine == "gliner":
        _log.info("Using GLiNER-based AdvancedPIIObfuscator")
        gliner_model, gliner_labels = _build_gliner_model()
        obfuscator = AdvancedPIIObfuscator(gliner_model, gliner_labels)
    else:
        _log.info("Using HF Transformers-based SimplePiiObfuscator")
        ner = _build_simple_ner_pipeline()
        obfuscator = SimplePiiObfuscator(ner)

    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TextItem):
            element.orig = element.text
            element.text = obfuscator.obfuscate_text(element.text)
            # print(element.orig, " => ", element.text)

        elif isinstance(element, TableItem):
            for cell in element.data.table_cells:
                cell.text = obfuscator.obfuscate_text(cell.text)

    # Save markdown with embedded pictures and obfuscated text
    md_filename = output_dir / f"{doc_filename}-with-images-pii-obfuscated.md"
    conv_doc.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

    # Optional: log mapping summary
    if obfuscator.entity_map:
        data = []
        for key, val in obfuscator.entity_map.items():
            data.append([key, val])

        _log.info(
            f"Obfuscated entities:\n\n{tabulate(data)}",
        )


if __name__ == "__main__":
    main()
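
# After a successful run, `scratch/` should contain two Markdown files with
# embedded images, named after the input file: one `*-with-images-orig.md`
# with the original text and one `*-with-images-pii-obfuscated.md` with PII
# replaced by stable IDs.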

mkdocs.yml

Lines changed: 1 addition & 0 deletions
@@ -94,6 +94,7 @@ nav:
      - "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
      - "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py
      - "Accelerator options": examples/run_with_accelerator.py
+     - "Detect and obfuscate PII": examples/pii_obfuscate.py
      - "Simple translation": examples/translate.py
      - examples/backend_csv.ipynb
      - examples/backend_xml_rag.ipynb
