Skip to content
Merged
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
6c82ee7
Add failing test for - gliner truncates text and misses names (PII)
Nov 25, 2025
b04d9c7
Update gliner recognizer to implement basic chunking
Nov 25, 2025
e0eb745
Add changes for chunking capabilities including local chunking and cal…
Nov 25, 2025
71fb611
Remove gliner image redaction test - not required
Nov 26, 2025
c986737
Rename local text chunker to character based text chunker
Nov 26, 2025
ea49b70
Fix rename leftovers
Nov 26, 2025
83e2bd4
Update doc string
Nov 27, 2025
5553245
Add test for text without spaces and unicodes
Dec 2, 2025
0d53ce1
Resolve linting - format code
Dec 2, 2025
c1ae52f
Merge branch 'main' into jedheaj314/1569-fix-gliner-truncates-text
jedheaj314 Dec 3, 2025
560021c
Add logging to character based text chunker
Dec 3, 2025
1556d73
Update to remove redundant chunk_overlap parameter
Dec 3, 2025
9324450
Merge branch 'main' into jedheaj314/1569-fix-gliner-truncates-text
RonShakutai Dec 4, 2025
c073bb7
Merge branch 'main' into jedheaj314/1569-fix-gliner-truncates-text
SharonHart Dec 17, 2025
d722aaa
Remove chunk size and chunk overlap from GlinerRecognizer constructor
Jan 6, 2026
8f637de
Updated the utilities to use RecognizerResult
Jan 7, 2026
86f16c1
Update so that utils methods are part of base chunker
Jan 8, 2026
0aea1e1
Add chunker factory
Jan 11, 2026
3f4e5b8
Create Lang chain text chunker
Jan 11, 2026
72de850
Remove Character based inhouse chunker
Jan 12, 2026
afc610a
Fixed - deterministic offset tracking, fail-fast on misalignment
Jan 13, 2026
e89ca8e
Merge branch 'main' into jedheaj314/1569-fix-gliner-truncates-text
jedheaj314 Jan 13, 2026
bcc2868
Resolve merge issue
Jan 13, 2026
4858981
Add chunk parameter validation
Jan 13, 2026
8562844
Fix chunk size tests
Jan 13, 2026
d884818
Fix linting
Jan 13, 2026
43f65a0
Make langchain splitter mandatory
Jan 14, 2026
6bf5ee8
Add clearer type error - review comment
Jan 14, 2026
bf3ac56
Merge branch 'main' into jedheaj314/1569-fix-gliner-truncates-text
RonShakutai Jan 15, 2026
b818e2b
Fix langchain installation - review comment
Jan 15, 2026
b6eca56
Merge branch 'main' into jedheaj314/1569-fix-gliner-truncates-text
RonShakutai Jan 15, 2026
69610e4
Merge remote-tracking branch 'origin/jedheaj314/1569-fix-gliner-trunc…
Jan 15, 2026
58b33d9
Add conditional import of lang chain
Jan 15, 2026
ef5a566
Merge branch 'main' into jedheaj314/1569-fix-gliner-truncates-text
jedheaj314 Jan 20, 2026
3b916e0
Revert to use in-house chunker
Jan 20, 2026
34ea19a
Merge remote branch with in-house chunker changes
Jan 20, 2026
fe9ebb7
Fix line too long (lint)
Jan 20, 2026
f8a6017
Fix trailing whitespace lint error
Jan 20, 2026
b102a52
Remove not required comment
Jan 22, 2026
60816f3
Remove gliner extras from e2e tests to fix CI disk space issue
Jan 22, 2026
2ce4662
Remove trailing comma in pyproject.toml to match main
Jan 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion e2e-tests/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
requests>=2.32.4
pytest
-e ../presidio-analyzer[langextract]
-e ../presidio-analyzer[langextract,gliner]
-e ../presidio-anonymizer
13 changes: 13 additions & 0 deletions presidio-analyzer/presidio_analyzer/chunkers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""Text chunking strategies for handling long texts."""

from presidio_analyzer.chunkers.base_chunker import BaseTextChunker, TextChunk
from presidio_analyzer.chunkers.langchain_text_chunker import LangChainTextChunker
from presidio_analyzer.chunkers.text_chunker_provider import TextChunkerProvider

# Names re-exported as the public API of the ``chunkers`` package; each one
# is imported at the top of this module.
__all__ = [
"BaseTextChunker",
"TextChunk",
"LangChainTextChunker",
"TextChunkerProvider",
]

140 changes: 140 additions & 0 deletions presidio-analyzer/presidio_analyzer/chunkers/base_chunker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
"""Abstract base class for text chunking strategies."""
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING, Callable, List

if TYPE_CHECKING:
from presidio_analyzer import RecognizerResult


@dataclass
class TextChunk:
    """A slice of the original text together with its character span.

    :param text: Content of the chunk
    :param start: Offset of the chunk's first character in the original
        text (inclusive)
    :param end: Offset just past the chunk's last character in the original
        text (exclusive)
    """

    text: str
    start: int
    end: int


class BaseTextChunker(ABC):
    """Abstract base class for text chunking strategies.

    Subclasses must implement the chunk() method to split text into
    TextChunk objects that include both content and position information.

    On top of that, this class provides the shared plumbing for running a
    prediction function over every chunk, shifting the resulting offsets back
    into the coordinates of the original text, and removing duplicate
    entities reported by overlapping chunks.
    """

    @abstractmethod
    def chunk(self, text: str) -> List["TextChunk"]:
        """Split text into chunks with position information.

        :param text: The input text to split
        :return: List of TextChunk objects with text and position data
        """

    def predict_with_chunking(
        self,
        text: str,
        predict_func: Callable[[str], List["RecognizerResult"]],
    ) -> List["RecognizerResult"]:
        """Process text with automatic chunking for long texts.

        When chunk() yields a single chunk, predict_func is called once on the
        full text. When it yields several, predict_func is called per chunk
        and the results are offset-adjusted and deduplicated.

        :param text: Input text to process
        :param predict_func: Function that takes text and returns
            RecognizerResult objects
        :return: List of RecognizerResult with offsets relative to ``text``;
            an empty list when chunk() produces no chunks
        """
        chunks = self.chunk(text)
        if not chunks:
            return []
        if len(chunks) == 1:
            # Run on the original text (not chunk.text) so the returned
            # offsets are already relative to ``text`` and need no shifting.
            return predict_func(text)

        predictions = self._process_chunks(chunks, predict_func)
        return self.deduplicate_overlapping_entities(predictions)

    def _process_chunks(
        self,
        chunks: List["TextChunk"],
        process_func: Callable[[str], List["RecognizerResult"]],
    ) -> List["RecognizerResult"]:
        """Process text chunks and adjust entity offsets.

        The objects returned by process_func are modified in place: their
        ``start`` and ``end`` are shifted by the start offset of the chunk
        they were found in.

        :param chunks: List of TextChunk objects with text and position
            information
        :param process_func: Function that takes chunk text and returns
            RecognizerResult objects
        :return: List of RecognizerResult with adjusted offsets
        """
        all_predictions = []

        for chunk in chunks:
            chunk_predictions = process_func(chunk.text)

            # Translate chunk-relative offsets into original-text offsets.
            for pred in chunk_predictions:
                pred.start += chunk.start
                pred.end += chunk.start

            all_predictions.extend(chunk_predictions)

        return all_predictions

    def deduplicate_overlapping_entities(
        self,
        predictions: List["RecognizerResult"],
        overlap_threshold: float = 0.5,
    ) -> List["RecognizerResult"]:
        """Remove duplicate entities from overlapping chunks.

        Two predictions are duplicates when they share an entity type and
        their overlap, measured against the shorter of the two spans, is
        strictly greater than ``overlap_threshold``. Of a duplicate pair the
        higher-scoring prediction is kept.

        :param predictions: List of RecognizerResult objects
        :param overlap_threshold: Overlap ratio threshold to consider
            duplicates (default: 0.5)
        :return: Deduplicated list of RecognizerResult sorted by position
        """
        if not predictions:
            return predictions

        # Visit highest scores first so the best candidate of each duplicate
        # group is the one that gets kept.
        by_score = sorted(predictions, key=lambda p: p.score, reverse=True)
        unique = []

        for pred in by_score:
            if not any(
                self._is_duplicate(pred, kept, overlap_threshold)
                for kept in unique
            ):
                unique.append(pred)

        # Sort by position for consistent output
        return sorted(unique, key=lambda p: p.start)

    @staticmethod
    def _is_duplicate(
        pred: "RecognizerResult",
        kept: "RecognizerResult",
        overlap_threshold: float,
    ) -> bool:
        """Tell whether ``pred`` duplicates the already accepted ``kept``.

        :param pred: Candidate prediction
        :param kept: Prediction that has already been accepted
        :param overlap_threshold: Ratio the overlap must exceed
        :return: True if both share an entity type and overlap sufficiently
        """
        if pred.entity_type != kept.entity_type:
            return False

        overlap_len = min(pred.end, kept.end) - max(pred.start, kept.start)
        if overlap_len <= 0:
            return False

        # A positive overlap implies both spans have positive length, so the
        # division below cannot hit zero; zero-length spans never overlap
        # anything and are therefore never treated as duplicates.
        shorter_len = min(pred.end - pred.start, kept.end - kept.start)
        return overlap_len / shorter_len > overlap_threshold
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""Text chunker using LangChain's RecursiveCharacterTextSplitter.

Requires: pip install langchain-text-splitters
"""

from typing import List

from langchain_text_splitters import RecursiveCharacterTextSplitter

from presidio_analyzer.chunkers.base_chunker import BaseTextChunker, TextChunk


class LangChainTextChunker(BaseTextChunker):
    """Text chunker using LangChain's RecursiveCharacterTextSplitter.

    Uses separator hierarchy: paragraph -> line -> word -> char.
    Requires: pip install langchain-text-splitters
    """

    def __init__(self, chunk_size: int = 250, chunk_overlap: int = 50):
        """Initialize the chunker.

        :param chunk_size: Maximum characters per chunk
        :param chunk_overlap: Characters to overlap between chunks
        :raises ValueError: If chunk_size is not positive, chunk_overlap is
            negative, or chunk_overlap is not smaller than chunk_size
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if chunk_overlap < 0:
            raise ValueError("chunk_overlap cannot be negative")
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")

        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap
        self._splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    @property
    def chunk_size(self) -> int:
        """Get the chunk size."""
        return self._chunk_size

    @property
    def chunk_overlap(self) -> int:
        """Get the chunk overlap."""
        return self._chunk_overlap

    def chunk(self, text: str) -> List[TextChunk]:
        """Split text into chunks with position information.

        The splitter only returns chunk strings, so each chunk's offset is
        recovered by searching for it in ``text`` starting from a running
        cursor. The cursor prevents a repeated chunk from being matched at
        an earlier occurrence.

        :param text: The input text to chunk
        :return: List of TextChunk objects; empty when ``text`` is empty
        :raises ValueError: If a chunk returned by the splitter cannot be
            located in ``text`` at or after the cursor
        """
        if not text:
            return []

        chunks: List[TextChunk] = []
        cursor = 0
        for chunk_text in self._splitter.split_text(text):
            # str.find returns -1 or an index >= cursor, so a successful
            # match can never move backwards; only -1 needs handling.
            offset = text.find(chunk_text, cursor)
            if offset == -1:
                raise ValueError(
                    "Chunk text not found in source; "
                    "chunking misalignment detected"
                )

            end = offset + len(chunk_text)
            chunks.append(TextChunk(text=chunk_text, start=offset, end=end))

            # The next chunk may start inside the configured overlap window,
            # so step back by the overlap, but never before this chunk's own
            # start (possible when the chunk is shorter than the overlap).
            cursor = max(offset, end - self._chunk_overlap)

        return chunks
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Factory provider for creating text chunkers from configuration."""

import logging
from typing import Any, Dict, Optional, Type

from presidio_analyzer.chunkers.base_chunker import BaseTextChunker
from presidio_analyzer.chunkers.langchain_text_chunker import LangChainTextChunker

logger = logging.getLogger("presidio-analyzer")

# Registry mapping chunker type names to classes.
# Keys are the values accepted for "chunker_type" in a chunker configuration;
# TextChunkerProvider.create_chunker looks the class up here and reports these
# keys as the available options when an unknown type is requested.
_CHUNKER_REGISTRY: Dict[str, Type[BaseTextChunker]] = {
"langchain": LangChainTextChunker,
}


class TextChunkerProvider:
    """Build text chunker instances from a configuration dictionary.

    :param chunker_configuration: Dict holding ``chunker_type`` plus any
        keyword arguments for that chunker's constructor.
        Example::

            {"chunker_type": "langchain", "chunk_size": 300, "chunk_overlap": 75}

    When no configuration is given, the langchain chunker is used with
    default params tuned for boundary coverage (chunk_size=250,
    chunk_overlap=50).
    Requires: pip install langchain-text-splitters
    """

    def __init__(
        self,
        chunker_configuration: Optional[Dict[str, Any]] = None,
    ):
        """Store the given configuration, or the defaults if it is falsy."""
        if chunker_configuration:
            self.chunker_configuration = chunker_configuration
        else:
            # Default to a safe overlap to avoid boundary losses for
            # cross-chunk entities.
            self.chunker_configuration = {
                "chunker_type": "langchain",
                "chunk_size": 250,
                "chunk_overlap": 50,
            }

    def create_chunker(self) -> BaseTextChunker:
        """Create a text chunker instance from configuration.

        :return: A new chunker of the configured type
        :raises ValueError: If ``chunker_type`` is not registered, or the
            remaining configuration is rejected by the chunker constructor
            with a TypeError
        """
        # Work on a copy so popping the type does not alter stored config.
        settings = self.chunker_configuration.copy()
        chunker_type = settings.pop("chunker_type", "langchain")

        chunker_class = _CHUNKER_REGISTRY.get(chunker_type)
        if chunker_class is None:
            raise ValueError(
                f"Unknown chunker_type '{chunker_type}'. "
                f"Available: {list(_CHUNKER_REGISTRY.keys())}"
            )

        try:
            chunker = chunker_class(**settings)
        except TypeError as exc:
            raise ValueError(
                f"Invalid configuration for chunker_type '{chunker_type}': {settings}"
            ) from exc
        else:
            return chunker

Loading
Loading