Skip to content
Merged
Show file tree
Hide file tree
Changes from 39 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
6c82ee7
Add failing test for - gliner truncates text and misses names (PII)
Nov 25, 2025
b04d9c7
Update gliner recognizer to implement basic chunking
Nov 25, 2025
e0eb745
Add changes for chunking capabilities including local chunking and cal…
Nov 25, 2025
71fb611
Remove gliner image redaction test - not required
Nov 26, 2025
c986737
Rename local text chunker to character based text chunker
Nov 26, 2025
ea49b70
Fix rename leftovers
Nov 26, 2025
83e2bd4
Update doc string
Nov 27, 2025
5553245
Add test for text without spaces and unicodes
Dec 2, 2025
0d53ce1
Resolve linting - format code
Dec 2, 2025
c1ae52f
Merge branch 'main' into jedheaj314/1569-fix-gliner-truncates-text
jedheaj314 Dec 3, 2025
560021c
Add logging to character based text chunker
Dec 3, 2025
1556d73
Update to remove redundant chunk_overlap parameter
Dec 3, 2025
9324450
Merge branch 'main' into jedheaj314/1569-fix-gliner-truncates-text
RonShakutai Dec 4, 2025
c073bb7
Merge branch 'main' into jedheaj314/1569-fix-gliner-truncates-text
SharonHart Dec 17, 2025
d722aaa
Remove chunk size and chunk overlap from GlinerRecognizer constructor
Jan 6, 2026
8f637de
Updated the utilities to use RecognizerResult
Jan 7, 2026
86f16c1
Update so that utils methods are part of base chunker
Jan 8, 2026
0aea1e1
Add chunker factory
Jan 11, 2026
3f4e5b8
Create Lang chain text chunker
Jan 11, 2026
72de850
Remove Character based inhouse chunker
Jan 12, 2026
afc610a
Fixed - deterministic offset tracking, fail-fast on misalignment
Jan 13, 2026
e89ca8e
Merge branch 'main' into jedheaj314/1569-fix-gliner-truncates-text
jedheaj314 Jan 13, 2026
bcc2868
Resolve merge issue
Jan 13, 2026
4858981
Add chunk parameter validation
Jan 13, 2026
8562844
Fix chunk size tests
Jan 13, 2026
d884818
Fix linting
Jan 13, 2026
43f65a0
Make langchain splitter mandatory
Jan 14, 2026
6bf5ee8
Add clearer type error - review comment
Jan 14, 2026
bf3ac56
Merge branch 'main' into jedheaj314/1569-fix-gliner-truncates-text
RonShakutai Jan 15, 2026
b818e2b
Fix langchain installation - review comment
Jan 15, 2026
b6eca56
Merge branch 'main' into jedheaj314/1569-fix-gliner-truncates-text
RonShakutai Jan 15, 2026
69610e4
Merge remote-tracking branch 'origin/jedheaj314/1569-fix-gliner-trunc…
Jan 15, 2026
58b33d9
Add conditional import of lang chain
Jan 15, 2026
ef5a566
Merge branch 'main' into jedheaj314/1569-fix-gliner-truncates-text
jedheaj314 Jan 20, 2026
3b916e0
Revert to use in-house chunker
Jan 20, 2026
34ea19a
Merge remote branch with in-house chunker changes
Jan 20, 2026
fe9ebb7
Fix line too long (lint)
Jan 20, 2026
f8a6017
Fix trailing whitespace lint error
Jan 20, 2026
b102a52
Remove not required comment
Jan 22, 2026
60816f3
Remove gliner extras from e2e tests to fix CI disk space issue
Jan 22, 2026
2ce4662
Remove trailing comma in pyproject.toml to match main
Jan 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion e2e-tests/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
requests>=2.32.4
pytest
-e ../presidio-analyzer[langextract]
-e ../presidio-analyzer[langextract,gliner]
-e ../presidio-anonymizer
15 changes: 15 additions & 0 deletions presidio-analyzer/presidio_analyzer/chunkers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""Text chunking strategies for handling long texts."""

from presidio_analyzer.chunkers.base_chunker import BaseTextChunker, TextChunk
from presidio_analyzer.chunkers.character_based_text_chunker import (
CharacterBasedTextChunker,
)
from presidio_analyzer.chunkers.text_chunker_provider import TextChunkerProvider

__all__ = [
"BaseTextChunker",
"TextChunk",
"CharacterBasedTextChunker",
"TextChunkerProvider",
]

146 changes: 146 additions & 0 deletions presidio-analyzer/presidio_analyzer/chunkers/base_chunker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
"""Abstract base class for text chunking strategies."""
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING, Callable, List

if TYPE_CHECKING:
from presidio_analyzer import RecognizerResult


@dataclass
class TextChunk:
    """A slice of the original text together with its character offsets.

    :param text: Content of the chunk
    :param start: Offset in the original text where the chunk begins
        (inclusive)
    :param end: Offset in the original text where the chunk stops
        (exclusive)
    """

    text: str
    start: int
    end: int


class BaseTextChunker(ABC):
    """Abstract base class for text chunking strategies.

    Subclasses implement :meth:`chunk` to split text into ``TextChunk``
    objects that carry both content and position information.

    The base class supplies the shared plumbing around that: running a
    prediction function on every chunk, shifting the resulting offsets back
    into the coordinates of the full text, and removing duplicate entities
    reported by overlapping chunks.
    """

    @abstractmethod
    def chunk(self, text: str) -> List[TextChunk]:
        """Split text into chunks with position information.

        :param text: The input text to split
        :return: List of TextChunk objects with text and position data
        """
        pass

    def predict_with_chunking(
        self,
        text: str,
        predict_func: Callable[[str], List["RecognizerResult"]],
    ) -> List["RecognizerResult"]:
        """Run ``predict_func`` over ``text``, chunking it when needed.

        When :meth:`chunk` yields no chunks an empty list is returned and
        ``predict_func`` is not called. When it yields exactly one chunk,
        ``predict_func`` is called once on the full ``text`` and its result
        is returned as-is. Otherwise every chunk is predicted separately and
        the results are merged, offset-adjusted and deduplicated.

        :param text: Input text to process
        :param predict_func: Function that takes text and returns
            RecognizerResult objects
        :return: List of RecognizerResult with offsets relative to ``text``
        """
        chunks = self.chunk(text)
        if not chunks:
            return []
        if len(chunks) == 1:
            # Nothing to merge: predicting on the full text keeps the
            # offsets valid without any adjustment.
            return predict_func(text)

        predictions = self._process_chunks(chunks, predict_func)
        return self.deduplicate_overlapping_entities(predictions)

    def _process_chunks(
        self,
        chunks: List[TextChunk],
        process_func: Callable[[str], List["RecognizerResult"]],
    ) -> List["RecognizerResult"]:
        """Process text chunks and adjust entity offsets.

        :param chunks: List of TextChunk objects with text and position
            information
        :param process_func: Function that takes chunk text and returns
            RecognizerResult objects
        :return: List of RecognizerResult with offsets shifted by each
            chunk's start position
        """
        # Imported here rather than at module level, where the import is
        # TYPE_CHECKING-only (presumably to avoid a circular import).
        from presidio_analyzer import RecognizerResult

        all_predictions = []

        for chunk in chunks:
            chunk_predictions = process_func(chunk.text)

            # Build new RecognizerResult objects instead of editing the
            # returned ones, so the caller's predictions are not mutated.
            for pred in chunk_predictions:
                adjusted_pred = RecognizerResult(
                    entity_type=pred.entity_type,
                    start=pred.start + chunk.start,
                    end=pred.end + chunk.start,
                    score=pred.score,
                    analysis_explanation=pred.analysis_explanation,
                    recognition_metadata=pred.recognition_metadata,
                )
                all_predictions.append(adjusted_pred)

        return all_predictions

    def deduplicate_overlapping_entities(
        self,
        predictions: List["RecognizerResult"],
        overlap_threshold: float = 0.5,
    ) -> List["RecognizerResult"]:
        """Remove duplicate entities from overlapping chunks.

        Two predictions are duplicates when they share an entity type and
        the length of their overlap, divided by the length of the shorter
        of the two spans, is strictly greater than ``overlap_threshold``.
        Of each group of duplicates the highest-scoring one is kept.

        :param predictions: List of RecognizerResult objects
        :param overlap_threshold: Overlap ratio above which two same-type
            predictions count as duplicates (default: 0.5)
        :return: Deduplicated list of RecognizerResult sorted by start
            position; an empty input is returned unchanged
        """
        if not predictions:
            return predictions

        # Visit the best-scoring predictions first so that, within a group
        # of duplicates, the highest score is the one already kept.
        sorted_preds = sorted(predictions, key=lambda p: p.score, reverse=True)
        unique: List["RecognizerResult"] = []

        for pred in sorted_preds:
            is_duplicate = False
            for kept in unique:
                if pred.entity_type != kept.entity_type:
                    continue

                overlap_len = min(pred.end, kept.end) - max(pred.start, kept.start)
                if overlap_len <= 0:
                    # Spans are disjoint (or merely touching).
                    continue

                # A positive overlap implies both spans are non-empty, so
                # the divisor is always > 0 and needs no separate guard.
                shorter_len = min(pred.end - pred.start, kept.end - kept.start)
                if overlap_len / shorter_len > overlap_threshold:
                    is_duplicate = True
                    break

            if not is_duplicate:
                unique.append(pred)

        # Sort by position for consistent output
        return sorted(unique, key=lambda p: p.start)
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
"""Character-based text chunker with word boundary preservation.

Based on gliner-spacy implementation:
https://github.com/theirstory/gliner-spacy/blob/main/gliner_spacy/pipeline.py#L60-L96
"""
import logging
from typing import Iterable, List, Tuple

from presidio_analyzer.chunkers.base_chunker import BaseTextChunker, TextChunk

logger = logging.getLogger("presidio-analyzer")


WORD_BOUNDARY_CHARS: Tuple[str, ...] = (" ", "\n")


class CharacterBasedTextChunker(BaseTextChunker):
    """Chunker that cuts text by character count without splitting words."""

    def __init__(
        self,
        chunk_size: int = 250,
        chunk_overlap: int = 50,
        boundary_chars: Iterable[str] | None = None,
    ):
        """Set up the chunker and validate its size parameters.

        A chunk is allowed to grow past ``chunk_size`` so that it ends on a
        boundary character; when that happens the effective overlap with the
        next chunk can differ from ``chunk_overlap``.

        :param chunk_size: Target maximum characters per chunk (must be > 0)
        :param chunk_overlap: Target characters shared by consecutive chunks
            (must be >= 0 and < chunk_size)
        :param boundary_chars: Characters treated as word boundaries.
            ``None`` selects the module default, ``WORD_BOUNDARY_CHARS``.
        :raises ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of
            range
        """
        if chunk_size <= 0:
            logger.error("Invalid chunk_size: %d. Must be greater than 0.", chunk_size)
            raise ValueError("chunk_size must be greater than 0")

        if chunk_overlap < 0 or chunk_overlap >= chunk_size:
            logger.error(
                "Invalid chunk_overlap. Must be non-negative and less than chunk_size"
            )
            raise ValueError(
                "chunk_overlap must be non-negative and less than chunk_size"
            )

        # Callers may widen the boundary set (e.g. punctuation, tabs); the
        # module-level default applies only when nothing is supplied.
        if boundary_chars is None:
            resolved_boundaries: Tuple[str, ...] = WORD_BOUNDARY_CHARS
        else:
            resolved_boundaries = tuple(boundary_chars)

        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap
        self._boundary_chars: Tuple[str, ...] = resolved_boundaries

    @property
    def chunk_size(self) -> int:
        """Return the configured target chunk size, in characters."""
        return self._chunk_size

    @property
    def chunk_overlap(self) -> int:
        """Return the configured target overlap, in characters."""
        return self._chunk_overlap

    @property
    def boundary_chars(self) -> Tuple[str, ...]:
        """Return the characters treated as word boundaries."""
        return self._boundary_chars

    def chunk(self, text: str) -> List[TextChunk]:
        """Split text into overlapping chunks that end on word boundaries.

        Each chunk starts ``chunk_size`` characters wide and is then extended
        forward until its end lands on one of the configured boundary
        characters or on the end of the text, so chunks may exceed
        ``chunk_size``. Text containing no boundary characters at all (e.g.
        unspaced CJK text) therefore ends up in a single chunk that runs to
        the end of the text.

        :param text: The input text to chunk
        :return: List of TextChunk objects with text and position
            information; empty when ``text`` is empty
        """
        if not text:
            logger.debug("Empty text provided, returning empty chunk list")
            return []

        text_len = len(text)
        logger.debug(
            "Chunking text: length=%d, chunk_size=%d, overlap=%d",
            text_len,
            self._chunk_size,
            self._chunk_overlap,
        )

        pieces: List[TextChunk] = []
        pos = 0

        while pos < text_len:
            # Tentative cut one chunk_size ahead, clamped to the text end.
            cut = min(text_len, pos + self._chunk_size)

            # Slide the cut forward until it sits on a boundary character.
            while cut < text_len and text[cut] not in self._boundary_chars:
                cut += 1

            pieces.append(TextChunk(text=text[pos:cut], start=pos, end=cut))

            if cut >= text_len:
                # The whole text is covered; no further chunk is needed.
                break

            # Step back by the overlap so consecutive chunks share context.
            pos = cut - self._chunk_overlap

        logger.debug("Created %d chunks from text", len(pieces))
        return pieces
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""Factory provider for creating text chunkers from configuration."""

import logging
from typing import Any, Dict, Optional, Type

from presidio_analyzer.chunkers.base_chunker import BaseTextChunker
from presidio_analyzer.chunkers.character_based_text_chunker import (
CharacterBasedTextChunker,
)

logger = logging.getLogger("presidio-analyzer")

# Registry mapping chunker type names to classes
_CHUNKER_REGISTRY: Dict[str, Type[BaseTextChunker]] = {
"character": CharacterBasedTextChunker,
}


class TextChunkerProvider:
    """Build text chunkers from a configuration dictionary.

    :param chunker_configuration: Dict holding ``chunker_type`` plus any
        keyword arguments for that chunker's constructor.
        Example::

            {"chunker_type": "character", "chunk_size": 300, "chunk_overlap": 75}

    When no configuration (or an empty one) is given, a character-based
    chunker with chunk_size=250 and chunk_overlap=50 is configured.
    """

    def __init__(
        self,
        chunker_configuration: Optional[Dict[str, Any]] = None,
    ):
        """Store the given configuration, or the default when none is given."""
        if not chunker_configuration:
            # The default keeps a generous overlap so that entities sitting
            # on a chunk boundary are still seen whole by one of the chunks.
            chunker_configuration = {
                "chunker_type": "character",
                "chunk_size": 250,
                "chunk_overlap": 50,
            }
        self.chunker_configuration = chunker_configuration

    def create_chunker(self) -> BaseTextChunker:
        """Instantiate the chunker described by the stored configuration.

        :return: A new chunker built from the registered class for
            ``chunker_type`` ("character" when the key is absent)
        :raises ValueError: If ``chunker_type`` is not registered, or the
            chunker's constructor raises ``TypeError`` for the remaining
            settings
        """
        # Work on a copy so that popping the type leaves the stored
        # configuration intact for later calls.
        params = self.chunker_configuration.copy()
        kind = params.pop("chunker_type", "character")

        chunker_cls = _CHUNKER_REGISTRY.get(kind)
        if chunker_cls is None:
            raise ValueError(
                f"Unknown chunker_type '{kind}'. "
                f"Available: {list(_CHUNKER_REGISTRY)}"
            )

        try:
            return chunker_cls(**params)
        except TypeError as exc:
            raise ValueError(
                f"Invalid configuration for chunker_type '{kind}': {params}"
            ) from exc

Loading
Loading