Skip to content

Commit 5d92cf8

Browse files
jedheaj314AJ (Ashitosh Jedhe)RonShakutaiSharonHart
authored
Fix gliner truncates text (#1805)
* Add failing test for - gliner truncates text and misses names (PII) * Update gliner recognizer to implement basic chunking * Add changes for chunking capabilities including local chunking and call to chunking from gliner recognizer * Remove gliner image redaction test - not required * Rename local text chunker to character based text chunker * Fix rename leftovers * Update doc string * Add test for text without spaces and unicodes * Resolve linting - format code * Add logging to character based text chunker * Update to remove redundant chunk_overlap parameter * Remove chunk size and chunk overlap from GlinerRecognizer constructor * Updated the utilities to use RecognizerResult * Update so that utils methods are part of base chunker * Add chunker factory * Create Lang chain text chunker * Remove Character based inhouse chunker * Fixed - deterministic offset tracking, fail-fast on misalignment * Resolve merge issue * Add chunk parameter validation * Fix chunk size tests * Fix linting * Make langchain splitter mandatory * Add clearer type error - review comment * Fix langchain installation - review comment * Add conditional import of lang chain * Revert to use in-house chunker * Fix line too long (lint) * Fix trailing whitespace lint error * Remove not required comment * Remove gliner extras from e2e tests to fix CI disk space issue * Remove trailing comma in pyproject.toml to match main --------- Co-authored-by: AJ (Ashitosh Jedhe) <[email protected]> Co-authored-by: Ron Shakutai <[email protected]> Co-authored-by: Sharon Hart <[email protected]> Co-authored-by: Ron Shakutai <[email protected]>
1 parent adf9bbe commit 5d92cf8

File tree

9 files changed

+845
-30
lines changed

9 files changed

+845
-30
lines changed
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""Text chunking strategies for handling long texts."""
2+
3+
from presidio_analyzer.chunkers.base_chunker import BaseTextChunker, TextChunk
4+
from presidio_analyzer.chunkers.character_based_text_chunker import (
5+
CharacterBasedTextChunker,
6+
)
7+
from presidio_analyzer.chunkers.text_chunker_provider import TextChunkerProvider
8+
9+
__all__ = [
10+
"BaseTextChunker",
11+
"TextChunk",
12+
"CharacterBasedTextChunker",
13+
"TextChunkerProvider",
14+
]
15+
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
"""Abstract base class for text chunking strategies."""
2+
from abc import ABC, abstractmethod
3+
from dataclasses import dataclass
4+
from typing import TYPE_CHECKING, Callable, List
5+
6+
if TYPE_CHECKING:
7+
from presidio_analyzer import RecognizerResult
8+
9+
10+
@dataclass
class TextChunk:
    """A slice of a larger text together with its location in that text.

    :param text: The characters that make up this chunk
    :param start: Offset of the chunk's first character in the original
        text (inclusive)
    :param end: Offset one past the chunk's last character in the original
        text (exclusive)
    """

    text: str
    start: int
    end: int
22+
23+
24+
class BaseTextChunker(ABC):
    """Abstract base class for text chunking strategies.

    Subclasses implement :meth:`chunk` to split a text into ``TextChunk``
    objects carrying both the chunk content and its position in the original
    text.

    On top of that, this class offers helpers to run a prediction function
    over every chunk, shift the resulting offsets back into the coordinates
    of the original text, and drop duplicate entities reported by
    overlapping chunks.
    """

    @abstractmethod
    def chunk(self, text: str) -> List["TextChunk"]:
        """Split text into chunks with position information.

        :param text: The input text to split
        :return: List of TextChunk objects with text and position data
        """
        pass

    def predict_with_chunking(
        self,
        text: str,
        predict_func: Callable[[str], List["RecognizerResult"]],
    ) -> List["RecognizerResult"]:
        """Run ``predict_func`` over ``text``, chunking it when necessary.

        When :meth:`chunk` yields a single chunk, ``predict_func`` is called
        once on the full text and its result is returned untouched. When it
        yields several chunks, each chunk is processed separately, offsets
        are shifted to the original text and duplicates are removed.

        :param text: Input text to process
        :param predict_func: Function that takes text and returns
            RecognizerResult objects
        :return: List of RecognizerResult with offsets relative to ``text``;
            an empty list when :meth:`chunk` returns no chunks
        """
        chunks = self.chunk(text)
        if not chunks:
            return []
        if len(chunks) == 1:
            # Nothing to merge: skip offset adjustment and deduplication.
            return predict_func(text)

        predictions = self._process_chunks(chunks, predict_func)
        return self.deduplicate_overlapping_entities(predictions)

    def _process_chunks(
        self,
        chunks: List["TextChunk"],
        process_func: Callable[[str], List["RecognizerResult"]],
    ) -> List["RecognizerResult"]:
        """Process text chunks and adjust entity offsets.

        :param chunks: List of TextChunk objects with text and position
            information
        :param process_func: Function that takes chunk text and returns
            RecognizerResult objects
        :return: List of RecognizerResult whose ``start``/``end`` have been
            shifted by the ``start`` of the chunk they were found in
        """
        # Imported here rather than at module level; the module only imports
        # RecognizerResult under TYPE_CHECKING.
        from presidio_analyzer import RecognizerResult

        all_predictions = []
        for chunk in chunks:
            # Build new results instead of mutating the ones returned by
            # process_func, so the caller's objects stay untouched.
            all_predictions.extend(
                RecognizerResult(
                    entity_type=pred.entity_type,
                    start=pred.start + chunk.start,
                    end=pred.end + chunk.start,
                    score=pred.score,
                    analysis_explanation=pred.analysis_explanation,
                    recognition_metadata=pred.recognition_metadata,
                )
                for pred in process_func(chunk.text)
            )
        return all_predictions

    def deduplicate_overlapping_entities(
        self,
        predictions: List["RecognizerResult"],
        overlap_threshold: float = 0.5,
    ) -> List["RecognizerResult"]:
        """Remove duplicate entities from overlapping chunks.

        Two predictions are duplicates when they share an entity type and
        their overlap, divided by the length of the shorter one, is strictly
        greater than ``overlap_threshold``. Among duplicates the highest
        scoring prediction is kept.

        :param predictions: List of RecognizerResult objects
        :param overlap_threshold: Overlap ratio threshold to consider
            duplicates (default: 0.5)
        :return: Deduplicated list of RecognizerResult sorted by start
            position; an empty input is returned as is
        """
        if not predictions:
            return predictions

        # Visit the highest scores first so the best candidate of each
        # duplicate group is the one that gets kept.
        ranked = sorted(predictions, key=lambda p: p.score, reverse=True)

        unique = []
        for pred in ranked:
            if not any(
                self._is_duplicate(pred, kept, overlap_threshold)
                for kept in unique
            ):
                unique.append(pred)

        # Sort by position for consistent output
        return sorted(unique, key=lambda p: p.start)

    @staticmethod
    def _is_duplicate(
        pred: "RecognizerResult",
        kept: "RecognizerResult",
        overlap_threshold: float,
    ) -> bool:
        """Tell whether ``pred`` duplicates the already kept ``kept``.

        :param pred: Candidate prediction
        :param kept: Prediction that has already been retained
        :param overlap_threshold: Overlap ratio that must be exceeded
        :return: True if both have the same entity type and overlap by more
            than ``overlap_threshold`` of the shorter span
        """
        if pred.entity_type != kept.entity_type:
            return False

        overlap_len = min(pred.end, kept.end) - max(pred.start, kept.start)
        if overlap_len <= 0:
            return False

        # A positive overlap means both spans are at least overlap_len long,
        # so the divisor below can never be zero.
        shorter_len = min(pred.end - pred.start, kept.end - kept.start)
        return overlap_len / shorter_len > overlap_threshold
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
"""Character-based text chunker with word boundary preservation.
2+
3+
Based on gliner-spacy implementation:
4+
https://github.com/theirstory/gliner-spacy/blob/main/gliner_spacy/pipeline.py#L60-L96
5+
"""
6+
import logging
7+
from typing import Iterable, List, Tuple
8+
9+
from presidio_analyzer.chunkers.base_chunker import BaseTextChunker, TextChunk
10+
11+
logger = logging.getLogger("presidio-analyzer")
12+
13+
14+
WORD_BOUNDARY_CHARS: Tuple[str, ...] = (" ", "\n")
15+
16+
17+
class CharacterBasedTextChunker(BaseTextChunker):
    """Chunk text by character count, ending chunks on word boundaries."""

    def __init__(
        self,
        chunk_size: int = 250,
        chunk_overlap: int = 50,
        boundary_chars: Iterable[str] | None = None,
    ):
        """Validate and store the chunking parameters.

        Note: a chunk is allowed to grow past chunk_size so that it ends on
        a boundary character instead of in the middle of a word.

        :param chunk_size: Target maximum characters per chunk (must be > 0)
        :param chunk_overlap: Number of characters the next chunk starts
            before the end of the previous one
            (must be >= 0 and < chunk_size)
        :param boundary_chars: Characters that count as word boundaries.
            When None, the module default (space and newline) is used.
        :raises ValueError: If chunk_size or chunk_overlap is out of range
        """
        if chunk_size <= 0:
            logger.error("Invalid chunk_size: %d. Must be greater than 0.", chunk_size)
            raise ValueError("chunk_size must be greater than 0")
        if chunk_overlap >= chunk_size or chunk_overlap < 0:
            logger.error(
                "Invalid chunk_overlap. Must be non-negative and less than chunk_size"
            )
            raise ValueError(
                "chunk_overlap must be non-negative and less than chunk_size"
            )

        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap
        # Callers may pass their own boundary set (e.g. punctuation, tabs);
        # the default stays untouched when they do not.
        if boundary_chars is None:
            self._boundary_chars: Tuple[str, ...] = WORD_BOUNDARY_CHARS
        else:
            self._boundary_chars = tuple(boundary_chars)

    @property
    def chunk_size(self) -> int:
        """Return the configured target chunk size.

        :return: The chunk size
        """
        return self._chunk_size

    @property
    def chunk_overlap(self) -> int:
        """Return the configured overlap between consecutive chunks.

        :return: The chunk overlap
        """
        return self._chunk_overlap

    @property
    def boundary_chars(self) -> Tuple[str, ...]:
        """Return the characters treated as word boundaries."""
        return self._boundary_chars

    def chunk(self, text: str) -> List[TextChunk]:
        """Split text into overlapping chunks that end on word boundaries.

        Each chunk spans at least chunk_size characters (or whatever is left
        of the text) and is then extended up to the next boundary character,
        so chunks may exceed chunk_size. If no boundary character follows
        (e.g. text without spaces, such as CJK languages), the chunk runs to
        the end of the text. Every following chunk starts chunk_overlap
        characters before the end of the previous one.

        :param text: The input text to chunk
        :return: List of TextChunk objects with text and position information;
            empty when text is empty
        """
        if not text:
            logger.debug("Empty text provided, returning empty chunk list")
            return []

        text_length = len(text)
        logger.debug(
            "Chunking text: length=%d, chunk_size=%d, overlap=%d",
            text_length,
            self._chunk_size,
            self._chunk_overlap,
        )

        pieces = []
        begin = 0
        while begin < text_length:
            # Tentative cut point, clamped to the end of the text.
            stop = min(begin + self._chunk_size, text_length)

            # Push the cut forward to the first boundary character at or
            # after it; fall back to the end of the text if there is none.
            stop = next(
                (
                    position
                    for position in range(stop, text_length)
                    if text[position] in self._boundary_chars
                ),
                text_length,
            )

            pieces.append(TextChunk(text=text[begin:stop], start=begin, end=stop))

            # The whole text is covered once a chunk reaches its end.
            if stop >= text_length:
                break
            begin = stop - self._chunk_overlap

        logger.debug("Created %d chunks from text", len(pieces))
        return pieces
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
"""Factory provider for creating text chunkers from configuration."""
2+
3+
import logging
4+
from typing import Any, Dict, Optional, Type
5+
6+
from presidio_analyzer.chunkers.base_chunker import BaseTextChunker
7+
from presidio_analyzer.chunkers.character_based_text_chunker import (
8+
CharacterBasedTextChunker,
9+
)
10+
11+
logger = logging.getLogger("presidio-analyzer")
12+
13+
# Registry mapping chunker type names to classes
14+
_CHUNKER_REGISTRY: Dict[str, Type[BaseTextChunker]] = {
15+
"character": CharacterBasedTextChunker,
16+
}
17+
18+
19+
class TextChunkerProvider:
    """Build text chunkers from a configuration dictionary.

    :param chunker_configuration: Dict with chunker_type and optional
        constructor parameters for that chunker.
        Example::

            {"chunker_type": "character", "chunk_size": 300, "chunk_overlap": 75}

    When no (or an empty) configuration is given, a character-based chunker
    with chunk_size=250 and chunk_overlap=50 is configured.
    """

    def __init__(
        self,
        chunker_configuration: Optional[Dict[str, Any]] = None,
    ):
        if chunker_configuration:
            self.chunker_configuration = chunker_configuration
        else:
            # Default to a safe overlap to avoid boundary losses for
            # cross-chunk entities.
            self.chunker_configuration = {
                "chunker_type": "character",
                "chunk_size": 250,
                "chunk_overlap": 50,
            }

    def create_chunker(self) -> BaseTextChunker:
        """Instantiate the chunker described by the stored configuration.

        :return: A chunker built from the registered class for chunker_type,
            with the remaining configuration entries passed as keyword
            arguments
        :raises ValueError: If chunker_type is not registered, or the
            chunker constructor raises TypeError for the given parameters
        """
        # Work on a copy so popping chunker_type leaves the stored
        # configuration intact for later calls.
        config = self.chunker_configuration.copy()
        chunker_type = config.pop("chunker_type", "character")

        chunker_class = _CHUNKER_REGISTRY.get(chunker_type)
        if chunker_class is None:
            available = list(_CHUNKER_REGISTRY.keys())
            raise ValueError(
                f"Unknown chunker_type '{chunker_type}'. Available: {available}"
            )

        try:
            chunker = chunker_class(**config)
        except TypeError as exc:
            raise ValueError(
                f"Invalid configuration for chunker_type '{chunker_type}': {config}"
            ) from exc
        return chunker
60+

0 commit comments

Comments
 (0)