microsoft
diff --git a/‎presidio-analyzer/presidio_analyzer/chunkers/__init__.py‎
Lines changed: 15 additions & 0 deletions b/‎presidio-analyzer/presidio_analyzer/chunkers/__init__.py‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎presidio-analyzer/presidio_analyzer/chunkers/base_chunker.py‎
Lines changed: 146 additions & 0 deletions b/‎presidio-analyzer/presidio_analyzer/chunkers/base_chunker.py‎
Lines changed: 146 additions & 0 deletions
diff --git a/‎presidio-analyzer/presidio_analyzer/chunkers/character_based_text_chunker.py‎
Lines changed: 123 additions & 0 deletions b/‎presidio-analyzer/presidio_analyzer/chunkers/character_based_text_chunker.py‎
Lines changed: 123 additions & 0 deletions
diff --git a/‎presidio-analyzer/presidio_analyzer/chunkers/text_chunker_provider.py‎
Lines changed: 60 additions & 0 deletions b/‎presidio-analyzer/presidio_analyzer/chunkers/text_chunker_provider.py‎
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,15 @@
+"""Text chunking strategies for handling long texts."""
+
+from presidio_analyzer.chunkers.base_chunker import BaseTextChunker, TextChunk
+from presidio_analyzer.chunkers.character_based_text_chunker import (
+    CharacterBasedTextChunker,
+)
+from presidio_analyzer.chunkers.text_chunker_provider import TextChunkerProvider
+
+# Names re-exported as the public API of the chunkers package.
+__all__ = [
+    "BaseTextChunker",
+    "TextChunk",
+    "CharacterBasedTextChunker",
+    "TextChunkerProvider",
+]
+
@@ -0,0 +1,146 @@
+"""Abstract base class for text chunking strategies."""
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Callable, List
+
+if TYPE_CHECKING:
+    from presidio_analyzer import RecognizerResult
+
+
+@dataclass
+class TextChunk:
+    """A piece of a longer text together with its offsets in that text.
+
+    The offsets follow Python slicing conventions: ``start`` is inclusive
+    and ``end`` is exclusive.
+
+    :param text: The chunk content
+    :param start: Start position in the original text (inclusive)
+    :param end: End position in the original text (exclusive)
+    """
+
+    text: str
+    start: int
+    end: int
+
+
+class BaseTextChunker(ABC):
+    """Common interface and shared helpers for text chunking strategies.
+
+    A concrete chunker only has to implement ``chunk()``, which splits a text
+    into TextChunk objects carrying both content and position information.
+
+    On top of that, this class offers helpers that run a prediction function
+    chunk by chunk, shift the results back to original-text offsets and drop
+    duplicate entities reported from overlapping chunks.
+    """
+
+    @abstractmethod
+    def chunk(self, text: str) -> List[TextChunk]:
+        """Split text into chunks with position information.
+
+        :param text: The input text to split
+        :return: List of TextChunk objects with text and position data
+        """
+
+    def predict_with_chunking(
+        self,
+        text: str,
+        predict_func: Callable[[str], List["RecognizerResult"]],
+    ) -> List["RecognizerResult"]:
+        """Run ``predict_func`` over ``text``, chunking it when necessary.
+
+        No chunks yields an empty list. A single chunk means ``predict_func``
+        is called once on the full ``text``. With several chunks, every chunk
+        is predicted separately, offsets are shifted back to the original
+        text and duplicates are removed.
+
+        :param text: Input text to process
+        :param predict_func: Function that takes text and returns
+            RecognizerResult objects
+        :return: List of RecognizerResult with correct offsets
+        """
+        pieces = self.chunk(text)
+        if not pieces:
+            return []
+
+        if len(pieces) > 1:
+            shifted = self._process_chunks(pieces, predict_func)
+            return self.deduplicate_overlapping_entities(shifted)
+
+        # Exactly one chunk: no offset shifting or deduplication is needed.
+        return predict_func(text)
+
+    def _process_chunks(
+        self,
+        chunks: List[TextChunk],
+        process_func: Callable[[str], List["RecognizerResult"]],
+    ) -> List["RecognizerResult"]:
+        """Run ``process_func`` on each chunk and shift results by chunk start.
+
+        :param chunks: List of TextChunk objects with text and position information
+        :param process_func: Function that takes chunk text and returns
+            RecognizerResult objects
+        :return: List of RecognizerResult with adjusted offsets
+        """
+        from presidio_analyzer import RecognizerResult
+
+        # Build fresh RecognizerResult objects instead of editing the ones
+        # returned by process_func, so the original predictions stay untouched.
+        return [
+            RecognizerResult(
+                entity_type=found.entity_type,
+                start=found.start + piece.start,
+                end=found.end + piece.start,
+                score=found.score,
+                analysis_explanation=found.analysis_explanation,
+                recognition_metadata=found.recognition_metadata,
+            )
+            for piece in chunks
+            for found in process_func(piece.text)
+        ]
+
+    def deduplicate_overlapping_entities(
+        self,
+        predictions: List["RecognizerResult"],
+        overlap_threshold: float = 0.5,
+    ) -> List["RecognizerResult"]:
+        """Drop lower-scored duplicates of entities found in overlapping chunks.
+
+        Two predictions count as duplicates when they share an entity type and
+        their overlap, divided by the length of the shorter one, is greater
+        than ``overlap_threshold``. Predictions are visited from highest to
+        lowest score, so the best-scored one of each duplicate group is kept.
+
+        :param predictions: List of RecognizerResult objects
+        :param overlap_threshold: Overlap ratio threshold to consider duplicates
+            (default: 0.5)
+        :return: Deduplicated list of RecognizerResult sorted by position
+        """
+        if not predictions:
+            return predictions
+
+        def _duplicates(candidate, accepted) -> bool:
+            """Tell whether ``candidate`` repeats the already kept ``accepted``."""
+            if candidate.entity_type != accepted.entity_type:
+                return False
+
+            shared_start = max(candidate.start, accepted.start)
+            shared_end = min(candidate.end, accepted.end)
+            if shared_start >= shared_end:
+                # The spans do not intersect at all.
+                return False
+
+            candidate_len = candidate.end - candidate.start
+            accepted_len = accepted.end - accepted.start
+            if candidate_len <= 0 or accepted_len <= 0:
+                return False
+
+            shorter_len = min(candidate_len, accepted_len)
+            return (shared_end - shared_start) / shorter_len > overlap_threshold
+
+        # Highest score first, so the first prediction seen for a span wins.
+        by_score = sorted(predictions, key=lambda p: p.score, reverse=True)
+
+        kept = []
+        for candidate in by_score:
+            if not any(_duplicates(candidate, accepted) for accepted in kept):
+                kept.append(candidate)
+
+        # Order by position for consistent output.
+        kept.sort(key=lambda p: p.start)
+        return kept
@@ -0,0 +1,123 @@
+"""Character-based text chunker with word boundary preservation.
+
+Based on gliner-spacy implementation:
+https://github.com/theirstory/gliner-spacy/blob/main/gliner_spacy/pipeline.py#L60-L96
+"""
+import logging
+from typing import Iterable, List, Optional, Tuple
+
+from presidio_analyzer.chunkers.base_chunker import BaseTextChunker, TextChunk
+
+# Log under the "presidio-analyzer" logger name.
+logger = logging.getLogger("presidio-analyzer")
+
+
+# Default characters treated as word boundaries when a chunk end is extended
+# (space and newline); used when no custom boundary_chars is supplied.
+WORD_BOUNDARY_CHARS: Tuple[str, ...] = (" ", "\n")
+
+
+class CharacterBasedTextChunker(BaseTextChunker):
+    """Character-based text chunker with word boundary preservation.
+
+    Text is cut into windows of roughly ``chunk_size`` characters. Each window
+    end is pushed forward to the next boundary character so a chunk does not
+    end in the middle of a word, and the following window starts
+    ``chunk_overlap`` characters before that end.
+    """
+
+    def __init__(
+        self,
+        chunk_size: int = 250,
+        chunk_overlap: int = 50,
+        boundary_chars: Optional[Iterable[str]] = None,
+    ):
+        """Initialize the character-based text chunker.
+
+        Note: Chunks may slightly exceed chunk_size to preserve complete words.
+        When this occurs, the actual overlap may vary from the specified value.
+
+        :param chunk_size: Target maximum characters per chunk (must be > 0)
+        :param chunk_overlap: Target characters to overlap between chunks
+            (must be >= 0 and < chunk_size)
+        :param boundary_chars: Characters that count as word boundaries.
+            Defaults to space/newline to keep current behavior.
+        :raises ValueError: If chunk_size is not positive, or chunk_overlap is
+            negative or not smaller than chunk_size
+        """
+        if chunk_size <= 0:
+            logger.error("Invalid chunk_size: %d. Must be greater than 0.", chunk_size)
+            raise ValueError("chunk_size must be greater than 0")
+        if chunk_overlap < 0 or chunk_overlap >= chunk_size:
+            logger.error(
+                "Invalid chunk_overlap. Must be non-negative and less than chunk_size"
+            )
+            raise ValueError(
+                "chunk_overlap must be non-negative and less than chunk_size"
+            )
+
+        self._chunk_size = chunk_size
+        self._chunk_overlap = chunk_overlap
+        # Allow callers to tune boundaries
+        # (e.g., punctuation, tabs) without changing defaults.
+        self._boundary_chars: Tuple[str, ...] = (
+            tuple(boundary_chars) if boundary_chars is not None else WORD_BOUNDARY_CHARS
+        )
+
+    @property
+    def chunk_size(self) -> int:
+        """Get the chunk size.
+
+        :return: The chunk size
+        """
+        return self._chunk_size
+
+    @property
+    def chunk_overlap(self) -> int:
+        """Get the chunk overlap.
+
+        :return: The chunk overlap
+        """
+        return self._chunk_overlap
+
+    @property
+    def boundary_chars(self) -> Tuple[str, ...]:
+        """Get the characters treated as word boundaries when extending chunks.
+
+        :return: The boundary characters
+        """
+        return self._boundary_chars
+
+    def chunk(self, text: str) -> List[TextChunk]:
+        """Split text into overlapping chunks at word boundaries.
+
+        Each chunk end is extended to the next boundary character (space or
+        newline by default) to avoid splitting words, so chunks may slightly
+        exceed chunk_size. For texts without boundary characters (e.g., CJK
+        languages), a chunk may extend to the end of the text.
+
+        :param text: The input text to chunk
+        :return: List of TextChunk objects with text and position information
+        """
+        if not text:
+            logger.debug("Empty text provided, returning empty chunk list")
+            return []
+
+        # Loop invariants, looked up once instead of on every character.
+        text_len = len(text)
+        size = self._chunk_size
+        overlap = self._chunk_overlap
+        boundaries = self._boundary_chars
+
+        logger.debug(
+            "Chunking text: length=%d, chunk_size=%d, overlap=%d",
+            text_len,
+            size,
+            overlap,
+        )
+
+        chunks: List[TextChunk] = []
+        start = 0
+
+        while start < text_len:
+            # Tentative end: a full window, clamped to the end of the text.
+            end = min(start + size, text_len)
+
+            # Extend to complete word boundary (space or newline by default)
+            while end < text_len and text[end] not in boundaries:
+                end += 1
+
+            chunks.append(TextChunk(text=text[start:end], start=start, end=end))
+
+            # Stop once the whole text is covered.
+            if end >= text_len:
+                break
+            # overlap < size (checked in __init__), so start always advances.
+            start = end - overlap
+
+        logger.debug("Created %d chunks from text", len(chunks))
+        return chunks
@@ -0,0 +1,60 @@
+"""Factory provider for creating text chunkers from configuration."""
+
+import logging
+from typing import Any, Dict, Optional, Type
+
+from presidio_analyzer.chunkers.base_chunker import BaseTextChunker
+from presidio_analyzer.chunkers.character_based_text_chunker import (
+    CharacterBasedTextChunker,
+)
+
+logger = logging.getLogger("presidio-analyzer")
+
+# Registry mapping chunker type names (the "chunker_type" configuration
+# value) to the chunker classes they select.
+_CHUNKER_REGISTRY: Dict[str, Type[BaseTextChunker]] = {
+    "character": CharacterBasedTextChunker,
+}
+
+
+class TextChunkerProvider:
+    """Factory that builds a text chunker from a configuration dict.
+
+    :param chunker_configuration: Dict holding ``chunker_type`` plus any
+        keyword arguments for the selected chunker class.
+        Example::
+
+            {"chunker_type": "character", "chunk_size": 300, "chunk_overlap": 75}
+
+    When the configuration is missing or empty, a character-based chunker with
+    chunk_size=250 and chunk_overlap=50 is configured instead.
+    """
+
+    def __init__(
+        self,
+        chunker_configuration: Optional[Dict[str, Any]] = None,
+    ):
+        """Store the given configuration, or the defaults when it is falsy."""
+        if chunker_configuration:
+            self.chunker_configuration = chunker_configuration
+        else:
+            # Default to a safe overlap to avoid boundary losses
+            # for cross-chunk entities.
+            self.chunker_configuration = {
+                "chunker_type": "character",
+                "chunk_size": 250,
+                "chunk_overlap": 50,
+            }
+
+    def create_chunker(self) -> BaseTextChunker:
+        """Create a text chunker instance from configuration.
+
+        :return: An instance of the class registered for ``chunker_type``
+            ("character" when the key is absent)
+        :raises ValueError: If ``chunker_type`` is not registered, or the
+            chunker constructor rejects the remaining settings with a TypeError
+        """
+        # Work on a copy so popping chunker_type leaves the stored config intact.
+        config = self.chunker_configuration.copy()
+        chunker_type = config.pop("chunker_type", "character")
+
+        chunker_class = _CHUNKER_REGISTRY.get(chunker_type)
+        if chunker_class is None:
+            raise ValueError(
+                f"Unknown chunker_type '{chunker_type}'. "
+                f"Available: {list(_CHUNKER_REGISTRY.keys())}"
+            )
+
+        try:
+            chunker = chunker_class(**config)
+        except TypeError as exc:
+            raise ValueError(
+                f"Invalid configuration for chunker_type '{chunker_type}': {config}"
+            ) from exc
+        return chunker
+