microsoft · SharonHart · Jan 27, 2026 · Nov 25, 2025 · Nov 25, 2025 · Nov 25, 2025
diff --git a/e2e-tests/requirements.txt b/e2e-tests/requirements.txt
@@ -1,4 +1,4 @@
 requests>=2.32.4
 pytest
--e ../presidio-analyzer[langextract]
+-e ../presidio-analyzer[langextract,gliner]
 -e ../presidio-anonymizer
diff --git a/presidio-analyzer/presidio_analyzer/chunkers/__init__.py b/presidio-analyzer/presidio_analyzer/chunkers/__init__.py
@@ -0,0 +1,13 @@
+"""Text chunking strategies for handling long texts."""
+
+from presidio_analyzer.chunkers.base_chunker import BaseTextChunker, TextChunk
+from presidio_analyzer.chunkers.langchain_text_chunker import LangChainTextChunker
+from presidio_analyzer.chunkers.text_chunker_provider import TextChunkerProvider
+
+__all__ = [
+    "BaseTextChunker",  # abstract base class for chunking strategies
+    "TextChunk",  # chunk text together with its offsets in the original text
+    "LangChainTextChunker",  # chunker backed by langchain-text-splitters
+    "TextChunkerProvider",  # factory creating chunkers from configuration
+]
+
diff --git a/presidio-analyzer/presidio_analyzer/chunkers/base_chunker.py b/presidio-analyzer/presidio_analyzer/chunkers/base_chunker.py
@@ -0,0 +1,140 @@
+"""Abstract base class for text chunking strategies."""
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Callable, List
+
+if TYPE_CHECKING:
+    from presidio_analyzer import RecognizerResult
+
+
+@dataclass
+class TextChunk:
+    """A piece of a longer text together with its location in that text.
+
+    :param text: The chunk content
+    :param start: Start position in the original text (inclusive)
+    :param end: End position in the original text (exclusive)
+    """
+
+    text: str  # chunk content
+    start: int  # inclusive offset into the original text
+    end: int  # exclusive offset into the original text
+
+
+class BaseTextChunker(ABC):
+    """Common interface and shared helpers for text chunking strategies.
+
+    Concrete chunkers implement chunk(), which splits a text into
+    TextChunk objects holding both content and position information.
+
+    On top of that, this class runs predictions chunk by chunk, maps the
+    results back to original offsets and removes overlapping duplicates.
+    """
+
+    @abstractmethod
+    def chunk(self, text: str) -> List[TextChunk]:
+        """Break ``text`` into chunks that remember their original position.
+
+        :param text: The input text to split
+        :return: TextChunk objects holding chunk text and its offsets
+        """
+        ...
+
+    def predict_with_chunking(
+        self,
+        text: str,
+        predict_func: Callable[[str], List["RecognizerResult"]],
+    ) -> List["RecognizerResult"]:
+        """Run ``predict_func`` over ``text``, chunking it when necessary.
+
+        No chunks yields an empty list. A single chunk means ``predict_func``
+        is called once on the full ``text``. Several chunks are predicted
+        separately, shifted back to original offsets and deduplicated.
+
+        :param text: Input text to process
+        :param predict_func: Callable mapping a text to its
+            RecognizerResult objects
+        :return: RecognizerResult objects positioned relative to ``text``
+        """
+        pieces = self.chunk(text)
+        if not pieces:
+            return []
+        if len(pieces) > 1:
+            shifted = self._process_chunks(pieces, predict_func)
+            return self.deduplicate_overlapping_entities(shifted)
+        return predict_func(text)
+
+    def _process_chunks(
+        self,
+        chunks: List[TextChunk],
+        process_func: Callable[[str], List["RecognizerResult"]],
+    ) -> List["RecognizerResult"]:
+        """Predict on every chunk and translate results to original offsets.
+
+        The objects returned by ``process_func`` are shifted in place by the
+        start position of the chunk they were found in.
+
+        :param chunks: TextChunk objects carrying text and position information
+        :param process_func: Callable mapping chunk text to its
+            RecognizerResult objects
+        :return: All RecognizerResult objects, in chunk order, with
+            offsets relative to the original text
+        """
+        collected = []
+        for piece in chunks:
+            found = process_func(piece.text)
+            shift = piece.start
+            for result in found:
+                result.start += shift
+                result.end += shift
+            collected.extend(found)
+        return collected
+
+    def deduplicate_overlapping_entities(
+        self,
+        predictions: List["RecognizerResult"],
+        overlap_threshold: float = 0.5,
+    ) -> List["RecognizerResult"]:
+        """Drop lower-scoring repeats of entities found in overlapping chunks.
+
+        Two results are duplicates when they share an entity type and their
+        overlap, divided by the shorter span's length, exceeds
+        ``overlap_threshold``. Higher scores are considered first, so the
+        best-scoring result of each duplicate group is the one kept.
+
+        :param predictions: List of RecognizerResult objects
+        :param overlap_threshold: Overlap ratio above which two results
+            count as duplicates (default: 0.5)
+        :return: Deduplicated list of RecognizerResult sorted by start position
+        """
+        if not predictions:
+            return predictions
+
+        def is_repeat(candidate, winner) -> bool:
+            """Tell whether ``candidate`` duplicates the kept ``winner``."""
+            if candidate.entity_type != winner.entity_type:
+                return False
+            shared = min(candidate.end, winner.end) - max(
+                candidate.start, winner.start
+            )
+            if shared <= 0:
+                return False
+            candidate_len = candidate.end - candidate.start
+            winner_len = winner.end - winner.start
+            # Defensive guard against malformed, non-positive spans; it also
+            # rules out a division by zero below.
+            if candidate_len <= 0 or winner_len <= 0:
+                return False
+            ratio = shared / min(candidate_len, winner_len)
+            return ratio > overlap_threshold
+
+        # Highest scores first, so the best result of a duplicate group wins.
+        ranked = sorted(predictions, key=lambda p: p.score, reverse=True)
+        survivors = []
+        for candidate in ranked:
+            if not any(is_repeat(candidate, winner) for winner in survivors):
+                survivors.append(candidate)
+
+        # Report in reading order.
+        survivors.sort(key=lambda p: p.start)
+        return survivors
diff --git a/presidio-analyzer/presidio_analyzer/chunkers/langchain_text_chunker.py b/presidio-analyzer/presidio_analyzer/chunkers/langchain_text_chunker.py
@@ -0,0 +1,88 @@
+"""Text chunker using LangChain's RecursiveCharacterTextSplitter.
+
+Requires: pip install langchain-text-splitters
+"""
+
+from typing import List
+
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+from presidio_analyzer.chunkers.base_chunker import BaseTextChunker, TextChunk
+
+
+class LangChainTextChunker(BaseTextChunker):
+    """Text chunker using LangChain's RecursiveCharacterTextSplitter.
+
+    Uses separator hierarchy: paragraph → line → word → char.
+    Requires: pip install langchain-text-splitters
+    """
+
+    def __init__(self, chunk_size: int = 250, chunk_overlap: int = 50):
+        """Initialize the chunker.
+
+        :param chunk_size: Maximum characters per chunk
+        :param chunk_overlap: Characters to overlap between chunks
+        :raises ValueError: If chunk_size is not positive, or chunk_overlap
+            is negative or not smaller than chunk_size
+        """
+        if chunk_size <= 0:
+            raise ValueError("chunk_size must be positive")
+        if chunk_overlap < 0:
+            raise ValueError("chunk_overlap cannot be negative")
+        if chunk_overlap >= chunk_size:
+            raise ValueError("chunk_overlap must be smaller than chunk_size")
+
+        self._chunk_size = chunk_size
+        self._chunk_overlap = chunk_overlap
+        self._splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+        )
+
+    @property
+    def chunk_size(self) -> int:
+        """Get the chunk size."""
+        return self._chunk_size
+
+    @property
+    def chunk_overlap(self) -> int:
+        """Get the chunk overlap."""
+        return self._chunk_overlap
+
+    def chunk(self, text: str) -> List[TextChunk]:
+        """Split text into chunks with position information.
+
+        Offsets are recovered by searching for each chunk in ``text`` from a
+        cursor. The cursor trails the previous chunk's end by at most
+        ``chunk_overlap`` characters and always lies past the previous
+        chunk's start, so consecutive chunks get strictly increasing offsets.
+
+        :param text: The input text to chunk
+        :return: List of TextChunk objects
+        :raises ValueError: If a chunk cannot be located in ``text``
+        """
+        if not text:
+            return []
+
+        chunks = []
+        cursor = 0
+        for chunk_text in self._splitter.split_text(text):
+            # Searching from the cursor keeps repeated chunk texts from
+            # matching an earlier occurrence.
+            offset = text.find(chunk_text, cursor)
+            if offset == -1:
+                raise ValueError(
+                    "Chunk text not found in source; chunking misalignment detected"
+                )
+
+            end = offset + len(chunk_text)
+            chunks.append(TextChunk(text=chunk_text, start=offset, end=end))
+
+            # The next chunk may begin up to chunk_overlap characters before
+            # this one ends, but is assumed to begin after this one starts.
+            # Without the "offset + 1" floor, a chunk no longer than the
+            # overlap would leave the cursor in place, and an identical
+            # following chunk would be assigned this same offset again.
+            cursor = max(offset + 1, end - self._chunk_overlap)
+
+        return chunks
diff --git a/presidio-analyzer/presidio_analyzer/chunkers/text_chunker_provider.py b/presidio-analyzer/presidio_analyzer/chunkers/text_chunker_provider.py
@@ -0,0 +1,59 @@
+"""Factory provider for creating text chunkers from configuration."""
+
+import logging
+from typing import Any, Dict, Optional, Type
+
+from presidio_analyzer.chunkers.base_chunker import BaseTextChunker
+from presidio_analyzer.chunkers.langchain_text_chunker import LangChainTextChunker
+
+logger = logging.getLogger("presidio-analyzer")  # not used elsewhere in this module
+
+# Registry mapping each supported "chunker_type" config value to its class
+_CHUNKER_REGISTRY: Dict[str, Type[BaseTextChunker]] = {
+    "langchain": LangChainTextChunker,  # the only chunker registered here
+}
+
+
+class TextChunkerProvider:
+    """Factory building text chunkers from a configuration dictionary.
+
+    :param chunker_configuration: Dict with chunker_type and optional params.
+        Example::
+
+            {"chunker_type": "langchain", "chunk_size": 300, "chunk_overlap": 75}
+
+    When the configuration is missing or empty, a langchain chunker with
+    chunk_size=250 and chunk_overlap=50 is configured instead.
+    Requires: pip install langchain-text-splitters
+    """
+
+    def __init__(
+        self,
+        chunker_configuration: Optional[Dict[str, Any]] = None,
+    ):
+        # None or an empty dict falls back to the default langchain setup.
+        if not chunker_configuration:
+            chunker_configuration = dict(
+                chunker_type="langchain", chunk_size=250, chunk_overlap=50
+            )
+        self.chunker_configuration = chunker_configuration
+
+    def create_chunker(self) -> BaseTextChunker:
+        """Instantiate the chunker described by the stored configuration.
+
+        :return: A new instance of the class registered for chunker_type
+        :raises ValueError: If chunker_type is not registered, or if the
+            chunker's constructor rejects the options with a TypeError
+        """
+        options = self.chunker_configuration.copy()
+        kind = options.pop("chunker_type", "langchain")
+        if kind not in _CHUNKER_REGISTRY:
+            known = list(_CHUNKER_REGISTRY.keys())
+            raise ValueError(f"Unknown chunker_type '{kind}'. Available: {known}")
+        factory = _CHUNKER_REGISTRY[kind]
+        try:
+            return factory(**options)
+        except TypeError as exc:
+            message = f"Invalid configuration for chunker_type '{kind}': {options}"
+            raise ValueError(message) from exc
+