Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -206,3 +206,10 @@ recognizers:
type: predefined
enabled: false
config_path: presidio-analyzer/presidio_analyzer/conf/langextract_config_ollama.yaml

- name: BasicLangExtractRecognizer
supported_languages:
- en
type: predefined
enabled: false
config_path: presidio_analyzer/conf/langextract_config_basic.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Configurable LangExtract Configuration
# Supports multiple LLM providers via LangExtract's ModelConfig

lm_recognizer:
  # Presidio entity types this recognizer is allowed to return.
  supported_entities:
    - PERSON
    - EMAIL_ADDRESS
    - PHONE_NUMBER
    - US_SSN
    - LOCATION
    - ORGANIZATION
    - DATE_TIME
    - CREDIT_CARD
    - IP_ADDRESS
    - URL

  # Extraction labels produced by the LLM that should be discarded
  # rather than mapped to an entity.
  labels_to_ignore:
    - payment_status
    - metadata
    - annotation

  enable_generic_consolidation: true
  # Results scoring below this confidence are dropped.
  min_score: 0.5

langextract:
  # Jinja2 prompt template and few-shot examples used for extraction.
  prompt_file: "presidio_analyzer/conf/langextract_prompts/default_pii_phi_prompt.j2"
  examples_file: "presidio_analyzer/conf/langextract_prompts/default_pii_phi_examples.yaml"

  model:
    model_id: "gpt-4o"
    provider:
      name: "openai"
      # Extra keyword arguments forwarded to the provider client.
      kwargs:
        base_url: "https://api.openai.com/v1"
        # api_key: "API_KEY_GOES_HERE" or set env LANGEXTRACT_API_KEY

  # Maps raw extraction labels (lowercase) to Presidio entity types.
  entity_mappings:
    person: PERSON
    name: PERSON
    email: EMAIL_ADDRESS
    phone: PHONE_NUMBER
    ssn: US_SSN
    location: LOCATION
    address: LOCATION
    organization: ORGANIZATION
    date: DATE_TIME
    credit_card: CREDIT_CARD
    ip_address: IP_ADDRESS
    url: URL
2 changes: 2 additions & 0 deletions presidio-analyzer/presidio_analyzer/llm_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
extract_lm_config,
get_supported_entities,
lx,
lx_factory,
)
from .prompt_loader import load_file_from_conf, load_prompt_file, render_jinja_template

Expand All @@ -52,6 +53,7 @@
"extract_lm_config",
"get_supported_entities",
"lx",
"lx_factory",
"load_file_from_conf",
"load_prompt_file",
"render_jinja_template",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@

try:
import langextract as lx
import langextract.factory as lx_factory
except ImportError:
lx = None
lx_factory = None

__all__ = [
"lx",
"lx_factory",
"check_langextract_available",
"extract_lm_config",
"get_supported_entities",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@
)
from .third_party.langextract_recognizer import LangExtractRecognizer
from .third_party.ollama_langextract_recognizer import OllamaLangExtractRecognizer
from .third_party.basic_langextract_recognizer import BasicLangExtractRecognizer

PREDEFINED_RECOGNIZERS = [
"PhoneRecognizer",
Expand Down Expand Up @@ -159,4 +160,5 @@
"LangExtractRecognizer",
"AzureOpenAILangExtractRecognizer",
"OllamaLangExtractRecognizer",
"BasicLangExtractRecognizer",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import logging
import os
from pathlib import Path
from typing import Optional

from presidio_analyzer.llm_utils import lx, lx_factory
from presidio_analyzer.predefined_recognizers.third_party.\
langextract_recognizer import LangExtractRecognizer

logger = logging.getLogger("presidio-analyzer")

class BasicLangExtractRecognizer(LangExtractRecognizer):
    """Basic LangExtract recognizer using a configurable LLM backend.

    Reads model/provider settings from a YAML config file and builds a
    ``langextract`` ``ModelConfig`` so any provider supported by
    LangExtract's factory (OpenAI-compatible, etc.) can be used.
    """

    # Packaged default config: presidio_analyzer/conf/langextract_config_basic.yaml
    DEFAULT_CONFIG_PATH = (
        Path(__file__).parent.parent.parent / "conf" / "langextract_config_basic.yaml"
    )

    def __init__(
        self,
        config_path: Optional[str] = None,
        supported_language: str = "en",
        context: Optional[list] = None
    ):
        """Initialize Basic LangExtract recognizer.

        :param config_path: Path to configuration file (optional).
        :param supported_language: Language this recognizer supports
            (optional, default: "en").
        :param context: List of context words
            (optional, currently not used by LLM recognizers).
        :raises ValueError: If the loaded configuration is missing
            ``model.model_id`` or ``model.provider.name``.
        """
        actual_config_path = (
            config_path if config_path else str(self.DEFAULT_CONFIG_PATH)
        )

        super().__init__(
            config_path=actual_config_path,
            name="Basic LangExtract PII",
            supported_language=supported_language
        )

        # self.config is loaded by the base class from the YAML file.
        model_config = self.config.get("model", {})
        provider_config = model_config.get("provider", {})
        self.model_id = model_config.get("model_id")
        self.provider = provider_config.get("name")
        self.provider_kwargs = provider_config.get("kwargs", {})
        # Fail fast at construction time on misconfiguration instead of
        # surfacing an obscure error on the first analyze() call.
        if not self.model_id:
            raise ValueError("Configuration must contain 'model_id'")
        if not self.provider:
            raise ValueError("Configuration must contain 'provider'")

        # OpenAI-style chat backends tend to wrap JSON output in Markdown
        # fences, so fencing defaults to on for them unless the config
        # overrides it explicitly.
        self.fence_output = model_config.get(
            "fence_output", "openai" in self.provider.lower()
        )
        self.use_schema_constraints = model_config.get(
            "use_schema_constraints", False
        )

        # Fall back to the LANGEXTRACT_API_KEY environment variable when
        # the config does not supply an api_key directly.
        if (
            "api_key" not in self.provider_kwargs
            and "LANGEXTRACT_API_KEY" in os.environ
        ):
            self.provider_kwargs["api_key"] = os.environ["LANGEXTRACT_API_KEY"]

        self.lx_model_config = lx_factory.ModelConfig(
            model_id=self.model_id,
            provider=self.provider,
            provider_kwargs=self.provider_kwargs,
        )

    def _get_provider_params(self):
        """Return provider parameters for the configured LLM backend.

        :return: Dict with the LangExtract ``ModelConfig`` plus output
            fencing and schema-constraint flags derived from the config.
        """
        # NOTE(review): previous docstring said "Azure OpenAI-specific
        # params" — a copy-paste from the Azure recognizer; this class is
        # provider-agnostic.
        return {
            "config": self.lx_model_config,
            "fence_output": self.fence_output,
            "use_schema_constraints": self.use_schema_constraints,
        }
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import logging
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
Expand Down Expand Up @@ -57,11 +58,13 @@ def __init__(
conf_file=conf_file, registry_configuration=registry_configuration
)

self.configuration = (
ConfigurationValidator.validate_recognizer_registry_configuration(
self.configuration
if os.environ.get("PRESIDIO_ENABLE_SCHEMA_VALIDATION", "").lower() == "true":
self.configuration = (
ConfigurationValidator.validate_recognizer_registry_configuration(
self.configuration
)
)
)

self.nlp_engine = nlp_engine

def create_recognizer_registry(self) -> RecognizerRegistry:
Expand Down
Loading
Loading