AmadeusITGroup · OrianeLanfranchi · May 30, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,12 +26,14 @@ dependencies = [
     "llama-index-vector-stores-chroma>=0.4.1",
     "markdown>=3.7",
     "openpyxl>=3.1.5",
+    "pip-system-certs>=4.0",
     "pymongo>=4.11.1",
     "pystemmer>=2.2.0.3",
     "pytest>=8.3.4",
     "python-docx>=1.1.2",
     "python-dotenv>=1.0.1",
     "python-pptx>=1.0.2",
+    "travel-pii-anonymisation==0.3.3",
     "unstructured>=0.14.8",
 ]
 

diff --git a/src/docs2vecs/subcommands/indexer/config/config_schema.yaml b/src/docs2vecs/subcommands/indexer/config/config_schema.yaml
@@ -10,7 +10,7 @@ definitions:
         schema:
           type:
             type: string
-            allowed: ['exporter', 'embedding', 'vector-store', 'uploader', 'splitter', 'integrated_vec', 'file-scanner', 'file-reader', 'loader']
+            allowed: ['exporter', 'embedding', 'vector-store', 'uploader', 'splitter', 'integrated_vec', 'file-scanner', 'file-reader', 'loader', 'anonymizer']
             required: True
           name:
             type: string
@@ -21,6 +21,8 @@ definitions:
               tag:
                 type: string
                 required: False
+              use_placeholders:
+                type: boolean
               api_url:
                 type: string
                 regex: '^http.*'
@@ -207,4 +209,4 @@ indexer:
       required: True
     tracker:
       type: dict
-      required: False
+      required: False
diff --git a/src/docs2vecs/subcommands/indexer/skills/anonymizer_skill.py b/src/docs2vecs/subcommands/indexer/skills/anonymizer_skill.py
@@ -0,0 +1,66 @@
+from typing import List, Optional
+from tspii.reversible_anonymizers.reversible_anonymizer import ReversibleAnonymizer
+from tspii.recognizers.recognizers import create_travel_specific_recognizers
+from tspii.operators.faker_operators import create_fake_data_operators
+from docs2vecs.subcommands.indexer.config import Config
+from docs2vecs.subcommands.indexer.document import Document
+from docs2vecs.subcommands.indexer.skills.skill import IndexerSkill
+
+
+class AnonymizerSkill(IndexerSkill):
+    """Indexer skill that anonymizes the text of documents in place.
+
+    Config key ``use_placeholders`` (default ``True``): when it is false, the
+    operators returned by ``create_fake_data_operators()`` are added to the
+    anonymizer; when it is true, no operators are added.
+    """
+
+    def __init__(self, skill_config: dict, global_config: Config) -> None:
+        """Read ``use_placeholders`` from the skill config (default ``True``)."""
+        super().__init__(skill_config, global_config)
+        self.usePlaceholders = self._config.get("use_placeholders", True)
+
+    def run(self, input: Optional[List[Document]] = None) -> List[Document]:
+        """Anonymize ``document.text`` of every document in ``input``.
+
+        Args:
+            input: Documents to process; their ``text`` is overwritten in place.
+
+        Returns:
+            The same document objects, or an empty list when ``input`` is ``None``.
+        """
+        if not input:
+            self.logger.info("No documents to anonymize")
+            # Honour the declared return type: never hand ``None`` downstream.
+            return [] if input is None else input
+
+        for document in input:
+            # A fresh anonymizer is built for each document.
+            reversible_anonymizer = ReversibleAnonymizer()
+
+            # Add recognizers
+            for recognizer in create_travel_specific_recognizers():
+                reversible_anonymizer.add_recognizer(recognizer)
+
+            if not self.usePlaceholders:
+                # Add fake data operators
+                reversible_anonymizer.add_operators(create_fake_data_operators())
+
+            # Analyze and anonymize the text
+            reversible_anonymizer.analyze(document.text)
+            # NOTE(review): reads a private attribute of ReversibleAnonymizer;
+            # switch to a public accessor if the library provides one.
+            findings = reversible_anonymizer._analyzer_results
+            if findings:
+                self.logger.info(
+                    f"Found {len(findings)} entities to anonymize in the current document ({document.filename})."
+                )
+                result = reversible_anonymizer.anonymize()
+                document.text = result.text
+            else:
+                self.logger.info(
+                    f"No entities to anonymize found in the current document ({document.filename})."
+                )
+
+        self.logger.info(f"Successfully anonymized {len(input)} documents")
+        return input
diff --git a/src/docs2vecs/subcommands/indexer/skills/factory.py b/src/docs2vecs/subcommands/indexer/skills/factory.py
@@ -2,19 +2,38 @@
 
 from docs2vecs.subcommands.indexer.config import Config
 from docs2vecs.subcommands.indexer.db.mongodb import MongoDbConnection
-from docs2vecs.subcommands.indexer.skills.ada002_embedding_skill import AzureAda002EmbeddingSkill
-from docs2vecs.subcommands.indexer.skills.azure_blob_store_uploader_skill import AzureBlobStoreUploaderSkill
-from docs2vecs.subcommands.indexer.skills.azure_vector_store_skill import AzureVectorStoreSkill
-from docs2vecs.subcommands.indexer.skills.chromadb_vector_store_skill import ChromaDBVectorStoreSkill
+from docs2vecs.subcommands.indexer.skills.ada002_embedding_skill import (
+    AzureAda002EmbeddingSkill,
+)
+from docs2vecs.subcommands.indexer.skills.azure_blob_store_uploader_skill import (
+    AzureBlobStoreUploaderSkill,
+)
+from docs2vecs.subcommands.indexer.skills.azure_vector_store_skill import (
+    AzureVectorStoreSkill,
+)
+from docs2vecs.subcommands.indexer.skills.chromadb_vector_store_skill import (
+    ChromaDBVectorStoreSkill,
+)
 from docs2vecs.subcommands.indexer.skills.default_file_reader import DefaultFileReader
-from docs2vecs.subcommands.indexer.skills.document_intelligence_skill import AzureDocumentIntelligenceSkill
+from docs2vecs.subcommands.indexer.skills.document_intelligence_skill import (
+    AzureDocumentIntelligenceSkill,
+)
 from docs2vecs.subcommands.indexer.skills.file_scanner_skill import FileScannerSkill
 from docs2vecs.subcommands.indexer.skills.jira_loader_skill import JiraLoaderSkill
-from docs2vecs.subcommands.indexer.skills.llama_fastembed_embedding_skill import LlamaFastembedEmbeddingSkill
-from docs2vecs.subcommands.indexer.skills.recursive_character_splitter_skill import RecursiveCharacterTextSplitter
-from docs2vecs.subcommands.indexer.skills.scrollwordexporter_skill import ScrollWorldExporterSkill
-from docs2vecs.subcommands.indexer.skills.semantic_splitter_skill import SemanticSplitter
+from docs2vecs.subcommands.indexer.skills.llama_fastembed_embedding_skill import (
+    LlamaFastembedEmbeddingSkill,
+)
+from docs2vecs.subcommands.indexer.skills.recursive_character_splitter_skill import (
+    RecursiveCharacterTextSplitter,
+)
+from docs2vecs.subcommands.indexer.skills.scrollwordexporter_skill import (
+    ScrollWorldExporterSkill,
+)
+from docs2vecs.subcommands.indexer.skills.semantic_splitter_skill import (
+    SemanticSplitter,
+)
 from docs2vecs.subcommands.indexer.skills.tracker import VectorStoreTracker
+from docs2vecs.subcommands.indexer.skills.anonymizer_skill import AnonymizerSkill
 
 
 class SkillType(StrEnum):
@@ -26,6 +45,7 @@ class SkillType(StrEnum):
     UPLOADER = "uploader"
     SPLITTER = "splitter"
     LOADER = "loader"
+    ANONYMIZER = "anonymizer"
 
 
 class AvailableSkillName(StrEnum):
@@ -57,6 +77,9 @@ class AvailableSkillName(StrEnum):
     # web loaders
     JIRA_LOADER = "jira-loader"
 
+    # anonymizers
+    ANONYMIZER_SKILL = "anonymizer"
+
 
 AVAILABLE_SKILLS = {
     SkillType.EXPORTER: {
@@ -81,6 +104,7 @@ class AvailableSkillName(StrEnum):
         AvailableSkillName.RECURSIVE_CHARACTER_SPLITTER: RecursiveCharacterTextSplitter,
     },
     SkillType.LOADER: {AvailableSkillName.JIRA_LOADER: JiraLoaderSkill},
+    SkillType.ANONYMIZER: {AvailableSkillName.ANONYMIZER_SKILL: AnonymizerSkill},
 }
 
 
@@ -90,9 +114,13 @@ def get_skill(cls, skill_config_dict: dict, global_config: Config):
         try:
             skill_type = SkillType(skill_config_dict["type"])
             avail_skill_name = AvailableSkillName(skill_config_dict["name"])
-            return AVAILABLE_SKILLS[skill_type][avail_skill_name](skill_config_dict, global_config)
+            return AVAILABLE_SKILLS[skill_type][avail_skill_name](
+                skill_config_dict, global_config
+            )
         except ValueError as error:
-            raise ValueError(f"Unknown skill of type: {skill_config_dict['type']}, and name: {skill_config_dict['name']}") from error
+            raise ValueError(
+                f"Unknown skill of type: {skill_config_dict['type']}, and name: {skill_config_dict['name']}"
+            ) from error
 
 
 class TrackerFactory: