Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,14 @@ dependencies = [
"llama-index-vector-stores-chroma>=0.4.1",
"markdown>=3.7",
"openpyxl>=3.1.5",
"pip-system-certs>=4.0",
"pymongo>=4.11.1",
"pystemmer>=2.2.0.3",
"pytest>=8.3.4",
"python-docx>=1.1.2",
"python-dotenv>=1.0.1",
"python-pptx>=1.0.2",
"travel-pii-anonymisation==0.3.3",
"unstructured>=0.14.8",
]

Expand Down
6 changes: 4 additions & 2 deletions src/docs2vecs/subcommands/indexer/config/config_schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ definitions:
schema:
type:
type: string
allowed: ['exporter', 'embedding', 'vector-store', 'uploader', 'splitter', 'integrated_vec', 'file-scanner', 'file-reader', 'loader']
allowed: ['exporter', 'embedding', 'vector-store', 'uploader', 'splitter', 'integrated_vec', 'file-scanner', 'file-reader', 'loader', 'anonymizer']
required: True
name:
type: string
Expand All @@ -21,6 +21,8 @@ definitions:
tag:
type: string
required: False
use_placeholders:
type: boolean
api_url:
type: string
regex: '^http.*'
Expand Down Expand Up @@ -207,4 +209,4 @@ indexer:
required: True
tracker:
type: dict
required: False
required: False
45 changes: 45 additions & 0 deletions src/docs2vecs/subcommands/indexer/skills/anonymizer_skill.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from typing import List, Optional
from tspii.reversible_anonymizers.reversible_anonymizer import ReversibleAnonymizer
from tspii.recognizers.recognizers import create_travel_specific_recognizers
from tspii.operators.faker_operators import create_fake_data_operators
from docs2vecs.subcommands.indexer.config import Config
from docs2vecs.subcommands.indexer.document import Document
from docs2vecs.subcommands.indexer.skills.skill import IndexerSkill


class AnonymizerSkill(IndexerSkill):
    """Indexer skill that rewrites each document's text with detected entities anonymized.

    Configuration keys read from the skill config:
        use_placeholders (bool, default ``True``): when false, the operators
            returned by ``create_fake_data_operators()`` are registered on the
            anonymizer; when true, no extra operators are registered and the
            anonymizer's own defaults apply (presumably placeholder
            substitution -- confirm against the ``tspii`` package).
    """

    def __init__(self, skill_config: dict, global_config: Config) -> None:
        """Initialise the base skill and cache the ``use_placeholders`` option.

        Args:
            skill_config: Raw configuration dictionary for this skill.
            global_config: Indexer-wide configuration object.
        """
        super().__init__(skill_config, global_config)
        # NOTE(review): ``self._config`` is presumably populated by
        # ``IndexerSkill.__init__`` from ``skill_config`` -- confirm.
        # The camelCase attribute name is kept because other code may read it.
        self.usePlaceholders = self._config.get("use_placeholders", True)

    def _build_anonymizer(self) -> ReversibleAnonymizer:
        """Return a fresh anonymizer configured with the travel-specific recognizers."""
        anonymizer = ReversibleAnonymizer()

        for recognizer in create_travel_specific_recognizers():
            anonymizer.add_recognizer(recognizer)

        if not self.usePlaceholders:
            # Fake-data operators are only registered when placeholders are disabled.
            anonymizer.add_operators(create_fake_data_operators())

        return anonymizer

    def run(self, input: Optional[List[Document]] = None) -> List[Document]:
        """Anonymize ``document.text`` in place for every document in ``input``.

        Args:
            input: Documents to process. The parameter name shadows the builtin
                but is kept so keyword callers continue to work.

        Returns:
            The same ``input`` object that was passed in. When ``input`` is
            ``None`` or empty it is returned unchanged, so ``None`` is returned
            for ``None`` input even though the annotation says ``List[Document]``.
        """
        if not input:
            self.logger.info("No documents to anonymize")
            return input

        for document in input:
            # A new anonymizer is built per document, exactly as before.
            # NOTE(review): the recognizer/operator factories are therefore
            # re-invoked for every document; hoisting them out of the loop is
            # only safe if their results can be shared between anonymizers --
            # confirm against the tspii package before changing.
            anonymizer = self._build_anonymizer()

            anonymizer.analyze(document.text)

            # NOTE(review): relies on a private attribute of ReversibleAnonymizer;
            # read once so the check and the log message agree.
            analyzer_results = anonymizer._analyzer_results
            if not analyzer_results:
                self.logger.info(
                    f"No entities to anonymize found in the current document ({document.filename})."
                )
                continue

            self.logger.info(
                f"Found {len(analyzer_results)} entities to anonymize in the current document ({document.filename})."
            )
            document.text = anonymizer.anonymize().text

        # The count is the number of documents processed, not the number changed.
        self.logger.info(f"Successfully anonymized {len(input)} documents")
        return input
50 changes: 39 additions & 11 deletions src/docs2vecs/subcommands/indexer/skills/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,38 @@

from docs2vecs.subcommands.indexer.config import Config
from docs2vecs.subcommands.indexer.db.mongodb import MongoDbConnection
from docs2vecs.subcommands.indexer.skills.ada002_embedding_skill import AzureAda002EmbeddingSkill
from docs2vecs.subcommands.indexer.skills.azure_blob_store_uploader_skill import AzureBlobStoreUploaderSkill
from docs2vecs.subcommands.indexer.skills.azure_vector_store_skill import AzureVectorStoreSkill
from docs2vecs.subcommands.indexer.skills.chromadb_vector_store_skill import ChromaDBVectorStoreSkill
from docs2vecs.subcommands.indexer.skills.ada002_embedding_skill import (
AzureAda002EmbeddingSkill,
)
from docs2vecs.subcommands.indexer.skills.azure_blob_store_uploader_skill import (
AzureBlobStoreUploaderSkill,
)
from docs2vecs.subcommands.indexer.skills.azure_vector_store_skill import (
AzureVectorStoreSkill,
)
from docs2vecs.subcommands.indexer.skills.chromadb_vector_store_skill import (
ChromaDBVectorStoreSkill,
)
from docs2vecs.subcommands.indexer.skills.default_file_reader import DefaultFileReader
from docs2vecs.subcommands.indexer.skills.document_intelligence_skill import AzureDocumentIntelligenceSkill
from docs2vecs.subcommands.indexer.skills.document_intelligence_skill import (
AzureDocumentIntelligenceSkill,
)
from docs2vecs.subcommands.indexer.skills.file_scanner_skill import FileScannerSkill
from docs2vecs.subcommands.indexer.skills.jira_loader_skill import JiraLoaderSkill
from docs2vecs.subcommands.indexer.skills.llama_fastembed_embedding_skill import LlamaFastembedEmbeddingSkill
from docs2vecs.subcommands.indexer.skills.recursive_character_splitter_skill import RecursiveCharacterTextSplitter
from docs2vecs.subcommands.indexer.skills.scrollwordexporter_skill import ScrollWorldExporterSkill
from docs2vecs.subcommands.indexer.skills.semantic_splitter_skill import SemanticSplitter
from docs2vecs.subcommands.indexer.skills.llama_fastembed_embedding_skill import (
LlamaFastembedEmbeddingSkill,
)
from docs2vecs.subcommands.indexer.skills.recursive_character_splitter_skill import (
RecursiveCharacterTextSplitter,
)
from docs2vecs.subcommands.indexer.skills.scrollwordexporter_skill import (
ScrollWorldExporterSkill,
)
from docs2vecs.subcommands.indexer.skills.semantic_splitter_skill import (
SemanticSplitter,
)
from docs2vecs.subcommands.indexer.skills.tracker import VectorStoreTracker
from docs2vecs.subcommands.indexer.skills.anonymizer_skill import AnonymizerSkill


class SkillType(StrEnum):
Expand All @@ -26,6 +45,7 @@ class SkillType(StrEnum):
UPLOADER = "uploader"
SPLITTER = "splitter"
LOADER = "loader"
ANONYMIZER = "anonymizer"


class AvailableSkillName(StrEnum):
Expand Down Expand Up @@ -57,6 +77,9 @@ class AvailableSkillName(StrEnum):
# web loaders
JIRA_LOADER = "jira-loader"

# anonymizers
ANONYMIZER_SKILL = "anonymizer"


AVAILABLE_SKILLS = {
SkillType.EXPORTER: {
Expand All @@ -81,6 +104,7 @@ class AvailableSkillName(StrEnum):
AvailableSkillName.RECURSIVE_CHARACTER_SPLITTER: RecursiveCharacterTextSplitter,
},
SkillType.LOADER: {AvailableSkillName.JIRA_LOADER: JiraLoaderSkill},
SkillType.ANONYMIZER: {AvailableSkillName.ANONYMIZER_SKILL: AnonymizerSkill},
}


Expand All @@ -90,9 +114,13 @@ def get_skill(cls, skill_config_dict: dict, global_config: Config):
try:
skill_type = SkillType(skill_config_dict["type"])
avail_skill_name = AvailableSkillName(skill_config_dict["name"])
return AVAILABLE_SKILLS[skill_type][avail_skill_name](skill_config_dict, global_config)
return AVAILABLE_SKILLS[skill_type][avail_skill_name](
skill_config_dict, global_config
)
except ValueError as error:
raise ValueError(f"Unknown skill of type: {skill_config_dict['type']}, and name: {skill_config_dict['name']}") from error
raise ValueError(
f"Unknown skill of type: {skill_config_dict['type']}, and name: {skill_config_dict['name']}"
) from error


class TrackerFactory:
Expand Down
Loading
Loading