diff --git a/pyproject.toml b/pyproject.toml
index 4df6392..f287be6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -94,6 +94,7 @@ streamlit = [
 "Bug Tracker" = "https://github.com/Intugle/data-tools/issues"
 
 [project.scripts]
+intugle = "intugle.cli:main"
 intugle-mcp = "intugle.mcp.server:main"
 intugle-streamlit = "intugle.cli:run_streamlit_app"
diff --git a/src/intugle/__init__.py b/src/intugle/__init__.py
index 272058c..ed7fb5e 100644
--- a/src/intugle/__init__.py
+++ b/src/intugle/__init__.py
@@ -1,3 +1,14 @@
 from intugle.analysis.models import DataSet as DataSet
 from intugle.data_product import DataProduct as DataProduct
 from intugle.semantic_model import SemanticModel as SemanticModel
+
+__all__ = ["DataSet", "DataProduct", "SemanticModel"]
+
+# Expose text processor for unstructured text-to-semantic conversion
+try:
+    from intugle.text_processor import TextToSemanticProcessor  # noqa: F401
+
+    __all__.append("TextToSemanticProcessor")
+except ImportError:
+    # Text processor dependencies might not be available
+    pass
diff --git a/src/intugle/cli.py b/src/intugle/cli.py
index ab126a9..a4c9c39 100644
--- a/src/intugle/cli.py
+++ b/src/intugle/cli.py
@@ -1,6 +1,16 @@
+import argparse
 import importlib.util
+import json
+import logging
 import os
 import subprocess
+import sys
+
+# Setup basic logging for CLI commands
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
 
 
 def run_streamlit_app():
@@ -30,7 +40,7 @@ def run_streamlit_app():
     # Get the absolute path to the main.py of the Streamlit app
     app_dir = os.path.join(os.path.dirname(__file__), 'streamlit_app')
     app_path = os.path.join(app_dir, 'main.py')
-    
+
     # Ensure the app_path exists
     if not os.path.exists(app_path):
         print(f"Error: Streamlit app not found at {app_path}")
@@ -41,5 +51,109 @@ def run_streamlit_app():
     subprocess.run(["streamlit", "run", app_path], cwd=app_dir)
 
 
+def run_text_to_semantic(args):
+    """Execute the text-to-semantic conversion command."""
+    try:
+        from intugle.text_processor import TextToSemanticProcessor
+    except ImportError as e:
+        logger.error("Text processor not available.")
+        logger.error(f"Error: {e}")
+        sys.exit(1)
+
+    try:
+        # Read input text
+        if args.input == "-":
+            text = sys.stdin.read()
+        else:
+            with open(args.input, "r", encoding="utf-8") as f:
+                text = f.read()
+
+        logger.info(f"Processing text of length {len(text)} characters")
+
+        # Initialize processor
+        processor = TextToSemanticProcessor(
+            model=args.model,
+            output_format=args.format,
+        )
+
+        # Parse text to RDF
+        logger.info("Extracting entities and relationships...")
+        rdf_graph = processor.parse(text)
+
+        logger.info(
+            f"Extracted {len(rdf_graph.entities)} entities, "
+            f"{len(rdf_graph.relationships)} relationships, "
+            f"{len(rdf_graph.triples)} triples"
+        )
+
+        # Output results
+        if args.output_format == "turtle":
+            output = rdf_graph.to_turtle()
+        elif args.output_format == "json-ld":
+            output = json.dumps(rdf_graph.to_json_ld(), indent=2)
+        else:  # json
+            output = json.dumps(rdf_graph.to_dict(), indent=2)
+
+        if args.output:
+            os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
+            with open(args.output, "w", encoding="utf-8") as f:
+                f.write(output)
+            logger.info(f"Output written to: {args.output}")
+        else:
+            print(output)
+
+        logger.info("Text-to-semantic conversion complete.")
+
+    except Exception as e:
+        logger.error(f"Job failed: {str(e)}")
+        sys.exit(1)
+
+
+def main():
+    """Main entry point for the intugle CLI."""
+    parser = argparse.ArgumentParser(
+        description="Intugle - GenAI-powered semantic layer toolkit"
+    )
+    subparsers = parser.add_subparsers(dest="command", help="Available commands")
+
+    # Streamlit command
+    subparsers.add_parser("streamlit", help="Launch the Streamlit web application")
+
+    # Text-to-semantic command
+    text_parser = subparsers.add_parser(
+        "text-to-semantic", help="Convert unstructured text to RDF/semantic triples"
+    )
+    text_parser.add_argument(
+        "--input", "-i", required=True,
+        help="Input text file path (use '-' for stdin)"
+    )
+    text_parser.add_argument(
+        "--output", "-o",
+        help="Output file path (prints to stdout if not specified)"
+    )
+    text_parser.add_argument(
+        "--model", "-m", default="gpt-4o-mini",
+        help="LLM model for extraction (default: gpt-4o-mini)"
+    )
+    text_parser.add_argument(
+        "--format", "-f", choices=["rdf", "rdf_star"], default="rdf_star",
+        help="RDF format: 'rdf' or 'rdf_star' (default: rdf_star)"
+    )
+    text_parser.add_argument(
+        "--output-format", choices=["json", "turtle", "json-ld"], default="json",
+        help="Output format: json, turtle, or json-ld (default: json)"
+    )
+
+    args = parser.parse_args()
+
+    if args.command == "streamlit":
+        run_streamlit_app()
+    elif args.command == "text-to-semantic":
+        run_text_to_semantic(args)
+    else:
+        parser.print_help()
+
+
 if __name__ == "__main__":
-    run_streamlit_app()
+    main()
+
diff --git a/src/intugle/semantic_model.py b/src/intugle/semantic_model.py
index 31d1af0..20cbf93 100644
--- a/src/intugle/semantic_model.py
+++ b/src/intugle/semantic_model.py
@@ -195,6 +195,55 @@ def search(self, query: str):
             log.error(f"Could not perform semantic search: {e}")
             raise e
 
+    def overlay(self, rdf_graph: Any, match_threshold: float = 0.85) -> "SemanticModel":
+        """
+        Overlay an RDF graph from unstructured text onto this semantic model.
+
+        Maps extracted entities and relationships from the RDF graph to existing
+        semantic nodes, enabling integration of text-derived knowledge.
+
+        Args:
+            rdf_graph: An RDFGraph instance from TextToSemanticProcessor.
+            match_threshold: Minimum similarity score for entity matching (0.0-1.0).
+
+        Returns:
+            Self for method chaining.
+        """
+        from intugle.text_processor.mapper import SemanticMapper
+
+        console.print(
+            f"Overlaying RDF graph with {len(rdf_graph.entities)} entities...",
+            style="yellow",
+        )
+
+        mapper = SemanticMapper(match_threshold=match_threshold)
+        mapping_results = mapper.map_to_semantic_model(rdf_graph, self)
+
+        # Store mapping results for later use
+        if not hasattr(self, "_text_mappings"):
+            self._text_mappings = []
+        self._text_mappings.extend(mapping_results)
+
+        # Generate suggestions for new nodes
+        new_suggestions = mapper.suggest_new_nodes(mapping_results)
+        if new_suggestions:
+            console.print(
+                f"Found {len(new_suggestions)} unmapped entities that could be new concepts.",
+                style="yellow",
+            )
+
+        matched = sum(1 for r in mapping_results if not r.is_new)
+        console.print(
+            f"Overlay complete: {matched}/{len(mapping_results)} entities matched to existing nodes.",
+            style="bold green",
+        )
+
+        return self
+
+    def get_text_mappings(self) -> list:
+        """Get all text-to-semantic mappings from overlay operations."""
+        return getattr(self, "_text_mappings", [])
+
     def deploy(self, target: str, **kwargs):
         """
         Deploys the semantic model to a specified target platform based on the persisted YAML files.
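
[Reviewer note] A minimal usage sketch of the new overlay API, illustrative only and
not part of the diff. It assumes an already-built SemanticModel `sm` (constructor
arguments elided) and LLM credentials in the environment:

    from intugle import SemanticModel, TextToSemanticProcessor

    processor = TextToSemanticProcessor(model="gpt-4o-mini", output_format="rdf_star")
    rdf_graph = processor.parse("Invoice 123 was issued by Vendor A for $5,400.")

    sm = SemanticModel(...)  # an existing, already-profiled semantic model
    sm.overlay(rdf_graph, match_threshold=0.85)

    # Inspect how text-derived entities were matched to existing columns
    for mapping in sm.get_text_mappings():
        print(mapping.to_dict())
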
diff --git a/src/intugle/text_processor/__init__.py b/src/intugle/text_processor/__init__.py
new file mode 100644
index 0000000..6e228bf
--- /dev/null
+++ b/src/intugle/text_processor/__init__.py
@@ -0,0 +1,17 @@
+"""
+Unstructured Text Processor Module.
+
+Provides functionality to convert unstructured text into RDF triples
+and map them to the existing Semantic Model.
+"""
+
+from intugle.text_processor.models import Entity, RDFGraph, RDFTriple, Relationship
+from intugle.text_processor.processor import TextToSemanticProcessor
+
+__all__ = [
+    "TextToSemanticProcessor",
+    "RDFTriple",
+    "RDFGraph",
+    "Entity",
+    "Relationship",
+]
diff --git a/src/intugle/text_processor/extractors/__init__.py b/src/intugle/text_processor/extractors/__init__.py
new file mode 100644
index 0000000..07a3962
--- /dev/null
+++ b/src/intugle/text_processor/extractors/__init__.py
@@ -0,0 +1,6 @@
+"""Extractors subpackage for NLP backends."""
+
+from intugle.text_processor.extractors.base import BaseExtractor
+from intugle.text_processor.extractors.llm_extractor import LLMExtractor
+
+__all__ = ["BaseExtractor", "LLMExtractor"]
diff --git a/src/intugle/text_processor/extractors/base.py b/src/intugle/text_processor/extractors/base.py
new file mode 100644
index 0000000..491e9fb
--- /dev/null
+++ b/src/intugle/text_processor/extractors/base.py
@@ -0,0 +1,57 @@
+"""Base extractor interface for pluggable NLP backends."""
+
+from abc import ABC, abstractmethod
+from typing import List, Tuple
+
+from intugle.text_processor.models import Entity, Relationship
+
+
+class BaseExtractor(ABC):
+    """
+    Abstract base class for text extractors.
+
+    Implementations should extract entities and relationships from text.
+    """
+
+    @abstractmethod
+    def extract_entities(self, text: str) -> List[Entity]:
+        """
+        Extract named entities from text.
+
+        Args:
+            text: Input text to process.
+
+        Returns:
+            List of extracted Entity objects.
+        """
+        pass
+
+    @abstractmethod
+    def extract_relationships(
+        self, text: str, entities: List[Entity]
+    ) -> List[Relationship]:
+        """
+        Extract relationships between entities.
+
+        Args:
+            text: Input text to process.
+            entities: Previously extracted entities.
+
+        Returns:
+            List of extracted Relationship objects.
+        """
+        pass
+
+    def extract(self, text: str) -> Tuple[List[Entity], List[Relationship]]:
+        """
+        Full extraction pipeline: entities then relationships.
+
+        Args:
+            text: Input text to process.
+
+        Returns:
+            Tuple of (entities, relationships).
+        """
+        entities = self.extract_entities(text)
+        relationships = self.extract_relationships(text, entities)
+        return entities, relationships
diff --git a/src/intugle/text_processor/extractors/llm_extractor.py b/src/intugle/text_processor/extractors/llm_extractor.py
new file mode 100644
index 0000000..0c4d97a
--- /dev/null
+++ b/src/intugle/text_processor/extractors/llm_extractor.py
@@ -0,0 +1,201 @@
+"""
+LLM-based extractor using LangChain infrastructure.
+
+Uses the existing LangChain setup in intugle for entity and relationship extraction.
+""" + +import hashlib +import logging + +from typing import List + +from pydantic import BaseModel, Field + +from intugle.text_processor.extractors.base import BaseExtractor +from intugle.text_processor.models import Entity, Relationship + +log = logging.getLogger(__name__) + + +class ExtractedEntity(BaseModel): + """Schema for LLM-extracted entity.""" + + text: str = Field(..., description="The entity text as it appears in the document") + label: str = Field( + ..., + description="Entity type: PERSON, ORGANIZATION, LOCATION, DATE, MONEY, PRODUCT, DOCUMENT, or OTHER", + ) + normalized_id: str = Field( + ..., description="A normalized identifier (e.g., 'Invoice_123' from 'Invoice 123')" + ) + + +class ExtractedRelationship(BaseModel): + """Schema for LLM-extracted relationship.""" + + subject: str = Field(..., description="The normalized_id of the subject entity") + predicate: str = Field( + ..., + description="The relationship type in camelCase (e.g., 'hasAmount', 'issuedBy', 'locatedIn')", + ) + object: str = Field( + ..., description="The normalized_id of the object entity OR a literal value" + ) + + +class ExtractionResult(BaseModel): + """Combined extraction result from LLM.""" + + entities: List[ExtractedEntity] = Field(default_factory=list) + relationships: List[ExtractedRelationship] = Field(default_factory=list) + + +class LLMExtractor(BaseExtractor): + """ + LLM-based entity and relationship extractor. + + Uses LangChain structured output for reliable extraction. + Supports multiple LLM providers via LLM_PROVIDER environment variable. + """ + + def __init__(self, model_name: str = "gpt-4o-mini"): + """ + Initialize the LLM extractor. + + Args: + model_name: Name of the LLM model to use. + """ + self.model_name = model_name + self._llm = None + + def _get_llm(self): + """Lazy initialization of LLM based on provider.""" + if self._llm is None: + import os + provider = os.environ.get("LLM_PROVIDER", "openai").lower() + + if provider == "google-genai" or self.model_name.startswith("gemini"): + from langchain_google_genai import ChatGoogleGenerativeAI + model = self.model_name if self.model_name.startswith("gemini") else "gemini-2.5-flash" + self._llm = ChatGoogleGenerativeAI(model=model, temperature=0) + else: + from langchain_openai import ChatOpenAI + self._llm = ChatOpenAI(model=self.model_name, temperature=0) + + return self._llm + + def _generate_entity_id(self, text: str, label: str) -> str: + """Generate a unique ID for an entity.""" + content = f"{label}:{text}".lower() + return hashlib.md5(content.encode()).hexdigest()[:8] + + def extract_entities(self, text: str) -> List[Entity]: + """Extract entities using LLM.""" + llm = self._get_llm() + + prompt = f"""Extract all named entities from the following text. 
+For each entity, identify:
+- The exact text as it appears
+- The entity type (PERSON, ORGANIZATION, LOCATION, DATE, MONEY, PRODUCT, DOCUMENT, or OTHER)
+- A normalized identifier (convert spaces to underscores, remove special chars)
+
+Text:
+{text}
+
+Extract entities:"""
+
+        structured_llm = llm.with_structured_output(ExtractionResult)
+        result: ExtractionResult = structured_llm.invoke(prompt)
+
+        entities = []
+        for ext in result.entities:
+            entity = Entity(
+                id=ext.normalized_id or self._generate_entity_id(ext.text, ext.label),
+                text=ext.text,
+                label=ext.label,
+                confidence=0.9,
+            )
+            entities.append(entity)
+
+        log.info(f"Extracted {len(entities)} entities from text")
+        return entities
+
+    def extract_relationships(
+        self, text: str, entities: List[Entity]
+    ) -> List[Relationship]:
+        """Extract relationships between entities using the LLM."""
+        if not entities:
+            return []
+
+        llm = self._get_llm()
+
+        entity_list = "\n".join([f"- {e.id} ({e.label}): {e.text}" for e in entities])
+
+        prompt = f"""Given the following text and extracted entities, identify relationships between them.
+
+Text:
+{text}
+
+Entities:
+{entity_list}
+
+For each relationship, specify:
+- subject: The normalized_id of the subject entity
+- predicate: The relationship type in camelCase (e.g., 'hasAmount', 'issuedBy', 'worksFor')
+- object: The normalized_id of the object entity OR a literal value
+
+Extract relationships:"""
+
+        structured_llm = llm.with_structured_output(ExtractionResult)
+        result: ExtractionResult = structured_llm.invoke(prompt)
+
+        relationships = []
+        # Known entity ids; objects may legitimately be literals, so only
+        # subjects are checked against the extracted entities
+        known_ids = {e.id for e in entities}
+
+        for rel in result.relationships:
+            if rel.subject not in known_ids:
+                log.warning(f"Relationship subject '{rel.subject}' is not a known entity id")
+            relationship = Relationship(
+                subject_id=rel.subject,
+                predicate=rel.predicate,
+                object_id=rel.object,
+                confidence=0.85,
+            )
+            relationships.append(relationship)
+
+        log.info(f"Extracted {len(relationships)} relationships from text")
+        return relationships
+
+    def extract_all(self, text: str) -> ExtractionResult:
+        """
+        Single-pass extraction of both entities and relationships.
+
+        More efficient than separate calls as it uses one LLM invocation.
+        """
+        llm = self._get_llm()
+
+        prompt = f"""Analyze the following text and extract:
+1. All named entities (PERSON, ORGANIZATION, LOCATION, DATE, MONEY, PRODUCT, DOCUMENT, or OTHER)
+2. All relationships between entities
+
+For entities:
+- text: The exact text as it appears
+- label: The entity type
+- normalized_id: A normalized identifier (e.g., 'Invoice_123' from 'Invoice 123')
+
+For relationships:
+- subject: The normalized_id of the subject entity
+- predicate: The relationship type in camelCase (e.g., 'hasAmount', 'issuedBy')
+- object: The normalized_id of the object entity OR a literal value
+
+Text:
+{text}
+
+Extract:"""
+
+        structured_llm = llm.with_structured_output(ExtractionResult)
+        result: ExtractionResult = structured_llm.invoke(prompt)
+
+        log.info(
+            f"Single-pass extraction: {len(result.entities)} entities, "
+            f"{len(result.relationships)} relationships"
+        )
+        return result
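
[Reviewer note] Because extractors are pluggable via BaseExtractor, the pipeline can
be exercised without an LLM. A deterministic toy backend, purely illustrative:

    import re
    from typing import List

    from intugle.text_processor.extractors.base import BaseExtractor
    from intugle.text_processor.models import Entity, Relationship


    class RegexInvoiceExtractor(BaseExtractor):
        """Toy extractor that only recognizes 'Invoice <number>' mentions."""

        def extract_entities(self, text: str) -> List[Entity]:
            return [
                Entity(id=f"Invoice_{m.group(1)}", text=m.group(0), label="DOCUMENT")
                for m in re.finditer(r"Invoice (\d+)", text)
            ]

        def extract_relationships(
            self, text: str, entities: List[Entity]
        ) -> List[Relationship]:
            return []  # the toy backend extracts no relationships

Passing extractor=RegexInvoiceExtractor() to TextToSemanticProcessor swaps the LLM
out entirely, which is also how the unit tests below isolate the pipeline.
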
+""" + +import logging + +from difflib import SequenceMatcher +from typing import Any, Dict, List, Optional, Tuple + +from intugle.text_processor.models import Entity, RDFGraph + +log = logging.getLogger(__name__) + + +class MappingResult: + """Result of mapping an entity to a semantic node.""" + + def __init__( + self, + entity: Entity, + matched_table: Optional[str] = None, + matched_column: Optional[str] = None, + confidence: float = 0.0, + is_new: bool = False, + ): + self.entity = entity + self.matched_table = matched_table + self.matched_column = matched_column + self.confidence = confidence + self.is_new = is_new + + def to_dict(self) -> Dict[str, Any]: + return { + "entity_id": self.entity.id, + "entity_text": self.entity.text, + "entity_label": self.entity.label, + "matched_table": self.matched_table, + "matched_column": self.matched_column, + "confidence": self.confidence, + "is_new": self.is_new, + } + + +class SemanticMapper: + """ + Maps RDF graph entities to existing SemanticModel nodes. + + Uses string similarity and optional embeddings for matching. + """ + + def __init__( + self, + match_threshold: float = 0.85, + use_embeddings: bool = False, + ): + """ + Initialize the semantic mapper. + + Args: + match_threshold: Minimum similarity score for a match (0.0-1.0). + use_embeddings: Whether to use embedding-based matching. + """ + self.match_threshold = match_threshold + self.use_embeddings = use_embeddings + + def _string_similarity(self, s1: str, s2: str) -> float: + """Calculate string similarity using SequenceMatcher.""" + s1_lower = s1.lower().replace("_", " ") + s2_lower = s2.lower().replace("_", " ") + return SequenceMatcher(None, s1_lower, s2_lower).ratio() + + def _find_best_match( + self, + entity: Entity, + candidates: List[Tuple[str, str]], # (table_name, column_name) + ) -> Optional[Tuple[str, str, float]]: + """ + Find the best matching candidate for an entity. + + Args: + entity: The entity to match. + candidates: List of (table_name, column_name) tuples to match against. + + Returns: + Tuple of (table_name, column_name, confidence) or None if no match. + """ + best_match = None + best_score = 0.0 + + for table_name, column_name in candidates: + # Compare against column name + score = self._string_similarity(entity.text, column_name) + + # Also try the entity ID + id_score = self._string_similarity(entity.id, column_name) + score = max(score, id_score) + + if score > best_score: + best_score = score + best_match = (table_name, column_name, score) + + if best_match and best_match[2] >= self.match_threshold: + return best_match + return None + + def map_to_semantic_model( + self, + rdf_graph: RDFGraph, + semantic_model: Any, # SemanticModel type + ) -> List[MappingResult]: + """ + Map RDF graph entities to a SemanticModel. + + Args: + rdf_graph: The RDF graph to map. + semantic_model: The target SemanticModel instance. + + Returns: + List of MappingResult objects describing the mappings. 
+ """ + results = [] + + # Extract candidate columns from semantic model datasets + candidates = [] + for dataset_name, dataset in semantic_model.datasets.items(): + if hasattr(dataset, "source") and hasattr(dataset.source, "table"): + for col in dataset.source.table.columns: + candidates.append((dataset_name, col.name)) + + log.info(f"Mapping {len(rdf_graph.entities)} entities against {len(candidates)} candidates") + + for entity in rdf_graph.entities: + match = self._find_best_match(entity, candidates) + + if match: + table_name, column_name, confidence = match + result = MappingResult( + entity=entity, + matched_table=table_name, + matched_column=column_name, + confidence=confidence, + is_new=False, + ) + log.debug( + f"Matched entity '{entity.id}' to {table_name}.{column_name} " + f"(confidence: {confidence:.2f})" + ) + else: + result = MappingResult( + entity=entity, + is_new=True, + ) + log.debug(f"No match found for entity '{entity.id}' - marked as new") + + results.append(result) + + matched = sum(1 for r in results if not r.is_new) + log.info(f"Mapping complete: {matched}/{len(results)} entities matched") + + return results + + def suggest_new_nodes( + self, + mapping_results: List[MappingResult], + ) -> List[Dict[str, Any]]: + """ + Generate suggestions for new semantic nodes from unmapped entities. + + Args: + mapping_results: Results from map_to_semantic_model. + + Returns: + List of suggested new node definitions. + """ + suggestions = [] + + for result in mapping_results: + if result.is_new: + suggestion = { + "suggested_name": result.entity.id, + "entity_type": result.entity.label, + "original_text": result.entity.text, + "suggested_table_name": f"text_extracted_{result.entity.label.lower()}", + } + suggestions.append(suggestion) + + return suggestions diff --git a/src/intugle/text_processor/models.py b/src/intugle/text_processor/models.py new file mode 100644 index 0000000..7329af5 --- /dev/null +++ b/src/intugle/text_processor/models.py @@ -0,0 +1,124 @@ +""" +Pydantic models for RDF triples and graph representation. + +Supports RDF-star annotations for provenance and confidence metadata. +""" + +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel, Field + + +class Entity(BaseModel): + """Represents an extracted entity from text.""" + + id: str = Field(..., description="Unique identifier for the entity") + text: str = Field(..., description="Original text of the entity") + label: str = Field(..., description="Entity type/label (e.g., PERSON, ORG, MONEY)") + start_char: Optional[int] = Field(None, description="Start character position in source text") + end_char: Optional[int] = Field(None, description="End character position in source text") + confidence: float = Field(1.0, description="Confidence score for the entity extraction") + attributes: Dict[str, Any] = Field(default_factory=dict, description="Additional entity attributes") + + +class Relationship(BaseModel): + """Represents a relationship between two entities.""" + + subject_id: str = Field(..., description="ID of the subject entity") + predicate: str = Field(..., description="Relationship type/predicate") + object_id: str = Field(..., description="ID of the object entity") + confidence: float = Field(1.0, description="Confidence score for the relationship") + + +class RDFTriple(BaseModel): + """ + Represents an RDF triple (subject, predicate, object). + + Supports RDF-star annotations via the metadata field for provenance, + confidence, and other contextual information. 
+ """ + + subject: str = Field(..., description="Subject of the triple") + predicate: str = Field(..., description="Predicate/relationship of the triple") + object: str = Field(..., description="Object of the triple") + object_type: str = Field("literal", description="Type of object: 'uri' or 'literal'") + metadata: Dict[str, Any] = Field( + default_factory=dict, + description="RDF-star annotations (provenance, confidence, source, etc.)", + ) + + def to_turtle(self) -> str: + """Convert triple to Turtle format.""" + obj = f'"{self.object}"' if self.object_type == "literal" else f"<{self.object}>" + return f"<{self.subject}> <{self.predicate}> {obj} ." + + def to_ntriples(self) -> str: + """Convert triple to N-Triples format.""" + return self.to_turtle() + + +class RDFGraph(BaseModel): + """ + Represents a collection of RDF triples forming a graph. + + Includes extracted entities, relationships, and the resulting triples. + """ + + entities: List[Entity] = Field(default_factory=list, description="Extracted entities") + relationships: List[Relationship] = Field(default_factory=list, description="Extracted relationships") + triples: List[RDFTriple] = Field(default_factory=list, description="RDF triples") + source_text: Optional[str] = Field(None, description="Original source text") + metadata: Dict[str, Any] = Field( + default_factory=dict, + description="Graph-level metadata (processing info, model used, etc.)", + ) + + def add_triple( + self, + subject: str, + predicate: str, + obj: str, + object_type: str = "literal", + metadata: Optional[Dict[str, Any]] = None, + ) -> "RDFGraph": + """Add a triple to the graph.""" + triple = RDFTriple( + subject=subject, + predicate=predicate, + object=obj, + object_type=object_type, + metadata=metadata or {}, + ) + self.triples.append(triple) + return self + + def get_entity_by_id(self, entity_id: str) -> Optional[Entity]: + """Retrieve an entity by its ID.""" + for entity in self.entities: + if entity.id == entity_id: + return entity + return None + + def to_turtle(self) -> str: + """Export graph to Turtle format.""" + lines = ["@prefix ex: .", ""] + for triple in self.triples: + lines.append(triple.to_turtle()) + return "\n".join(lines) + + def to_json_ld(self) -> Dict[str, Any]: + """Export graph to JSON-LD format.""" + return { + "@context": {"ex": "http://example.org/"}, + "@graph": [ + { + "@id": t.subject, + t.predicate: {"@value": t.object} if t.object_type == "literal" else {"@id": t.object}, + } + for t in self.triples + ], + } + + def to_dict(self) -> Dict[str, Any]: + """Export graph as a dictionary.""" + return self.model_dump() diff --git a/src/intugle/text_processor/processor.py b/src/intugle/text_processor/processor.py new file mode 100644 index 0000000..e457401 --- /dev/null +++ b/src/intugle/text_processor/processor.py @@ -0,0 +1,136 @@ +""" +Main TextToSemanticProcessor class. + +High-level orchestrator for converting unstructured text to RDF graphs. +""" + +import logging + +from typing import Any, Dict, Literal, Optional + +from intugle.text_processor.extractors.base import BaseExtractor +from intugle.text_processor.extractors.llm_extractor import LLMExtractor +from intugle.text_processor.models import Entity, RDFGraph, Relationship +from intugle.text_processor.rdf.builder import RDFBuilder + +log = logging.getLogger(__name__) + + +class TextToSemanticProcessor: + """ + High-level orchestrator for converting unstructured text to RDF graphs. 
diff --git a/src/intugle/text_processor/processor.py b/src/intugle/text_processor/processor.py
new file mode 100644
index 0000000..e457401
--- /dev/null
+++ b/src/intugle/text_processor/processor.py
@@ -0,0 +1,136 @@
+"""
+Main TextToSemanticProcessor class.
+
+High-level orchestrator for converting unstructured text to RDF graphs.
+"""
+
+import logging
+
+from typing import Any, Dict, Literal, Optional
+
+from intugle.text_processor.extractors.base import BaseExtractor
+from intugle.text_processor.extractors.llm_extractor import LLMExtractor
+from intugle.text_processor.models import Entity, RDFGraph, Relationship
+from intugle.text_processor.rdf.builder import RDFBuilder
+
+log = logging.getLogger(__name__)
+
+
+class TextToSemanticProcessor:
+    """
+    High-level orchestrator for converting unstructured text to RDF graphs.
+
+    Provides a simple API for text-to-semantic conversion as specified in the
+    feature requirements.
+
+    Example:
+        >>> processor = TextToSemanticProcessor(model="gpt-4o-mini", output_format="rdf_star")
+        >>> rdf_graph = processor.parse(text_input)
+    """
+
+    def __init__(
+        self,
+        model: str = "gpt-4o-mini",
+        output_format: Literal["rdf", "rdf_star"] = "rdf_star",
+        extractor: Optional[BaseExtractor] = None,
+        namespace: Optional[str] = None,
+    ):
+        """
+        Initialize the text processor.
+
+        Args:
+            model: Name of the LLM model for extraction (e.g., 'gpt-4o-mini', 'en_core_web_lg').
+            output_format: Output format - 'rdf' for standard RDF, 'rdf_star' for RDF with annotations.
+            extractor: Optional custom extractor instance. If not provided, uses LLMExtractor.
+            namespace: Base namespace URI for generated entities.
+        """
+        self.model = model
+        self.output_format = output_format
+        self.namespace = namespace
+
+        # Use the provided extractor or default to the LLM-based one
+        if extractor is not None:
+            self.extractor = extractor
+        else:
+            self.extractor = LLMExtractor(model_name=model)
+
+        # Configure the RDF builder based on output format
+        include_provenance = output_format == "rdf_star"
+        self.rdf_builder = RDFBuilder(
+            namespace=namespace,
+            include_provenance=include_provenance,
+        )
+
+        log.info(f"TextToSemanticProcessor initialized with model={model}, format={output_format}")
+
+    def parse(self, text: str, metadata: Optional[Dict[str, Any]] = None) -> RDFGraph:
+        """
+        Parse unstructured text and convert it to an RDF graph.
+
+        This is the main entry point for text-to-RDF conversion.
+
+        Args:
+            text: Input text to process (e.g., from OCR, documents, etc.).
+            metadata: Optional metadata to include in the graph (e.g., source document ID).
+
+        Returns:
+            RDFGraph containing extracted entities, relationships, and triples.
+        """
+        log.info(f"Processing text of length {len(text)}")
+
+        # Extract entities and relationships
+        entities, relationships = self.extractor.extract(text)
+
+        # Build the RDF graph; copy the metadata so the caller's dict is not mutated
+        graph_metadata = dict(metadata or {})
+        graph_metadata["model"] = self.model
+        graph_metadata["output_format"] = self.output_format
+
+        rdf_graph = self.rdf_builder.build(
+            entities=entities,
+            relationships=relationships,
+            source_text=text,
+            metadata=graph_metadata,
+        )
+
+        log.info(
+            f"Parsed text into RDF graph: {len(entities)} entities, "
+            f"{len(relationships)} relationships, {len(rdf_graph.triples)} triples"
+        )
+
+        return rdf_graph
+
+    def extract_entities(self, text: str) -> list[Entity]:
+        """
+        Extract only entities from text (no relationships).
+
+        Args:
+            text: Input text to process.
+
+        Returns:
+            List of extracted Entity objects.
+        """
+        return self.extractor.extract_entities(text)
+
+    def extract_relationships(
+        self, text: str, entities: list[Entity]
+    ) -> list[Relationship]:
+        """
+        Extract relationships between known entities.
+
+        Args:
+            text: Input text to process.
+            entities: Pre-extracted entities.
+
+        Returns:
+            List of extracted Relationship objects.
+ """ + return self.extractor.extract_relationships(text, entities) + + def export_turtle(self, rdf_graph: RDFGraph) -> str: + """Export RDF graph to Turtle format.""" + return rdf_graph.to_turtle() + + def export_json_ld(self, rdf_graph: RDFGraph) -> Dict[str, Any]: + """Export RDF graph to JSON-LD format.""" + return rdf_graph.to_json_ld() diff --git a/src/intugle/text_processor/rdf/__init__.py b/src/intugle/text_processor/rdf/__init__.py new file mode 100644 index 0000000..6046453 --- /dev/null +++ b/src/intugle/text_processor/rdf/__init__.py @@ -0,0 +1,5 @@ +"""RDF subpackage for graph building and serialization.""" + +from intugle.text_processor.rdf.builder import RDFBuilder + +__all__ = ["RDFBuilder"] diff --git a/src/intugle/text_processor/rdf/builder.py b/src/intugle/text_processor/rdf/builder.py new file mode 100644 index 0000000..38a136d --- /dev/null +++ b/src/intugle/text_processor/rdf/builder.py @@ -0,0 +1,165 @@ +""" +RDF Graph Builder. + +Constructs RDF triples from extracted entities and relationships. +""" + +import logging + +from typing import Any, Dict, List, Optional + +from intugle.text_processor.models import Entity, RDFGraph, RDFTriple, Relationship + +log = logging.getLogger(__name__) + + +class RDFBuilder: + """ + Builds RDF graphs from extracted entities and relationships. + + Supports configurable ontology prefixes and RDF-star annotations. + """ + + DEFAULT_NAMESPACE = "http://intugle.ai/ontology/" + + # Standard predicates for entity properties + PREDICATE_TYPE = "rdf:type" + PREDICATE_LABEL = "rdfs:label" + PREDICATE_VALUE = "hasValue" + + def __init__( + self, + namespace: Optional[str] = None, + include_entity_triples: bool = True, + include_provenance: bool = True, + ): + """ + Initialize the RDF builder. + + Args: + namespace: Base namespace URI for generated entities. + include_entity_triples: Whether to generate type/label triples for entities. + include_provenance: Whether to include provenance metadata in triples. 
+ """ + self.namespace = namespace or self.DEFAULT_NAMESPACE + self.include_entity_triples = include_entity_triples + self.include_provenance = include_provenance + + def _make_uri(self, local_name: str) -> str: + """Create a full URI from a local name.""" + # Clean the local name for URI usage + clean_name = local_name.replace(" ", "_").replace("$", "").replace(",", "") + return f"{self.namespace}{clean_name}" + + def _entity_to_triples(self, entity: Entity) -> List[RDFTriple]: + """Generate RDF triples for an entity.""" + triples = [] + entity_uri = self._make_uri(entity.id) + + # Type triple + triples.append( + RDFTriple( + subject=entity_uri, + predicate=self._make_uri(self.PREDICATE_TYPE), + object=self._make_uri(entity.label), + object_type="uri", + metadata={"confidence": entity.confidence} if self.include_provenance else {}, + ) + ) + + # Label triple + triples.append( + RDFTriple( + subject=entity_uri, + predicate=self._make_uri(self.PREDICATE_LABEL), + object=entity.text, + object_type="literal", + metadata={"confidence": entity.confidence} if self.include_provenance else {}, + ) + ) + + # Additional attribute triples + for attr_key, attr_value in entity.attributes.items(): + triples.append( + RDFTriple( + subject=entity_uri, + predicate=self._make_uri(attr_key), + object=str(attr_value), + object_type="literal", + ) + ) + + return triples + + def _relationship_to_triple( + self, + relationship: Relationship, + entities: Dict[str, Entity], + ) -> RDFTriple: + """Convert a relationship to an RDF triple.""" + subject_uri = self._make_uri(relationship.subject_id) + predicate_uri = self._make_uri(relationship.predicate) + + # Check if object is an entity or a literal + if relationship.object_id in entities: + object_value = self._make_uri(relationship.object_id) + object_type = "uri" + else: + # Treat as literal value + object_value = relationship.object_id + object_type = "literal" + + metadata = {} + if self.include_provenance: + metadata["confidence"] = relationship.confidence + + return RDFTriple( + subject=subject_uri, + predicate=predicate_uri, + object=object_value, + object_type=object_type, + metadata=metadata, + ) + + def build( + self, + entities: List[Entity], + relationships: List[Relationship], + source_text: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> RDFGraph: + """ + Build an RDF graph from entities and relationships. + + Args: + entities: List of extracted entities. + relationships: List of extracted relationships. + source_text: Original source text (for provenance). + metadata: Additional graph-level metadata. + + Returns: + RDFGraph containing all generated triples. 
+ """ + triples = [] + entity_map = {e.id: e for e in entities} + + # Generate entity triples + if self.include_entity_triples: + for entity in entities: + triples.extend(self._entity_to_triples(entity)) + + # Generate relationship triples + for relationship in relationships: + triple = self._relationship_to_triple(relationship, entity_map) + triples.append(triple) + + graph = RDFGraph( + entities=entities, + relationships=relationships, + triples=triples, + source_text=source_text, + metadata=metadata or {}, + ) + + log.info(f"Built RDF graph with {len(triples)} triples") + return graph diff --git a/tests/streamlit_app/test_helper.py b/tests/streamlit_app/test_helper.py index 93d5d61..70eddab 100644 --- a/tests/streamlit_app/test_helper.py +++ b/tests/streamlit_app/test_helper.py @@ -1,14 +1,16 @@ """Tests for streamlit_app/helper.py file reading functions.""" import io -import pytest -import pandas as pd + from unittest.mock import Mock +import pandas as pd +import pytest + from src.intugle.streamlit_app.helper import ( + _read_bytes_to_df_core, read_bytes_to_df, read_file_to_df, - _read_bytes_to_df_core, ) diff --git a/tests/text_processor/__init__.py b/tests/text_processor/__init__.py new file mode 100644 index 0000000..995b374 --- /dev/null +++ b/tests/text_processor/__init__.py @@ -0,0 +1 @@ +# Text processor tests diff --git a/tests/text_processor/test_builder.py b/tests/text_processor/test_builder.py new file mode 100644 index 0000000..bd8e2a3 --- /dev/null +++ b/tests/text_processor/test_builder.py @@ -0,0 +1,114 @@ +"""Tests for the RDF Builder.""" + +from intugle.text_processor.models import Entity, Relationship +from intugle.text_processor.rdf.builder import RDFBuilder + + +class TestRDFBuilder: + def test_builder_initialization(self): + builder = RDFBuilder() + assert builder.namespace == RDFBuilder.DEFAULT_NAMESPACE + assert builder.include_entity_triples is True + assert builder.include_provenance is True + + def test_builder_custom_namespace(self): + builder = RDFBuilder(namespace="http://custom.org/") + assert builder.namespace == "http://custom.org/" + + def test_build_empty(self): + builder = RDFBuilder() + graph = builder.build(entities=[], relationships=[]) + assert len(graph.triples) == 0 + assert len(graph.entities) == 0 + + def test_build_with_entities(self): + builder = RDFBuilder() + entities = [ + Entity(id="Invoice_123", text="Invoice 123", label="DOCUMENT"), + Entity(id="Vendor_A", text="Vendor A", label="ORGANIZATION"), + ] + graph = builder.build(entities=entities, relationships=[]) + + # Each entity should produce type and label triples + assert len(graph.triples) == 4 # 2 entities x 2 triples each + assert len(graph.entities) == 2 + + def test_build_with_relationships(self): + builder = RDFBuilder() + entities = [ + Entity(id="Invoice_123", text="Invoice 123", label="DOCUMENT"), + Entity(id="Vendor_A", text="Vendor A", label="ORGANIZATION"), + ] + relationships = [ + Relationship( + subject_id="Invoice_123", + predicate="issuedBy", + object_id="Vendor_A", + ) + ] + graph = builder.build(entities=entities, relationships=relationships) + + # 4 entity triples + 1 relationship triple + assert len(graph.triples) == 5 + assert len(graph.relationships) == 1 + + def test_build_with_literal_object(self): + builder = RDFBuilder() + entities = [ + Entity(id="Invoice_123", text="Invoice 123", label="DOCUMENT"), + ] + relationships = [ + Relationship( + subject_id="Invoice_123", + predicate="hasAmount", + object_id="5400", # Not an entity ID, should be treated as 
literal + ) + ] + graph = builder.build(entities=entities, relationships=relationships) + + # Find the hasAmount triple + amount_triple = None + for t in graph.triples: + if "hasAmount" in t.predicate: + amount_triple = t + break + + assert amount_triple is not None + assert amount_triple.object_type == "literal" + + def test_build_without_entity_triples(self): + builder = RDFBuilder(include_entity_triples=False) + entities = [ + Entity(id="Invoice_123", text="Invoice 123", label="DOCUMENT"), + ] + relationships = [ + Relationship( + subject_id="Invoice_123", + predicate="hasAmount", + object_id="5400", + ) + ] + graph = builder.build(entities=entities, relationships=relationships) + + # Only the relationship triple, no entity type/label triples + assert len(graph.triples) == 1 + + def test_provenance_metadata(self): + builder = RDFBuilder(include_provenance=True) + entities = [ + Entity(id="Test", text="Test", label="TEST", confidence=0.95), + ] + graph = builder.build(entities=entities, relationships=[]) + + # Check that confidence is in metadata + assert graph.triples[0].metadata.get("confidence") == 0.95 + + def test_no_provenance_metadata(self): + builder = RDFBuilder(include_provenance=False) + entities = [ + Entity(id="Test", text="Test", label="TEST", confidence=0.95), + ] + graph = builder.build(entities=entities, relationships=[]) + + # Metadata should be empty + assert graph.triples[0].metadata == {} diff --git a/tests/text_processor/test_mapper.py b/tests/text_processor/test_mapper.py new file mode 100644 index 0000000..14dbf05 --- /dev/null +++ b/tests/text_processor/test_mapper.py @@ -0,0 +1,123 @@ +"""Tests for the SemanticMapper.""" + +from unittest.mock import MagicMock + +from intugle.text_processor.mapper import MappingResult, SemanticMapper +from intugle.text_processor.models import Entity, RDFGraph + + +class TestSemanticMapper: + def test_mapper_initialization(self): + mapper = SemanticMapper() + assert mapper.match_threshold == 0.85 + assert mapper.use_embeddings is False + + def test_mapper_custom_threshold(self): + mapper = SemanticMapper(match_threshold=0.7) + assert mapper.match_threshold == 0.7 + + def test_string_similarity(self): + mapper = SemanticMapper() + + # Exact match + assert mapper._string_similarity("invoice", "invoice") == 1.0 + + # Case insensitive + assert mapper._string_similarity("Invoice", "invoice") == 1.0 + + # Underscore vs space + assert mapper._string_similarity("invoice_id", "invoice id") == 1.0 + + # Partial match + similarity = mapper._string_similarity("invoice", "invoices") + assert 0.8 < similarity < 1.0 + + def test_map_to_semantic_model_no_match(self): + mapper = SemanticMapper(match_threshold=0.9) + + entity = Entity(id="random_entity", text="Random Thing", label="OTHER") + graph = RDFGraph(entities=[entity]) + + # Mock semantic model with no matching columns + mock_model = MagicMock() + mock_dataset = MagicMock() + mock_column = MagicMock() + mock_column.name = "unrelated_column" + mock_dataset.source.table.columns = [mock_column] + mock_model.datasets = {"table1": mock_dataset} + + results = mapper.map_to_semantic_model(graph, mock_model) + + assert len(results) == 1 + assert results[0].is_new is True + assert results[0].matched_table is None + + def test_map_to_semantic_model_with_match(self): + mapper = SemanticMapper(match_threshold=0.8) + + entity = Entity(id="customer_id", text="Customer ID", label="IDENTIFIER") + graph = RDFGraph(entities=[entity]) + + # Mock semantic model with matching column + mock_model = MagicMock() + 
mock_dataset = MagicMock() + mock_column = MagicMock() + mock_column.name = "customer_id" + mock_dataset.source.table.columns = [mock_column] + mock_model.datasets = {"customers": mock_dataset} + + results = mapper.map_to_semantic_model(graph, mock_model) + + assert len(results) == 1 + assert results[0].is_new is False + assert results[0].matched_table == "customers" + assert results[0].matched_column == "customer_id" + assert results[0].confidence >= 0.8 + + def test_suggest_new_nodes(self): + mapper = SemanticMapper() + + entity = Entity(id="new_concept", text="New Concept", label="CONCEPT") + result = MappingResult(entity=entity, is_new=True) + + suggestions = mapper.suggest_new_nodes([result]) + + assert len(suggestions) == 1 + assert suggestions[0]["suggested_name"] == "new_concept" + assert suggestions[0]["entity_type"] == "CONCEPT" + + def test_suggest_new_nodes_filters_matched(self): + mapper = SemanticMapper() + + matched_entity = Entity(id="matched", text="Matched", label="TEST") + new_entity = Entity(id="new", text="New", label="TEST") + + results = [ + MappingResult(entity=matched_entity, matched_table="t1", matched_column="c1", is_new=False), + MappingResult(entity=new_entity, is_new=True), + ] + + suggestions = mapper.suggest_new_nodes(results) + + # Only the new entity should be suggested + assert len(suggestions) == 1 + assert suggestions[0]["suggested_name"] == "new" + + +class TestMappingResult: + def test_to_dict(self): + entity = Entity(id="test_id", text="Test", label="TEST") + result = MappingResult( + entity=entity, + matched_table="table1", + matched_column="col1", + confidence=0.9, + is_new=False, + ) + + d = result.to_dict() + + assert d["entity_id"] == "test_id" + assert d["matched_table"] == "table1" + assert d["confidence"] == 0.9 + assert d["is_new"] is False diff --git a/tests/text_processor/test_models.py b/tests/text_processor/test_models.py new file mode 100644 index 0000000..d86716e --- /dev/null +++ b/tests/text_processor/test_models.py @@ -0,0 +1,117 @@ +"""Tests for text processor Pydantic models.""" + +from intugle.text_processor.models import Entity, RDFGraph, RDFTriple, Relationship + + +class TestEntity: + def test_entity_creation(self): + entity = Entity(id="Invoice_123", text="Invoice 123", label="DOCUMENT") + assert entity.id == "Invoice_123" + assert entity.text == "Invoice 123" + assert entity.label == "DOCUMENT" + assert entity.confidence == 1.0 + + def test_entity_with_attributes(self): + entity = Entity( + id="Amount_5400", + text="$5,400", + label="MONEY", + attributes={"currency": "USD", "value": 5400}, + ) + assert entity.attributes["currency"] == "USD" + assert entity.attributes["value"] == 5400 + + +class TestRelationship: + def test_relationship_creation(self): + rel = Relationship( + subject_id="Invoice_123", + predicate="hasAmount", + object_id="Amount_5400", + ) + assert rel.subject_id == "Invoice_123" + assert rel.predicate == "hasAmount" + assert rel.object_id == "Amount_5400" + assert rel.confidence == 1.0 + + +class TestRDFTriple: + def test_triple_creation(self): + triple = RDFTriple( + subject="http://example.org/Invoice_123", + predicate="http://example.org/hasAmount", + object="5400", + object_type="literal", + ) + assert triple.subject == "http://example.org/Invoice_123" + assert triple.object_type == "literal" + + def test_triple_to_turtle(self): + triple = RDFTriple( + subject="http://example.org/Invoice_123", + predicate="http://example.org/hasAmount", + object="5400", + object_type="literal", + ) + turtle = 
triple.to_turtle() + assert "" in turtle + assert '"5400"' in turtle + + def test_triple_with_uri_object(self): + triple = RDFTriple( + subject="http://example.org/Invoice_123", + predicate="http://example.org/issuedBy", + object="http://example.org/Vendor_A", + object_type="uri", + ) + turtle = triple.to_turtle() + assert "" in turtle + + +class TestRDFGraph: + def test_graph_creation(self): + graph = RDFGraph() + assert len(graph.entities) == 0 + assert len(graph.triples) == 0 + + def test_add_triple(self): + graph = RDFGraph() + graph.add_triple( + subject="http://example.org/Invoice_123", + predicate="http://example.org/hasAmount", + obj="5400", + ) + assert len(graph.triples) == 1 + + def test_to_turtle(self): + graph = RDFGraph() + graph.add_triple( + subject="http://example.org/Invoice_123", + predicate="http://example.org/hasAmount", + obj="5400", + ) + turtle = graph.to_turtle() + assert "@prefix" in turtle + assert "http://example.org/Invoice_123" in turtle + + def test_to_json_ld(self): + graph = RDFGraph() + graph.add_triple( + subject="http://example.org/Invoice_123", + predicate="http://example.org/hasAmount", + obj="5400", + ) + json_ld = graph.to_json_ld() + assert "@context" in json_ld + assert "@graph" in json_ld + assert len(json_ld["@graph"]) == 1 + + def test_get_entity_by_id(self): + entity = Entity(id="test_id", text="Test", label="TEST") + graph = RDFGraph(entities=[entity]) + found = graph.get_entity_by_id("test_id") + assert found is not None + assert found.text == "Test" + + not_found = graph.get_entity_by_id("nonexistent") + assert not_found is None diff --git a/tests/text_processor/test_processor.py b/tests/text_processor/test_processor.py new file mode 100644 index 0000000..aaa55e9 --- /dev/null +++ b/tests/text_processor/test_processor.py @@ -0,0 +1,84 @@ +"""Tests for the TextToSemanticProcessor.""" + +from unittest.mock import MagicMock + +from intugle.text_processor.models import Entity, RDFGraph, Relationship +from intugle.text_processor.processor import TextToSemanticProcessor + + +class TestTextToSemanticProcessor: + def test_processor_initialization(self): + processor = TextToSemanticProcessor() + assert processor.model == "gpt-4o-mini" + assert processor.output_format == "rdf_star" + + def test_processor_custom_config(self): + processor = TextToSemanticProcessor( + model="gpt-4", + output_format="rdf", + namespace="http://custom.org/", + ) + assert processor.model == "gpt-4" + assert processor.output_format == "rdf" + assert processor.rdf_builder.namespace == "http://custom.org/" + + def test_processor_with_mock_extractor(self): + # Create mock extractor + mock_extractor = MagicMock() + mock_extractor.extract.return_value = ( + [Entity(id="Invoice_123", text="Invoice 123", label="DOCUMENT")], + [Relationship(subject_id="Invoice_123", predicate="hasAmount", object_id="5400")], + ) + + processor = TextToSemanticProcessor(extractor=mock_extractor) + + result = processor.parse("Invoice 123 for $5,400") + + mock_extractor.extract.assert_called_once() + assert isinstance(result, RDFGraph) + assert len(result.entities) == 1 + assert len(result.relationships) == 1 + + def test_export_turtle(self): + mock_extractor = MagicMock() + mock_extractor.extract.return_value = ( + [Entity(id="Test", text="Test", label="TEST")], + [], + ) + + processor = TextToSemanticProcessor(extractor=mock_extractor) + graph = processor.parse("Test text") + + turtle = processor.export_turtle(graph) + assert "@prefix" in turtle + assert "Test" in turtle + + def 
test_export_json_ld(self): + mock_extractor = MagicMock() + mock_extractor.extract.return_value = ( + [Entity(id="Test", text="Test", label="TEST")], + [], + ) + + processor = TextToSemanticProcessor(extractor=mock_extractor) + graph = processor.parse("Test text") + + json_ld = processor.export_json_ld(graph) + assert "@context" in json_ld + assert "@graph" in json_ld + + def test_metadata_in_graph(self): + mock_extractor = MagicMock() + mock_extractor.extract.return_value = ([], []) + + processor = TextToSemanticProcessor( + model="test-model", + output_format="rdf_star", + extractor=mock_extractor, + ) + + graph = processor.parse("Test", metadata={"source": "test.txt"}) + + assert graph.metadata["model"] == "test-model" + assert graph.metadata["output_format"] == "rdf_star" + assert graph.metadata["source"] == "test.txt"
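
[Reviewer note] The new intugle = "intugle.cli:main" console script can be smoke-tested
without installing the entry point by patching argv. Hypothetical file name; requires
LLM credentials:

    import sys
    from unittest.mock import patch

    from intugle.cli import main

    argv = ["intugle", "text-to-semantic", "-i", "notes.txt", "--output-format", "turtle"]
    with patch.object(sys, "argv", argv):
        main()  # prints Turtle to stdout; exits non-zero on failure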