diff --git a/pyproject.toml b/pyproject.toml
index 4df6392..f287be6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -94,6 +94,7 @@ streamlit = [
"Bug Tracker" = "https://github.com/Intugle/data-tools/issues"
[project.scripts]
+intugle = "intugle.cli:main"
intugle-mcp = "intugle.mcp.server:main"
intugle-streamlit = "intugle.cli:run_streamlit_app"
diff --git a/src/intugle/__init__.py b/src/intugle/__init__.py
index 272058c..ed7fb5e 100644
--- a/src/intugle/__init__.py
+++ b/src/intugle/__init__.py
@@ -1,3 +1,14 @@
from intugle.analysis.models import DataSet as DataSet
from intugle.data_product import DataProduct as DataProduct
from intugle.semantic_model import SemanticModel as SemanticModel
+
+__all__ = ["DataSet", "DataProduct", "SemanticModel"]
+
+# Expose text processor for unstructured text-to-semantic conversion
+try:
+ from intugle.text_processor import TextToSemanticProcessor # noqa: F401
+
+ __all__.append("TextToSemanticProcessor")
+except ImportError:
+ # Text processor dependencies might not be available
+ pass
diff --git a/src/intugle/cli.py b/src/intugle/cli.py
index ab126a9..a4c9c39 100644
--- a/src/intugle/cli.py
+++ b/src/intugle/cli.py
@@ -1,6 +1,16 @@
+import argparse
import importlib.util
+import json
+import logging
import os
import subprocess
+import sys
+
+# Setup basic logging for CLI commands
+logging.basicConfig(
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)


def run_streamlit_app():
@@ -30,7 +40,7 @@ def run_streamlit_app():
    # Get the absolute path to the main.py of the Streamlit app
    app_dir = os.path.join(os.path.dirname(__file__), 'streamlit_app')
    app_path = os.path.join(app_dir, 'main.py')
-
+
    # Ensure the app_path exists
    if not os.path.exists(app_path):
        print(f"Error: Streamlit app not found at {app_path}")
@@ -41,5 +51,109 @@ def run_streamlit_app():
    subprocess.run(["streamlit", "run", app_path], cwd=app_dir)


+def run_text_to_semantic(args):
+ """Execute the text-to-semantic conversion command."""
+ try:
+ from intugle.text_processor import TextToSemanticProcessor
+ except ImportError as e:
+ logger.error("Text processor not available.")
+ logger.error(f"Error: {e}")
+ sys.exit(1)
+
+ try:
+ # Read input text
+ if args.input == "-":
+ text = sys.stdin.read()
+ else:
+ with open(args.input, "r", encoding="utf-8") as f:
+ text = f.read()
+
+ logger.info(f"Processing text of length {len(text)} characters")
+
+ # Initialize processor
+ processor = TextToSemanticProcessor(
+ model=args.model,
+ output_format=args.format,
+ )
+
+ # Parse text to RDF
+ logger.info("Extracting entities and relationships...")
+ rdf_graph = processor.parse(text)
+
+ logger.info(
+ f"Extracted {len(rdf_graph.entities)} entities, "
+ f"{len(rdf_graph.relationships)} relationships, "
+ f"{len(rdf_graph.triples)} triples"
+ )
+
+ # Output results
+ if args.output_format == "turtle":
+ output = rdf_graph.to_turtle()
+ elif args.output_format == "json-ld":
+ output = json.dumps(rdf_graph.to_json_ld(), indent=2)
+ else: # json
+ output = json.dumps(rdf_graph.to_dict(), indent=2)
+
+ if args.output:
+ os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
+ with open(args.output, "w", encoding="utf-8") as f:
+ f.write(output)
+ logger.info(f"Output written to: {args.output}")
+ else:
+ print(output)
+
+ logger.info("Text-to-semantic conversion complete.")
+
+ except Exception as e:
+ logger.error(f"Job failed: {str(e)}")
+ sys.exit(1)
+
+
+def main():
+ """Main entry point for the intugle CLI."""
+ parser = argparse.ArgumentParser(
+ description="Intugle - GenAI-powered semantic layer toolkit"
+ )
+ subparsers = parser.add_subparsers(dest="command", help="Available commands")
+
+ # Streamlit command
+ subparsers.add_parser("streamlit", help="Launch the Streamlit web application")
+
+ # Text-to-semantic command
+ text_parser = subparsers.add_parser(
+ "text-to-semantic", help="Convert unstructured text to RDF/semantic triples"
+ )
+ text_parser.add_argument(
+ "--input", "-i", required=True,
+ help="Input text file path (use '-' for stdin)"
+ )
+ text_parser.add_argument(
+ "--output", "-o",
+ help="Output file path (prints to stdout if not specified)"
+ )
+ text_parser.add_argument(
+ "--model", "-m", default="gpt-4o-mini",
+ help="LLM model for extraction (default: gpt-4o-mini)"
+ )
+ text_parser.add_argument(
+ "--format", "-f", choices=["rdf", "rdf_star"], default="rdf_star",
+ help="RDF format: 'rdf' or 'rdf_star' (default: rdf_star)"
+ )
+ text_parser.add_argument(
+ "--output-format", choices=["json", "turtle", "json-ld"], default="json",
+ help="Output format: json, turtle, or json-ld (default: json)"
+ )
+
+ args = parser.parse_args()
+
+ if args.command == "streamlit":
+ run_streamlit_app()
+ elif args.command == "text-to-semantic":
+ run_text_to_semantic(args)
+ else:
+ parser.print_help()
+
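+# Illustrative invocations (a sketch, assuming the `intugle` console script
+# from pyproject.toml is installed; flags as defined in main() above):
+#   intugle text-to-semantic -i invoice.txt -o out.ttl --output-format turtle
+#   cat notes.txt | intugle text-to-semantic -i - --format rdf
+#   intugle streamlit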
+
if __name__ == "__main__":
- run_streamlit_app()
+ main()
+
diff --git a/src/intugle/semantic_model.py b/src/intugle/semantic_model.py
index 31d1af0..20cbf93 100644
--- a/src/intugle/semantic_model.py
+++ b/src/intugle/semantic_model.py
@@ -195,6 +195,55 @@ def search(self, query: str):
log.error(f"Could not perform semantic search: {e}")
raise e
+ def overlay(self, rdf_graph: Any, match_threshold: float = 0.85) -> "SemanticModel":
+ """
+ Overlay an RDF graph from unstructured text onto this semantic model.
+
+ Maps extracted entities and relationships from the RDF graph to existing
+ semantic nodes, enabling integration of text-derived knowledge.
+
+ Args:
+ rdf_graph: An RDFGraph instance from TextToSemanticProcessor.
+ match_threshold: Minimum similarity score for entity matching (0.0-1.0).
+
+ Returns:
+ Self for method chaining.
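+
+        Example (illustrative sketch; assumes a built SemanticModel ``sm``
+        and a TextToSemanticProcessor ``processor``):
+            >>> rdf_graph = processor.parse("Invoice 123 was issued by Vendor A.")
+            >>> mappings = sm.overlay(rdf_graph, match_threshold=0.8).get_text_mappings()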
+ """
+ from intugle.text_processor.mapper import SemanticMapper
+
+ console.print(
+ f"Overlaying RDF graph with {len(rdf_graph.entities)} entities...",
+ style="yellow",
+ )
+
+ mapper = SemanticMapper(match_threshold=match_threshold)
+ mapping_results = mapper.map_to_semantic_model(rdf_graph, self)
+
+ # Store mapping results for later use
+ if not hasattr(self, "_text_mappings"):
+ self._text_mappings = []
+ self._text_mappings.extend(mapping_results)
+
+ # Generate suggestions for new nodes
+ new_suggestions = mapper.suggest_new_nodes(mapping_results)
+ if new_suggestions:
+ console.print(
+ f"Found {len(new_suggestions)} unmapped entities that could be new concepts.",
+ style="yellow",
+ )
+
+ matched = sum(1 for r in mapping_results if not r.is_new)
+ console.print(
+ f"Overlay complete: {matched}/{len(mapping_results)} entities matched to existing nodes.",
+ style="bold green",
+ )
+
+ return self
+
+ def get_text_mappings(self) -> list:
+ """Get all text-to-semantic mappings from overlay operations."""
+ return getattr(self, "_text_mappings", [])
+
    def deploy(self, target: str, **kwargs):
        """
        Deploys the semantic model to a specified target platform based on the persisted YAML files.
diff --git a/src/intugle/text_processor/__init__.py b/src/intugle/text_processor/__init__.py
new file mode 100644
index 0000000..6e228bf
--- /dev/null
+++ b/src/intugle/text_processor/__init__.py
@@ -0,0 +1,17 @@
+"""
+Unstructured Text Processor Module.
+
+Provides functionality to convert unstructured text into RDF triples
+and map them to the existing Semantic Model.
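+
+Example (illustrative; requires a configured LLM backend):
+    >>> from intugle.text_processor import TextToSemanticProcessor
+    >>> graph = TextToSemanticProcessor().parse("Invoice 123 was issued by Vendor A.")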
+"""
+
+from intugle.text_processor.models import Entity, RDFGraph, RDFTriple, Relationship
+from intugle.text_processor.processor import TextToSemanticProcessor
+
+__all__ = [
+ "TextToSemanticProcessor",
+ "RDFTriple",
+ "RDFGraph",
+ "Entity",
+ "Relationship",
+]
diff --git a/src/intugle/text_processor/extractors/__init__.py b/src/intugle/text_processor/extractors/__init__.py
new file mode 100644
index 0000000..07a3962
--- /dev/null
+++ b/src/intugle/text_processor/extractors/__init__.py
@@ -0,0 +1,6 @@
+"""Extractors subpackage for NLP backends."""
+
+from intugle.text_processor.extractors.base import BaseExtractor
+from intugle.text_processor.extractors.llm_extractor import LLMExtractor
+
+__all__ = ["BaseExtractor", "LLMExtractor"]
diff --git a/src/intugle/text_processor/extractors/base.py b/src/intugle/text_processor/extractors/base.py
new file mode 100644
index 0000000..491e9fb
--- /dev/null
+++ b/src/intugle/text_processor/extractors/base.py
@@ -0,0 +1,57 @@
+"""Base extractor interface for pluggable NLP backends."""
+
+from abc import ABC, abstractmethod
+from typing import List, Tuple
+
+from intugle.text_processor.models import Entity, Relationship
+
+
+class BaseExtractor(ABC):
+ """
+ Abstract base class for text extractors.
+
+ Implementations should extract entities and relationships from text.
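+
+    Example (a minimal illustrative stub, not a production backend):
+        >>> class TitleCaseExtractor(BaseExtractor):
+        ...     def extract_entities(self, text):
+        ...         return [Entity(id=w, text=w, label="OTHER")
+        ...                 for w in text.split() if w.istitle()]
+        ...     def extract_relationships(self, text, entities):
+        ...         return []
+        >>> entities, rels = TitleCaseExtractor().extract("Alice met Bob")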
+ """
+
+ @abstractmethod
+ def extract_entities(self, text: str) -> List[Entity]:
+ """
+ Extract named entities from text.
+
+ Args:
+ text: Input text to process.
+
+ Returns:
+ List of extracted Entity objects.
+ """
+ pass
+
+ @abstractmethod
+ def extract_relationships(
+ self, text: str, entities: List[Entity]
+ ) -> List[Relationship]:
+ """
+ Extract relationships between entities.
+
+ Args:
+ text: Input text to process.
+ entities: Previously extracted entities.
+
+ Returns:
+ List of extracted Relationship objects.
+ """
+ pass
+
+ def extract(self, text: str) -> Tuple[List[Entity], List[Relationship]]:
+ """
+ Full extraction pipeline: entities then relationships.
+
+ Args:
+ text: Input text to process.
+
+ Returns:
+ Tuple of (entities, relationships).
+ """
+ entities = self.extract_entities(text)
+ relationships = self.extract_relationships(text, entities)
+ return entities, relationships
diff --git a/src/intugle/text_processor/extractors/llm_extractor.py b/src/intugle/text_processor/extractors/llm_extractor.py
new file mode 100644
index 0000000..0c4d97a
--- /dev/null
+++ b/src/intugle/text_processor/extractors/llm_extractor.py
@@ -0,0 +1,201 @@
+"""
+LLM-based extractor using LangChain infrastructure.
+
+Uses the existing LangChain setup in intugle for entity and relationship extraction.
+"""
+
+import hashlib
+import logging
+
+from typing import List
+
+from pydantic import BaseModel, Field
+
+from intugle.text_processor.extractors.base import BaseExtractor
+from intugle.text_processor.models import Entity, Relationship
+
+log = logging.getLogger(__name__)
+
+
+class ExtractedEntity(BaseModel):
+ """Schema for LLM-extracted entity."""
+
+ text: str = Field(..., description="The entity text as it appears in the document")
+ label: str = Field(
+ ...,
+ description="Entity type: PERSON, ORGANIZATION, LOCATION, DATE, MONEY, PRODUCT, DOCUMENT, or OTHER",
+ )
+ normalized_id: str = Field(
+ ..., description="A normalized identifier (e.g., 'Invoice_123' from 'Invoice 123')"
+ )
+
+
+class ExtractedRelationship(BaseModel):
+ """Schema for LLM-extracted relationship."""
+
+ subject: str = Field(..., description="The normalized_id of the subject entity")
+ predicate: str = Field(
+ ...,
+ description="The relationship type in camelCase (e.g., 'hasAmount', 'issuedBy', 'locatedIn')",
+ )
+ object: str = Field(
+ ..., description="The normalized_id of the object entity OR a literal value"
+ )
+
+
+class ExtractionResult(BaseModel):
+ """Combined extraction result from LLM."""
+
+ entities: List[ExtractedEntity] = Field(default_factory=list)
+ relationships: List[ExtractedRelationship] = Field(default_factory=list)
+
+
+class LLMExtractor(BaseExtractor):
+ """
+ LLM-based entity and relationship extractor.
+
+ Uses LangChain structured output for reliable extraction.
+ Supports multiple LLM providers via LLM_PROVIDER environment variable.
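+
+    Example (illustrative; requires provider credentials, e.g. OPENAI_API_KEY):
+        >>> extractor = LLMExtractor(model_name="gpt-4o-mini")
+        >>> entities, relationships = extractor.extract(
+        ...     "Invoice 123 for $5,400 was issued by Vendor A."
+        ... )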
+ """
+
+ def __init__(self, model_name: str = "gpt-4o-mini"):
+ """
+ Initialize the LLM extractor.
+
+ Args:
+ model_name: Name of the LLM model to use.
+ """
+ self.model_name = model_name
+ self._llm = None
+
+ def _get_llm(self):
+ """Lazy initialization of LLM based on provider."""
+ if self._llm is None:
+ import os
+ provider = os.environ.get("LLM_PROVIDER", "openai").lower()
+
+ if provider == "google-genai" or self.model_name.startswith("gemini"):
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ model = self.model_name if self.model_name.startswith("gemini") else "gemini-2.5-flash"
+ self._llm = ChatGoogleGenerativeAI(model=model, temperature=0)
+ else:
+ from langchain_openai import ChatOpenAI
+ self._llm = ChatOpenAI(model=self.model_name, temperature=0)
+
+ return self._llm
+
+ def _generate_entity_id(self, text: str, label: str) -> str:
+ """Generate a unique ID for an entity."""
+ content = f"{label}:{text}".lower()
+ return hashlib.md5(content.encode()).hexdigest()[:8]
+
+ def extract_entities(self, text: str) -> List[Entity]:
+ """Extract entities using LLM."""
+ llm = self._get_llm()
+
+ prompt = f"""Extract all named entities from the following text.
+For each entity, identify:
+- The exact text as it appears
+- The entity type (PERSON, ORGANIZATION, LOCATION, DATE, MONEY, PRODUCT, DOCUMENT, or OTHER)
+- A normalized identifier (convert spaces to underscores, remove special chars)
+
+Text:
+{text}
+
+Extract entities:"""
+
+ structured_llm = llm.with_structured_output(ExtractionResult)
+ result: ExtractionResult = structured_llm.invoke(prompt)
+
+ entities = []
+ for ext in result.entities:
+ entity = Entity(
+ id=ext.normalized_id or self._generate_entity_id(ext.text, ext.label),
+ text=ext.text,
+ label=ext.label,
+ confidence=0.9,
+ )
+ entities.append(entity)
+
+ log.info(f"Extracted {len(entities)} entities from text")
+ return entities
+
+ def extract_relationships(
+ self, text: str, entities: List[Entity]
+ ) -> List[Relationship]:
+ """Extract relationships between entities using LLM."""
+ if not entities:
+ return []
+
+ llm = self._get_llm()
+
+ entity_list = "\n".join([f"- {e.id} ({e.label}): {e.text}" for e in entities])
+
+ prompt = f"""Given the following text and extracted entities, identify relationships between them.
+
+Text:
+{text}
+
+Entities:
+{entity_list}
+
+For each relationship, specify:
+- subject: The normalized_id of the subject entity
+- predicate: The relationship type in camelCase (e.g., 'hasAmount', 'issuedBy', 'worksFor')
+- object: The normalized_id of the object entity OR a literal value
+
+Extract relationships:"""
+
+ structured_llm = llm.with_structured_output(ExtractionResult)
+ result: ExtractionResult = structured_llm.invoke(prompt)
+
+    relationships = []
+    entity_ids = {e.id for e in entities}
+
+    for rel in result.relationships:
+        # Guard against hallucinated subjects: keep only relationships whose
+        # subject is a known entity (objects may be entity IDs or literals).
+        if rel.subject not in entity_ids:
+            log.debug(f"Skipping relationship with unknown subject '{rel.subject}'")
+            continue
+        relationship = Relationship(
+            subject_id=rel.subject,
+            predicate=rel.predicate,
+            object_id=rel.object,
+            confidence=0.85,
+        )
+        relationships.append(relationship)
+
+ log.info(f"Extracted {len(relationships)} relationships from text")
+ return relationships
+
+ def extract_all(self, text: str) -> ExtractionResult:
+ """
+ Single-pass extraction of both entities and relationships.
+
+ More efficient than separate calls as it uses one LLM invocation.
+ """
+ llm = self._get_llm()
+
+ prompt = f"""Analyze the following text and extract:
+1. All named entities (PERSON, ORGANIZATION, LOCATION, DATE, MONEY, PRODUCT, DOCUMENT, or OTHER)
+2. All relationships between entities
+
+For entities:
+- text: The exact text as it appears
+- label: The entity type
+- normalized_id: A normalized identifier (e.g., 'Invoice_123' from 'Invoice 123')
+
+For relationships:
+- subject: The normalized_id of the subject entity
+- predicate: The relationship type in camelCase (e.g., 'hasAmount', 'issuedBy')
+- object: The normalized_id of the object entity OR a literal value
+
+Text:
+{text}
+
+Extract:"""
+
+ structured_llm = llm.with_structured_output(ExtractionResult)
+ result: ExtractionResult = structured_llm.invoke(prompt)
+
+ log.info(
+ f"Single-pass extraction: {len(result.entities)} entities, "
+ f"{len(result.relationships)} relationships"
+ )
+ return result
diff --git a/src/intugle/text_processor/mapper.py b/src/intugle/text_processor/mapper.py
new file mode 100644
index 0000000..25c529a
--- /dev/null
+++ b/src/intugle/text_processor/mapper.py
@@ -0,0 +1,189 @@
+"""
+Semantic Mapper for aligning RDF graphs with SemanticModel.
+
+Maps extracted entities and relationships to existing semantic nodes.
+"""
+
+import logging
+
+from difflib import SequenceMatcher
+from typing import Any, Dict, List, Optional, Tuple
+
+from intugle.text_processor.models import Entity, RDFGraph
+
+log = logging.getLogger(__name__)
+
+
+class MappingResult:
+ """Result of mapping an entity to a semantic node."""
+
+ def __init__(
+ self,
+ entity: Entity,
+ matched_table: Optional[str] = None,
+ matched_column: Optional[str] = None,
+ confidence: float = 0.0,
+ is_new: bool = False,
+ ):
+ self.entity = entity
+ self.matched_table = matched_table
+ self.matched_column = matched_column
+ self.confidence = confidence
+ self.is_new = is_new
+
+ def to_dict(self) -> Dict[str, Any]:
+ return {
+ "entity_id": self.entity.id,
+ "entity_text": self.entity.text,
+ "entity_label": self.entity.label,
+ "matched_table": self.matched_table,
+ "matched_column": self.matched_column,
+ "confidence": self.confidence,
+ "is_new": self.is_new,
+ }
+
+
+class SemanticMapper:
+ """
+ Maps RDF graph entities to existing SemanticModel nodes.
+
+ Uses string similarity and optional embeddings for matching.
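+
+    Example (illustrative; assumes an RDFGraph ``graph`` and SemanticModel ``sm``):
+        >>> mapper = SemanticMapper(match_threshold=0.8)
+        >>> results = mapper.map_to_semantic_model(graph, sm)
+        >>> suggestions = mapper.suggest_new_nodes(results)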
+ """
+
+ def __init__(
+ self,
+ match_threshold: float = 0.85,
+ use_embeddings: bool = False,
+ ):
+ """
+ Initialize the semantic mapper.
+
+ Args:
+ match_threshold: Minimum similarity score for a match (0.0-1.0).
+ use_embeddings: Whether to use embedding-based matching.
+ """
+ self.match_threshold = match_threshold
+ self.use_embeddings = use_embeddings
+
+ def _string_similarity(self, s1: str, s2: str) -> float:
+ """Calculate string similarity using SequenceMatcher."""
+ s1_lower = s1.lower().replace("_", " ")
+ s2_lower = s2.lower().replace("_", " ")
+ return SequenceMatcher(None, s1_lower, s2_lower).ratio()
+
+ def _find_best_match(
+ self,
+ entity: Entity,
+ candidates: List[Tuple[str, str]], # (table_name, column_name)
+ ) -> Optional[Tuple[str, str, float]]:
+ """
+ Find the best matching candidate for an entity.
+
+ Args:
+ entity: The entity to match.
+ candidates: List of (table_name, column_name) tuples to match against.
+
+ Returns:
+ Tuple of (table_name, column_name, confidence) or None if no match.
+ """
+ best_match = None
+ best_score = 0.0
+
+ for table_name, column_name in candidates:
+ # Compare against column name
+ score = self._string_similarity(entity.text, column_name)
+
+ # Also try the entity ID
+ id_score = self._string_similarity(entity.id, column_name)
+ score = max(score, id_score)
+
+ if score > best_score:
+ best_score = score
+ best_match = (table_name, column_name, score)
+
+ if best_match and best_match[2] >= self.match_threshold:
+ return best_match
+ return None
+
+ def map_to_semantic_model(
+ self,
+ rdf_graph: RDFGraph,
+ semantic_model: Any, # SemanticModel type
+ ) -> List[MappingResult]:
+ """
+ Map RDF graph entities to a SemanticModel.
+
+ Args:
+ rdf_graph: The RDF graph to map.
+ semantic_model: The target SemanticModel instance.
+
+ Returns:
+ List of MappingResult objects describing the mappings.
+ """
+ results = []
+
+ # Extract candidate columns from semantic model datasets
+ candidates = []
+ for dataset_name, dataset in semantic_model.datasets.items():
+ if hasattr(dataset, "source") and hasattr(dataset.source, "table"):
+ for col in dataset.source.table.columns:
+ candidates.append((dataset_name, col.name))
+
+ log.info(f"Mapping {len(rdf_graph.entities)} entities against {len(candidates)} candidates")
+
+ for entity in rdf_graph.entities:
+ match = self._find_best_match(entity, candidates)
+
+ if match:
+ table_name, column_name, confidence = match
+ result = MappingResult(
+ entity=entity,
+ matched_table=table_name,
+ matched_column=column_name,
+ confidence=confidence,
+ is_new=False,
+ )
+ log.debug(
+ f"Matched entity '{entity.id}' to {table_name}.{column_name} "
+ f"(confidence: {confidence:.2f})"
+ )
+ else:
+ result = MappingResult(
+ entity=entity,
+ is_new=True,
+ )
+ log.debug(f"No match found for entity '{entity.id}' - marked as new")
+
+ results.append(result)
+
+ matched = sum(1 for r in results if not r.is_new)
+ log.info(f"Mapping complete: {matched}/{len(results)} entities matched")
+
+ return results
+
+ def suggest_new_nodes(
+ self,
+ mapping_results: List[MappingResult],
+ ) -> List[Dict[str, Any]]:
+ """
+ Generate suggestions for new semantic nodes from unmapped entities.
+
+ Args:
+ mapping_results: Results from map_to_semantic_model.
+
+ Returns:
+ List of suggested new node definitions.
+ """
+ suggestions = []
+
+ for result in mapping_results:
+ if result.is_new:
+ suggestion = {
+ "suggested_name": result.entity.id,
+ "entity_type": result.entity.label,
+ "original_text": result.entity.text,
+ "suggested_table_name": f"text_extracted_{result.entity.label.lower()}",
+ }
+ suggestions.append(suggestion)
+
+ return suggestions
diff --git a/src/intugle/text_processor/models.py b/src/intugle/text_processor/models.py
new file mode 100644
index 0000000..7329af5
--- /dev/null
+++ b/src/intugle/text_processor/models.py
@@ -0,0 +1,124 @@
+"""
+Pydantic models for RDF triples and graph representation.
+
+Supports RDF-star annotations for provenance and confidence metadata.
+"""
+
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, Field
+
+
+class Entity(BaseModel):
+ """Represents an extracted entity from text."""
+
+ id: str = Field(..., description="Unique identifier for the entity")
+ text: str = Field(..., description="Original text of the entity")
+ label: str = Field(..., description="Entity type/label (e.g., PERSON, ORG, MONEY)")
+ start_char: Optional[int] = Field(None, description="Start character position in source text")
+ end_char: Optional[int] = Field(None, description="End character position in source text")
+ confidence: float = Field(1.0, description="Confidence score for the entity extraction")
+ attributes: Dict[str, Any] = Field(default_factory=dict, description="Additional entity attributes")
+
+
+class Relationship(BaseModel):
+ """Represents a relationship between two entities."""
+
+ subject_id: str = Field(..., description="ID of the subject entity")
+ predicate: str = Field(..., description="Relationship type/predicate")
+ object_id: str = Field(..., description="ID of the object entity")
+ confidence: float = Field(1.0, description="Confidence score for the relationship")
+
+
+class RDFTriple(BaseModel):
+ """
+ Represents an RDF triple (subject, predicate, object).
+
+ Supports RDF-star annotations via the metadata field for provenance,
+ confidence, and other contextual information.
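+
+    Example (illustrative; metadata carries the RDF-star style annotations):
+        >>> RDFTriple(
+        ...     subject="http://example.org/Invoice_123",
+        ...     predicate="http://example.org/hasAmount",
+        ...     object="5400",
+        ...     metadata={"confidence": 0.9, "source": "invoice.txt"},
+        ... ).to_turtle()
+        '<http://example.org/Invoice_123> <http://example.org/hasAmount> "5400" .'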
+ """
+
+ subject: str = Field(..., description="Subject of the triple")
+ predicate: str = Field(..., description="Predicate/relationship of the triple")
+ object: str = Field(..., description="Object of the triple")
+ object_type: str = Field("literal", description="Type of object: 'uri' or 'literal'")
+ metadata: Dict[str, Any] = Field(
+ default_factory=dict,
+ description="RDF-star annotations (provenance, confidence, source, etc.)",
+ )
+
+ def to_turtle(self) -> str:
+ """Convert triple to Turtle format."""
+ obj = f'"{self.object}"' if self.object_type == "literal" else f"<{self.object}>"
+ return f"<{self.subject}> <{self.predicate}> {obj} ."
+
+ def to_ntriples(self) -> str:
+ """Convert triple to N-Triples format."""
+ return self.to_turtle()
+
+
+class RDFGraph(BaseModel):
+ """
+ Represents a collection of RDF triples forming a graph.
+
+ Includes extracted entities, relationships, and the resulting triples.
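+
+    Example (illustrative; add_triple returns self, so calls can be chained):
+        >>> graph = RDFGraph().add_triple(
+        ...     "http://example.org/Invoice_123",
+        ...     "http://example.org/hasAmount",
+        ...     "5400",
+        ... )
+        >>> len(graph.triples)
+        1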
+ """
+
+ entities: List[Entity] = Field(default_factory=list, description="Extracted entities")
+ relationships: List[Relationship] = Field(default_factory=list, description="Extracted relationships")
+ triples: List[RDFTriple] = Field(default_factory=list, description="RDF triples")
+ source_text: Optional[str] = Field(None, description="Original source text")
+ metadata: Dict[str, Any] = Field(
+ default_factory=dict,
+ description="Graph-level metadata (processing info, model used, etc.)",
+ )
+
+ def add_triple(
+ self,
+ subject: str,
+ predicate: str,
+ obj: str,
+ object_type: str = "literal",
+ metadata: Optional[Dict[str, Any]] = None,
+ ) -> "RDFGraph":
+ """Add a triple to the graph."""
+ triple = RDFTriple(
+ subject=subject,
+ predicate=predicate,
+ object=obj,
+ object_type=object_type,
+ metadata=metadata or {},
+ )
+ self.triples.append(triple)
+ return self
+
+ def get_entity_by_id(self, entity_id: str) -> Optional[Entity]:
+ """Retrieve an entity by its ID."""
+ for entity in self.entities:
+ if entity.id == entity_id:
+ return entity
+ return None
+
+ def to_turtle(self) -> str:
+ """Export graph to Turtle format."""
+        lines = ["@prefix ex: <http://example.org/> .", ""]
+ for triple in self.triples:
+ lines.append(triple.to_turtle())
+ return "\n".join(lines)
+
+ def to_json_ld(self) -> Dict[str, Any]:
+ """Export graph to JSON-LD format."""
+ return {
+ "@context": {"ex": "http://example.org/"},
+ "@graph": [
+ {
+ "@id": t.subject,
+ t.predicate: {"@value": t.object} if t.object_type == "literal" else {"@id": t.object},
+ }
+ for t in self.triples
+ ],
+ }
+
+ def to_dict(self) -> Dict[str, Any]:
+ """Export graph as a dictionary."""
+ return self.model_dump()
diff --git a/src/intugle/text_processor/processor.py b/src/intugle/text_processor/processor.py
new file mode 100644
index 0000000..e457401
--- /dev/null
+++ b/src/intugle/text_processor/processor.py
@@ -0,0 +1,136 @@
+"""
+Main TextToSemanticProcessor class.
+
+High-level orchestrator for converting unstructured text to RDF graphs.
+"""
+
+import logging
+
+from typing import Any, Dict, Literal, Optional
+
+from intugle.text_processor.extractors.base import BaseExtractor
+from intugle.text_processor.extractors.llm_extractor import LLMExtractor
+from intugle.text_processor.models import Entity, RDFGraph, Relationship
+from intugle.text_processor.rdf.builder import RDFBuilder
+
+log = logging.getLogger(__name__)
+
+
+class TextToSemanticProcessor:
+ """
+ High-level orchestrator for converting unstructured text to RDF graphs.
+
+ Provides a simple API for text-to-semantic conversion as specified in the
+ feature requirements.
+
+ Example:
+ >>> processor = TextToSemanticProcessor(model="gpt-4o-mini", output_format="rdf_star")
+ >>> rdf_graph = processor.parse(text_input)
+ """
+
+ def __init__(
+ self,
+ model: str = "gpt-4o-mini",
+ output_format: Literal["rdf", "rdf_star"] = "rdf_star",
+ extractor: Optional[BaseExtractor] = None,
+ namespace: Optional[str] = None,
+ ):
+ """
+ Initialize the text processor.
+
+ Args:
+            model: Name of the LLM model for extraction (e.g., 'gpt-4o-mini').
+ output_format: Output format - 'rdf' for standard RDF, 'rdf_star' for RDF with annotations.
+ extractor: Optional custom extractor instance. If not provided, uses LLMExtractor.
+ namespace: Base namespace URI for generated entities.
+ """
+ self.model = model
+ self.output_format = output_format
+ self.namespace = namespace
+
+ # Use provided extractor or default to LLM-based
+ if extractor is not None:
+ self.extractor = extractor
+ else:
+ self.extractor = LLMExtractor(model_name=model)
+
+ # Configure RDF builder based on output format
+ include_provenance = output_format == "rdf_star"
+ self.rdf_builder = RDFBuilder(
+ namespace=namespace,
+ include_provenance=include_provenance,
+ )
+
+ log.info(f"TextToSemanticProcessor initialized with model={model}, format={output_format}")
+
+ def parse(self, text: str, metadata: Optional[Dict[str, Any]] = None) -> RDFGraph:
+ """
+ Parse unstructured text and convert to an RDF graph.
+
+ This is the main entry point for text-to-RDF conversion.
+
+ Args:
+ text: Input text to process (e.g., from OCR, documents, etc.).
+ metadata: Optional metadata to include in the graph (e.g., source document ID).
+
+ Returns:
+ RDFGraph containing extracted entities, relationships, and triples.
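+
+        Example (illustrative; assumes a configured LLM backend):
+            >>> graph = processor.parse(ocr_text, metadata={"source": "scan_001.pdf"})
+            >>> print(graph.to_turtle())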
+ """
+ log.info(f"Processing text of length {len(text)}")
+
+ # Extract entities and relationships
+ entities, relationships = self.extractor.extract(text)
+
+ # Build RDF graph
+ graph_metadata = metadata or {}
+ graph_metadata["model"] = self.model
+ graph_metadata["output_format"] = self.output_format
+
+ rdf_graph = self.rdf_builder.build(
+ entities=entities,
+ relationships=relationships,
+ source_text=text,
+ metadata=graph_metadata,
+ )
+
+ log.info(
+ f"Parsed text into RDF graph: {len(entities)} entities, "
+ f"{len(relationships)} relationships, {len(rdf_graph.triples)} triples"
+ )
+
+ return rdf_graph
+
+ def extract_entities(self, text: str) -> list[Entity]:
+ """
+ Extract only entities from text (no relationships).
+
+ Args:
+ text: Input text to process.
+
+ Returns:
+ List of extracted Entity objects.
+ """
+ return self.extractor.extract_entities(text)
+
+ def extract_relationships(
+ self, text: str, entities: list[Entity]
+ ) -> list[Relationship]:
+ """
+ Extract relationships between known entities.
+
+ Args:
+ text: Input text to process.
+ entities: Pre-extracted entities.
+
+ Returns:
+ List of extracted Relationship objects.
+ """
+ return self.extractor.extract_relationships(text, entities)
+
+ def export_turtle(self, rdf_graph: RDFGraph) -> str:
+ """Export RDF graph to Turtle format."""
+ return rdf_graph.to_turtle()
+
+ def export_json_ld(self, rdf_graph: RDFGraph) -> Dict[str, Any]:
+ """Export RDF graph to JSON-LD format."""
+ return rdf_graph.to_json_ld()
diff --git a/src/intugle/text_processor/rdf/__init__.py b/src/intugle/text_processor/rdf/__init__.py
new file mode 100644
index 0000000..6046453
--- /dev/null
+++ b/src/intugle/text_processor/rdf/__init__.py
@@ -0,0 +1,5 @@
+"""RDF subpackage for graph building and serialization."""
+
+from intugle.text_processor.rdf.builder import RDFBuilder
+
+__all__ = ["RDFBuilder"]
diff --git a/src/intugle/text_processor/rdf/builder.py b/src/intugle/text_processor/rdf/builder.py
new file mode 100644
index 0000000..38a136d
--- /dev/null
+++ b/src/intugle/text_processor/rdf/builder.py
@@ -0,0 +1,165 @@
+"""
+RDF Graph Builder.
+
+Constructs RDF triples from extracted entities and relationships.
+"""
+
+import logging
+
+from typing import Any, Dict, List, Optional
+
+from intugle.text_processor.models import Entity, RDFGraph, RDFTriple, Relationship
+
+log = logging.getLogger(__name__)
+
+
+class RDFBuilder:
+ """
+ Builds RDF graphs from extracted entities and relationships.
+
+ Supports configurable ontology prefixes and RDF-star annotations.
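+
+    Example (illustrative; two triples because type and label triples are
+    emitted for each entity by default):
+        >>> builder = RDFBuilder(namespace="http://example.org/")
+        >>> graph = builder.build(
+        ...     entities=[Entity(id="Invoice_123", text="Invoice 123", label="DOCUMENT")],
+        ...     relationships=[],
+        ... )
+        >>> len(graph.triples)
+        2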
+ """
+
+ DEFAULT_NAMESPACE = "http://intugle.ai/ontology/"
+
+ # Standard predicates for entity properties
+ PREDICATE_TYPE = "rdf:type"
+ PREDICATE_LABEL = "rdfs:label"
+ PREDICATE_VALUE = "hasValue"
+
+ def __init__(
+ self,
+ namespace: Optional[str] = None,
+ include_entity_triples: bool = True,
+ include_provenance: bool = True,
+ ):
+ """
+ Initialize the RDF builder.
+
+ Args:
+ namespace: Base namespace URI for generated entities.
+ include_entity_triples: Whether to generate type/label triples for entities.
+ include_provenance: Whether to include provenance metadata in triples.
+ """
+ self.namespace = namespace or self.DEFAULT_NAMESPACE
+ self.include_entity_triples = include_entity_triples
+ self.include_provenance = include_provenance
+
+ def _make_uri(self, local_name: str) -> str:
+ """Create a full URI from a local name."""
+ # Clean the local name for URI usage
+ clean_name = local_name.replace(" ", "_").replace("$", "").replace(",", "")
+ return f"{self.namespace}{clean_name}"
+
+ def _entity_to_triples(self, entity: Entity) -> List[RDFTriple]:
+ """Generate RDF triples for an entity."""
+ triples = []
+ entity_uri = self._make_uri(entity.id)
+
+ # Type triple
+ triples.append(
+ RDFTriple(
+ subject=entity_uri,
+ predicate=self._make_uri(self.PREDICATE_TYPE),
+ object=self._make_uri(entity.label),
+ object_type="uri",
+ metadata={"confidence": entity.confidence} if self.include_provenance else {},
+ )
+ )
+
+ # Label triple
+ triples.append(
+ RDFTriple(
+ subject=entity_uri,
+ predicate=self._make_uri(self.PREDICATE_LABEL),
+ object=entity.text,
+ object_type="literal",
+ metadata={"confidence": entity.confidence} if self.include_provenance else {},
+ )
+ )
+
+ # Additional attribute triples
+ for attr_key, attr_value in entity.attributes.items():
+ triples.append(
+ RDFTriple(
+ subject=entity_uri,
+ predicate=self._make_uri(attr_key),
+ object=str(attr_value),
+ object_type="literal",
+ )
+ )
+
+ return triples
+
+ def _relationship_to_triple(
+ self,
+ relationship: Relationship,
+ entities: Dict[str, Entity],
+ ) -> RDFTriple:
+ """Convert a relationship to an RDF triple."""
+ subject_uri = self._make_uri(relationship.subject_id)
+ predicate_uri = self._make_uri(relationship.predicate)
+
+ # Check if object is an entity or a literal
+ if relationship.object_id in entities:
+ object_value = self._make_uri(relationship.object_id)
+ object_type = "uri"
+ else:
+ # Treat as literal value
+ object_value = relationship.object_id
+ object_type = "literal"
+
+ metadata = {}
+ if self.include_provenance:
+ metadata["confidence"] = relationship.confidence
+
+ return RDFTriple(
+ subject=subject_uri,
+ predicate=predicate_uri,
+ object=object_value,
+ object_type=object_type,
+ metadata=metadata,
+ )
+
+ def build(
+ self,
+ entities: List[Entity],
+ relationships: List[Relationship],
+ source_text: Optional[str] = None,
+ metadata: Optional[Dict[str, Any]] = None,
+ ) -> RDFGraph:
+ """
+ Build an RDF graph from entities and relationships.
+
+ Args:
+ entities: List of extracted entities.
+ relationships: List of extracted relationships.
+ source_text: Original source text (for provenance).
+ metadata: Additional graph-level metadata.
+
+ Returns:
+ RDFGraph containing all generated triples.
+ """
+ triples = []
+ entity_map = {e.id: e for e in entities}
+
+ # Generate entity triples
+ if self.include_entity_triples:
+ for entity in entities:
+ triples.extend(self._entity_to_triples(entity))
+
+ # Generate relationship triples
+ for relationship in relationships:
+ triple = self._relationship_to_triple(relationship, entity_map)
+ triples.append(triple)
+
+ graph = RDFGraph(
+ entities=entities,
+ relationships=relationships,
+ triples=triples,
+ source_text=source_text,
+ metadata=metadata or {},
+ )
+
+ log.info(f"Built RDF graph with {len(triples)} triples")
+ return graph
diff --git a/tests/streamlit_app/test_helper.py b/tests/streamlit_app/test_helper.py
index 93d5d61..70eddab 100644
--- a/tests/streamlit_app/test_helper.py
+++ b/tests/streamlit_app/test_helper.py
@@ -1,14 +1,16 @@
"""Tests for streamlit_app/helper.py file reading functions."""
import io
-import pytest
-import pandas as pd
+
from unittest.mock import Mock
+import pandas as pd
+import pytest
+
from src.intugle.streamlit_app.helper import (
+ _read_bytes_to_df_core,
    read_bytes_to_df,
    read_file_to_df,
- _read_bytes_to_df_core,
)
diff --git a/tests/text_processor/__init__.py b/tests/text_processor/__init__.py
new file mode 100644
index 0000000..995b374
--- /dev/null
+++ b/tests/text_processor/__init__.py
@@ -0,0 +1 @@
+# Text processor tests
diff --git a/tests/text_processor/test_builder.py b/tests/text_processor/test_builder.py
new file mode 100644
index 0000000..bd8e2a3
--- /dev/null
+++ b/tests/text_processor/test_builder.py
@@ -0,0 +1,114 @@
+"""Tests for the RDF Builder."""
+
+from intugle.text_processor.models import Entity, Relationship
+from intugle.text_processor.rdf.builder import RDFBuilder
+
+
+class TestRDFBuilder:
+ def test_builder_initialization(self):
+ builder = RDFBuilder()
+ assert builder.namespace == RDFBuilder.DEFAULT_NAMESPACE
+ assert builder.include_entity_triples is True
+ assert builder.include_provenance is True
+
+ def test_builder_custom_namespace(self):
+ builder = RDFBuilder(namespace="http://custom.org/")
+ assert builder.namespace == "http://custom.org/"
+
+ def test_build_empty(self):
+ builder = RDFBuilder()
+ graph = builder.build(entities=[], relationships=[])
+ assert len(graph.triples) == 0
+ assert len(graph.entities) == 0
+
+ def test_build_with_entities(self):
+ builder = RDFBuilder()
+ entities = [
+ Entity(id="Invoice_123", text="Invoice 123", label="DOCUMENT"),
+ Entity(id="Vendor_A", text="Vendor A", label="ORGANIZATION"),
+ ]
+ graph = builder.build(entities=entities, relationships=[])
+
+ # Each entity should produce type and label triples
+ assert len(graph.triples) == 4 # 2 entities x 2 triples each
+ assert len(graph.entities) == 2
+
+ def test_build_with_relationships(self):
+ builder = RDFBuilder()
+ entities = [
+ Entity(id="Invoice_123", text="Invoice 123", label="DOCUMENT"),
+ Entity(id="Vendor_A", text="Vendor A", label="ORGANIZATION"),
+ ]
+ relationships = [
+ Relationship(
+ subject_id="Invoice_123",
+ predicate="issuedBy",
+ object_id="Vendor_A",
+ )
+ ]
+ graph = builder.build(entities=entities, relationships=relationships)
+
+ # 4 entity triples + 1 relationship triple
+ assert len(graph.triples) == 5
+ assert len(graph.relationships) == 1
+
+ def test_build_with_literal_object(self):
+ builder = RDFBuilder()
+ entities = [
+ Entity(id="Invoice_123", text="Invoice 123", label="DOCUMENT"),
+ ]
+ relationships = [
+ Relationship(
+ subject_id="Invoice_123",
+ predicate="hasAmount",
+ object_id="5400", # Not an entity ID, should be treated as literal
+ )
+ ]
+ graph = builder.build(entities=entities, relationships=relationships)
+
+ # Find the hasAmount triple
+ amount_triple = None
+ for t in graph.triples:
+ if "hasAmount" in t.predicate:
+ amount_triple = t
+ break
+
+ assert amount_triple is not None
+ assert amount_triple.object_type == "literal"
+
+ def test_build_without_entity_triples(self):
+ builder = RDFBuilder(include_entity_triples=False)
+ entities = [
+ Entity(id="Invoice_123", text="Invoice 123", label="DOCUMENT"),
+ ]
+ relationships = [
+ Relationship(
+ subject_id="Invoice_123",
+ predicate="hasAmount",
+ object_id="5400",
+ )
+ ]
+ graph = builder.build(entities=entities, relationships=relationships)
+
+ # Only the relationship triple, no entity type/label triples
+ assert len(graph.triples) == 1
+
+ def test_provenance_metadata(self):
+ builder = RDFBuilder(include_provenance=True)
+ entities = [
+ Entity(id="Test", text="Test", label="TEST", confidence=0.95),
+ ]
+ graph = builder.build(entities=entities, relationships=[])
+
+ # Check that confidence is in metadata
+ assert graph.triples[0].metadata.get("confidence") == 0.95
+
+ def test_no_provenance_metadata(self):
+ builder = RDFBuilder(include_provenance=False)
+ entities = [
+ Entity(id="Test", text="Test", label="TEST", confidence=0.95),
+ ]
+ graph = builder.build(entities=entities, relationships=[])
+
+ # Metadata should be empty
+ assert graph.triples[0].metadata == {}
diff --git a/tests/text_processor/test_mapper.py b/tests/text_processor/test_mapper.py
new file mode 100644
index 0000000..14dbf05
--- /dev/null
+++ b/tests/text_processor/test_mapper.py
@@ -0,0 +1,123 @@
+"""Tests for the SemanticMapper."""
+
+from unittest.mock import MagicMock
+
+from intugle.text_processor.mapper import MappingResult, SemanticMapper
+from intugle.text_processor.models import Entity, RDFGraph
+
+
+class TestSemanticMapper:
+ def test_mapper_initialization(self):
+ mapper = SemanticMapper()
+ assert mapper.match_threshold == 0.85
+ assert mapper.use_embeddings is False
+
+ def test_mapper_custom_threshold(self):
+ mapper = SemanticMapper(match_threshold=0.7)
+ assert mapper.match_threshold == 0.7
+
+ def test_string_similarity(self):
+ mapper = SemanticMapper()
+
+ # Exact match
+ assert mapper._string_similarity("invoice", "invoice") == 1.0
+
+ # Case insensitive
+ assert mapper._string_similarity("Invoice", "invoice") == 1.0
+
+ # Underscore vs space
+ assert mapper._string_similarity("invoice_id", "invoice id") == 1.0
+
+ # Partial match
+ similarity = mapper._string_similarity("invoice", "invoices")
+ assert 0.8 < similarity < 1.0
+
+ def test_map_to_semantic_model_no_match(self):
+ mapper = SemanticMapper(match_threshold=0.9)
+
+ entity = Entity(id="random_entity", text="Random Thing", label="OTHER")
+ graph = RDFGraph(entities=[entity])
+
+ # Mock semantic model with no matching columns
+ mock_model = MagicMock()
+ mock_dataset = MagicMock()
+ mock_column = MagicMock()
+ mock_column.name = "unrelated_column"
+ mock_dataset.source.table.columns = [mock_column]
+ mock_model.datasets = {"table1": mock_dataset}
+
+ results = mapper.map_to_semantic_model(graph, mock_model)
+
+ assert len(results) == 1
+ assert results[0].is_new is True
+ assert results[0].matched_table is None
+
+ def test_map_to_semantic_model_with_match(self):
+ mapper = SemanticMapper(match_threshold=0.8)
+
+ entity = Entity(id="customer_id", text="Customer ID", label="IDENTIFIER")
+ graph = RDFGraph(entities=[entity])
+
+ # Mock semantic model with matching column
+ mock_model = MagicMock()
+ mock_dataset = MagicMock()
+ mock_column = MagicMock()
+ mock_column.name = "customer_id"
+ mock_dataset.source.table.columns = [mock_column]
+ mock_model.datasets = {"customers": mock_dataset}
+
+ results = mapper.map_to_semantic_model(graph, mock_model)
+
+ assert len(results) == 1
+ assert results[0].is_new is False
+ assert results[0].matched_table == "customers"
+ assert results[0].matched_column == "customer_id"
+ assert results[0].confidence >= 0.8
+
+ def test_suggest_new_nodes(self):
+ mapper = SemanticMapper()
+
+ entity = Entity(id="new_concept", text="New Concept", label="CONCEPT")
+ result = MappingResult(entity=entity, is_new=True)
+
+ suggestions = mapper.suggest_new_nodes([result])
+
+ assert len(suggestions) == 1
+ assert suggestions[0]["suggested_name"] == "new_concept"
+ assert suggestions[0]["entity_type"] == "CONCEPT"
+
+ def test_suggest_new_nodes_filters_matched(self):
+ mapper = SemanticMapper()
+
+ matched_entity = Entity(id="matched", text="Matched", label="TEST")
+ new_entity = Entity(id="new", text="New", label="TEST")
+
+ results = [
+ MappingResult(entity=matched_entity, matched_table="t1", matched_column="c1", is_new=False),
+ MappingResult(entity=new_entity, is_new=True),
+ ]
+
+ suggestions = mapper.suggest_new_nodes(results)
+
+ # Only the new entity should be suggested
+ assert len(suggestions) == 1
+ assert suggestions[0]["suggested_name"] == "new"
+
+
+class TestMappingResult:
+ def test_to_dict(self):
+ entity = Entity(id="test_id", text="Test", label="TEST")
+ result = MappingResult(
+ entity=entity,
+ matched_table="table1",
+ matched_column="col1",
+ confidence=0.9,
+ is_new=False,
+ )
+
+ d = result.to_dict()
+
+ assert d["entity_id"] == "test_id"
+ assert d["matched_table"] == "table1"
+ assert d["confidence"] == 0.9
+ assert d["is_new"] is False
diff --git a/tests/text_processor/test_models.py b/tests/text_processor/test_models.py
new file mode 100644
index 0000000..d86716e
--- /dev/null
+++ b/tests/text_processor/test_models.py
@@ -0,0 +1,117 @@
+"""Tests for text processor Pydantic models."""
+
+from intugle.text_processor.models import Entity, RDFGraph, RDFTriple, Relationship
+
+
+class TestEntity:
+ def test_entity_creation(self):
+ entity = Entity(id="Invoice_123", text="Invoice 123", label="DOCUMENT")
+ assert entity.id == "Invoice_123"
+ assert entity.text == "Invoice 123"
+ assert entity.label == "DOCUMENT"
+ assert entity.confidence == 1.0
+
+ def test_entity_with_attributes(self):
+ entity = Entity(
+ id="Amount_5400",
+ text="$5,400",
+ label="MONEY",
+ attributes={"currency": "USD", "value": 5400},
+ )
+ assert entity.attributes["currency"] == "USD"
+ assert entity.attributes["value"] == 5400
+
+
+class TestRelationship:
+ def test_relationship_creation(self):
+ rel = Relationship(
+ subject_id="Invoice_123",
+ predicate="hasAmount",
+ object_id="Amount_5400",
+ )
+ assert rel.subject_id == "Invoice_123"
+ assert rel.predicate == "hasAmount"
+ assert rel.object_id == "Amount_5400"
+ assert rel.confidence == 1.0
+
+
+class TestRDFTriple:
+ def test_triple_creation(self):
+ triple = RDFTriple(
+ subject="http://example.org/Invoice_123",
+ predicate="http://example.org/hasAmount",
+ object="5400",
+ object_type="literal",
+ )
+ assert triple.subject == "http://example.org/Invoice_123"
+ assert triple.object_type == "literal"
+
+ def test_triple_to_turtle(self):
+ triple = RDFTriple(
+ subject="http://example.org/Invoice_123",
+ predicate="http://example.org/hasAmount",
+ object="5400",
+ object_type="literal",
+ )
+ turtle = triple.to_turtle()
+ assert "" in turtle
+ assert '"5400"' in turtle
+
+ def test_triple_with_uri_object(self):
+ triple = RDFTriple(
+ subject="http://example.org/Invoice_123",
+ predicate="http://example.org/issuedBy",
+ object="http://example.org/Vendor_A",
+ object_type="uri",
+ )
+ turtle = triple.to_turtle()
+ assert "" in turtle
+
+
+class TestRDFGraph:
+ def test_graph_creation(self):
+ graph = RDFGraph()
+ assert len(graph.entities) == 0
+ assert len(graph.triples) == 0
+
+ def test_add_triple(self):
+ graph = RDFGraph()
+ graph.add_triple(
+ subject="http://example.org/Invoice_123",
+ predicate="http://example.org/hasAmount",
+ obj="5400",
+ )
+ assert len(graph.triples) == 1
+
+ def test_to_turtle(self):
+ graph = RDFGraph()
+ graph.add_triple(
+ subject="http://example.org/Invoice_123",
+ predicate="http://example.org/hasAmount",
+ obj="5400",
+ )
+ turtle = graph.to_turtle()
+ assert "@prefix" in turtle
+ assert "http://example.org/Invoice_123" in turtle
+
+ def test_to_json_ld(self):
+ graph = RDFGraph()
+ graph.add_triple(
+ subject="http://example.org/Invoice_123",
+ predicate="http://example.org/hasAmount",
+ obj="5400",
+ )
+ json_ld = graph.to_json_ld()
+ assert "@context" in json_ld
+ assert "@graph" in json_ld
+ assert len(json_ld["@graph"]) == 1
+
+ def test_get_entity_by_id(self):
+ entity = Entity(id="test_id", text="Test", label="TEST")
+ graph = RDFGraph(entities=[entity])
+ found = graph.get_entity_by_id("test_id")
+ assert found is not None
+ assert found.text == "Test"
+
+ not_found = graph.get_entity_by_id("nonexistent")
+ assert not_found is None
diff --git a/tests/text_processor/test_processor.py b/tests/text_processor/test_processor.py
new file mode 100644
index 0000000..aaa55e9
--- /dev/null
+++ b/tests/text_processor/test_processor.py
@@ -0,0 +1,84 @@
+"""Tests for the TextToSemanticProcessor."""
+
+from unittest.mock import MagicMock
+
+from intugle.text_processor.models import Entity, RDFGraph, Relationship
+from intugle.text_processor.processor import TextToSemanticProcessor
+
+
+class TestTextToSemanticProcessor:
+ def test_processor_initialization(self):
+ processor = TextToSemanticProcessor()
+ assert processor.model == "gpt-4o-mini"
+ assert processor.output_format == "rdf_star"
+
+ def test_processor_custom_config(self):
+ processor = TextToSemanticProcessor(
+ model="gpt-4",
+ output_format="rdf",
+ namespace="http://custom.org/",
+ )
+ assert processor.model == "gpt-4"
+ assert processor.output_format == "rdf"
+ assert processor.rdf_builder.namespace == "http://custom.org/"
+
+ def test_processor_with_mock_extractor(self):
+ # Create mock extractor
+ mock_extractor = MagicMock()
+ mock_extractor.extract.return_value = (
+ [Entity(id="Invoice_123", text="Invoice 123", label="DOCUMENT")],
+ [Relationship(subject_id="Invoice_123", predicate="hasAmount", object_id="5400")],
+ )
+
+ processor = TextToSemanticProcessor(extractor=mock_extractor)
+
+ result = processor.parse("Invoice 123 for $5,400")
+
+ mock_extractor.extract.assert_called_once()
+ assert isinstance(result, RDFGraph)
+ assert len(result.entities) == 1
+ assert len(result.relationships) == 1
+
+ def test_export_turtle(self):
+ mock_extractor = MagicMock()
+ mock_extractor.extract.return_value = (
+ [Entity(id="Test", text="Test", label="TEST")],
+ [],
+ )
+
+ processor = TextToSemanticProcessor(extractor=mock_extractor)
+ graph = processor.parse("Test text")
+
+ turtle = processor.export_turtle(graph)
+ assert "@prefix" in turtle
+ assert "Test" in turtle
+
+ def test_export_json_ld(self):
+ mock_extractor = MagicMock()
+ mock_extractor.extract.return_value = (
+ [Entity(id="Test", text="Test", label="TEST")],
+ [],
+ )
+
+ processor = TextToSemanticProcessor(extractor=mock_extractor)
+ graph = processor.parse("Test text")
+
+ json_ld = processor.export_json_ld(graph)
+ assert "@context" in json_ld
+ assert "@graph" in json_ld
+
+ def test_metadata_in_graph(self):
+ mock_extractor = MagicMock()
+ mock_extractor.extract.return_value = ([], [])
+
+ processor = TextToSemanticProcessor(
+ model="test-model",
+ output_format="rdf_star",
+ extractor=mock_extractor,
+ )
+
+ graph = processor.parse("Test", metadata={"source": "test.txt"})
+
+ assert graph.metadata["model"] == "test-model"
+ assert graph.metadata["output_format"] == "rdf_star"
+ assert graph.metadata["source"] == "test.txt"