1 change: 1 addition & 0 deletions pyproject.toml
@@ -94,6 +94,7 @@ streamlit = [
"Bug Tracker" = "https://github.com/Intugle/data-tools/issues"

[project.scripts]
intugle = "intugle.cli:main"
intugle-mcp = "intugle.mcp.server:main"
intugle-streamlit = "intugle.cli:run_streamlit_app"

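The new intugle entry point sits alongside the existing intugle-mcp and intugle-streamlit scripts. Once the package is installed, the subcommands defined in src/intugle/cli.py below can be invoked as, for example, intugle text-to-semantic --input notes.txt (an illustrative invocation).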
11 changes: 11 additions & 0 deletions src/intugle/__init__.py
@@ -1,3 +1,14 @@
from intugle.analysis.models import DataSet as DataSet
from intugle.data_product import DataProduct as DataProduct
from intugle.semantic_model import SemanticModel as SemanticModel

__all__ = ["DataSet", "DataProduct", "SemanticModel"]

# Expose text processor for unstructured text-to-semantic conversion
try:
from intugle.text_processor import TextToSemanticProcessor # noqa: F401

__all__.append("TextToSemanticProcessor")
except ImportError:
# Text processor dependencies might not be available
pass
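
Downstream code can feature-detect the optional export instead of repeating the try/except. A minimal sketch, relying only on the __all__ handling above:

    import intugle

    # TextToSemanticProcessor is exported only when its optional
    # dependencies are installed, per the guarded import above.
    if "TextToSemanticProcessor" in intugle.__all__:
        processor = intugle.TextToSemanticProcessor()
    else:
        processor = None  # text-processing features unavailable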
118 changes: 116 additions & 2 deletions src/intugle/cli.py
@@ -1,6 +1,16 @@
import argparse
import importlib.util
import json
import logging
import os
import subprocess
import sys

# Setup basic logging for CLI commands
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def run_streamlit_app():
@@ -30,7 +40,7 @@ def run_streamlit_app():
# Get the absolute path to the main.py of the Streamlit app
app_dir = os.path.join(os.path.dirname(__file__), 'streamlit_app')
app_path = os.path.join(app_dir, 'main.py')

# Ensure the app_path exists
if not os.path.exists(app_path):
print(f"Error: Streamlit app not found at {app_path}")
@@ -41,5 +51,109 @@ def run_streamlit_app():
subprocess.run(["streamlit", "run", app_path], cwd=app_dir)


def run_text_to_semantic(args):
"""Execute the text-to-semantic conversion command."""
try:
from intugle.text_processor import TextToSemanticProcessor
except ImportError as e:
logger.error("Text processor not available.")
logger.error(f"Error: {e}")
sys.exit(1)

try:
# Read input text
if args.input == "-":
text = sys.stdin.read()
else:
with open(args.input, "r", encoding="utf-8") as f:
text = f.read()

logger.info(f"Processing text of length {len(text)} characters")

# Initialize processor
processor = TextToSemanticProcessor(
model=args.model,
output_format=args.format,
)

# Parse text to RDF
logger.info("Extracting entities and relationships...")
rdf_graph = processor.parse(text)

logger.info(
f"Extracted {len(rdf_graph.entities)} entities, "
f"{len(rdf_graph.relationships)} relationships, "
f"{len(rdf_graph.triples)} triples"
)

# Output results
if args.output_format == "turtle":
output = rdf_graph.to_turtle()
elif args.output_format == "json-ld":
output = json.dumps(rdf_graph.to_json_ld(), indent=2)
else: # json
output = json.dumps(rdf_graph.to_dict(), indent=2)

if args.output:
os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
with open(args.output, "w", encoding="utf-8") as f:
f.write(output)
logger.info(f"Output written to: {args.output}")
else:
print(output)

logger.info("Text-to-semantic conversion complete.")

except Exception as e:
logger.error(f"Job failed: {str(e)}")
sys.exit(1)


def main():
"""Main entry point for the intugle CLI."""
parser = argparse.ArgumentParser(
description="Intugle - GenAI-powered semantic layer toolkit"
)
subparsers = parser.add_subparsers(dest="command", help="Available commands")

# Streamlit command
subparsers.add_parser("streamlit", help="Launch the Streamlit web application")

# Text-to-semantic command
text_parser = subparsers.add_parser(
"text-to-semantic", help="Convert unstructured text to RDF/semantic triples"
)
text_parser.add_argument(
"--input", "-i", required=True,
help="Input text file path (use '-' for stdin)"
)
text_parser.add_argument(
"--output", "-o",
help="Output file path (prints to stdout if not specified)"
)
text_parser.add_argument(
"--model", "-m", default="gpt-4o-mini",
help="LLM model for extraction (default: gpt-4o-mini)"
)
text_parser.add_argument(
"--format", "-f", choices=["rdf", "rdf_star"], default="rdf_star",
help="RDF format: 'rdf' or 'rdf_star' (default: rdf_star)"
)
text_parser.add_argument(
"--output-format", choices=["json", "turtle", "json-ld"], default="json",
help="Output format: json, turtle, or json-ld (default: json)"
)

args = parser.parse_args()

if args.command == "streamlit":
run_streamlit_app()
elif args.command == "text-to-semantic":
run_text_to_semantic(args)
else:
parser.print_help()


if __name__ == "__main__":
-    run_streamlit_app()
+    main()
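
For reference, the same pipeline the CLI wraps can be driven directly from Python. A sketch, assuming the optional text-processor dependencies are installed (the sample sentence is illustrative):

    import json

    from intugle.text_processor import TextToSemanticProcessor

    # Mirrors `intugle text-to-semantic` with its default flags.
    processor = TextToSemanticProcessor(model="gpt-4o-mini", output_format="rdf_star")
    graph = processor.parse("Acme Corp acquired Widget Inc in 2021.")
    print(json.dumps(graph.to_dict(), indent=2))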

49 changes: 49 additions & 0 deletions src/intugle/semantic_model.py
@@ -195,6 +195,55 @@ def search(self, query: str):
log.error(f"Could not perform semantic search: {e}")
raise e

def overlay(self, rdf_graph: Any, match_threshold: float = 0.85) -> "SemanticModel":
"""
Overlay an RDF graph from unstructured text onto this semantic model.

Maps extracted entities and relationships from the RDF graph to existing
semantic nodes, enabling integration of text-derived knowledge.

Args:
rdf_graph: An RDFGraph instance from TextToSemanticProcessor.
match_threshold: Minimum similarity score for entity matching (0.0-1.0).

Returns:
Self for method chaining.
"""
from intugle.text_processor.mapper import SemanticMapper

console.print(
f"Overlaying RDF graph with {len(rdf_graph.entities)} entities...",
style="yellow",
)

mapper = SemanticMapper(match_threshold=match_threshold)
mapping_results = mapper.map_to_semantic_model(rdf_graph, self)

# Store mapping results for later use
if not hasattr(self, "_text_mappings"):
self._text_mappings = []
self._text_mappings.extend(mapping_results)

# Generate suggestions for new nodes
new_suggestions = mapper.suggest_new_nodes(mapping_results)
if new_suggestions:
console.print(
f"Found {len(new_suggestions)} unmapped entities that could be new concepts.",
style="yellow",
)

matched = sum(1 for r in mapping_results if not r.is_new)
console.print(
f"Overlay complete: {matched}/{len(mapping_results)} entities matched to existing nodes.",
style="bold green",
)

return self

def get_text_mappings(self) -> list:
"""Get all text-to-semantic mappings from overlay operations."""
return getattr(self, "_text_mappings", [])

def deploy(self, target: str, **kwargs):
"""
Deploys the semantic model to a specified target platform based on the persisted YAML files.
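End to end, the overlay is meant to compose with the processor roughly as follows. This is a sketch: the SemanticModel construction is elided, the sample text is a placeholder, and is_new comes from the mapping results used above:

    from intugle import SemanticModel
    from intugle.text_processor import TextToSemanticProcessor

    graph = TextToSemanticProcessor().parse(
        "Customers place orders; each order ships to one address."
    )

    model = SemanticModel(...)  # construction elided; built as elsewhere in the project
    model.overlay(graph, match_threshold=0.85)

    # Inspect which extracted entities matched existing nodes.
    for result in model.get_text_mappings():
        print(result.is_new)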
17 changes: 17 additions & 0 deletions src/intugle/text_processor/__init__.py
@@ -0,0 +1,17 @@
"""
Unstructured Text Processor Module.

Provides functionality to convert unstructured text into RDF triples
and map them to the existing Semantic Model.
"""

from intugle.text_processor.models import Entity, RDFGraph, RDFTriple, Relationship
from intugle.text_processor.processor import TextToSemanticProcessor

__all__ = [
"TextToSemanticProcessor",
"RDFTriple",
"RDFGraph",
"Entity",
"Relationship",
]
6 changes: 6 additions & 0 deletions src/intugle/text_processor/extractors/__init__.py
@@ -0,0 +1,6 @@
"""Extractors subpackage for NLP backends."""

from intugle.text_processor.extractors.base import BaseExtractor
from intugle.text_processor.extractors.llm_extractor import LLMExtractor

__all__ = ["BaseExtractor", "LLMExtractor"]
57 changes: 57 additions & 0 deletions src/intugle/text_processor/extractors/base.py
@@ -0,0 +1,57 @@
"""Base extractor interface for pluggable NLP backends."""

from abc import ABC, abstractmethod
from typing import List, Tuple

from intugle.text_processor.models import Entity, Relationship


class BaseExtractor(ABC):
"""
Abstract base class for text extractors.

Implementations should extract entities and relationships from text.
"""

@abstractmethod
def extract_entities(self, text: str) -> List[Entity]:
"""
Extract named entities from text.

Args:
text: Input text to process.

Returns:
List of extracted Entity objects.
"""
pass

@abstractmethod
def extract_relationships(
self, text: str, entities: List[Entity]
) -> List[Relationship]:
"""
Extract relationships between entities.

Args:
text: Input text to process.
entities: Previously extracted entities.

Returns:
List of extracted Relationship objects.
"""
pass

def extract(self, text: str) -> Tuple[List[Entity], List[Relationship]]:
"""
Full extraction pipeline: entities then relationships.

Args:
text: Input text to process.

Returns:
Tuple of (entities, relationships).
"""
entities = self.extract_entities(text)
relationships = self.extract_relationships(text, entities)
return entities, relationships
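
To illustrate the plug-in surface, a hypothetical toy backend against this interface might look like the following. It is not part of this PR, and it assumes Entity and Relationship accept a name field, which the real models may spell differently:

    from typing import List

    from intugle.text_processor.extractors.base import BaseExtractor
    from intugle.text_processor.models import Entity, Relationship


    class TitleCaseExtractor(BaseExtractor):
        """Toy backend: TitleCase tokens become entities; no relationships."""

        def extract_entities(self, text: str) -> List[Entity]:
            names = sorted({tok.strip(".,") for tok in text.split() if tok.istitle()})
            return [Entity(name=name) for name in names]  # field name assumed

        def extract_relationships(
            self, text: str, entities: List[Entity]
        ) -> List[Relationship]:
            return []  # a real backend would relate entity pairs here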