1 change: 1 addition & 0 deletions pyproject.toml
@@ -94,6 +94,7 @@ streamlit = [
"Bug Tracker" = "https://github.com/Intugle/data-tools/issues"

[project.scripts]
intugle = "intugle.cli:main"
intugle-mcp = "intugle.mcp.server:main"
intugle-streamlit = "intugle.cli:run_streamlit_app"

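The new intugle entry point sits alongside the existing intugle-mcp and intugle-streamlit scripts. Once the package is installed, the subcommands defined in src/intugle/cli.py below can be invoked as, for example, intugle text-to-semantic --input notes.txt (an illustrative invocation).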
11 changes: 11 additions & 0 deletions src/intugle/__init__.py
@@ -1,3 +1,14 @@
from intugle.analysis.models import DataSet as DataSet
from intugle.data_product import DataProduct as DataProduct
from intugle.semantic_model import SemanticModel as SemanticModel

__all__ = ["DataSet", "DataProduct", "SemanticModel"]

# Expose text processor for unstructured text-to-semantic conversion
try:
from intugle.text_processor import TextToSemanticProcessor # noqa: F401

__all__.append("TextToSemanticProcessor")
except ImportError:
# Text processor dependencies might not be available
pass
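
Downstream code can feature-detect the optional export instead of repeating the try/except. A minimal sketch, relying only on the __all__ handling above:

    import intugle

    # TextToSemanticProcessor is exported only when its optional
    # dependencies are installed, per the guarded import above.
    if "TextToSemanticProcessor" in intugle.__all__:
        processor = intugle.TextToSemanticProcessor()
    else:
        processor = None  # text-processing features unavailable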
118 changes: 116 additions & 2 deletions src/intugle/cli.py
@@ -1,6 +1,16 @@
import argparse
import importlib.util
import json
import logging
import os
import subprocess
import sys

# Setup basic logging for CLI commands
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def run_streamlit_app():
@@ -30,7 +40,7 @@ def run_streamlit_app():
# Get the absolute path to the main.py of the Streamlit app
app_dir = os.path.join(os.path.dirname(__file__), 'streamlit_app')
app_path = os.path.join(app_dir, 'main.py')

# Ensure the app_path exists
if not os.path.exists(app_path):
print(f"Error: Streamlit app not found at {app_path}")
@@ -41,5 +51,109 @@ def run_streamlit_app():
subprocess.run(["streamlit", "run", app_path], cwd=app_dir)


def run_text_to_semantic(args):
"""Execute the text-to-semantic conversion command."""
try:
from intugle.text_processor import TextToSemanticProcessor
except ImportError as e:
logger.error("Text processor not available.")
logger.error(f"Error: {e}")
sys.exit(1)

try:
# Read input text
if args.input == "-":
text = sys.stdin.read()
else:
with open(args.input, "r", encoding="utf-8") as f:
text = f.read()

logger.info(f"Processing text of length {len(text)} characters")

# Initialize processor
processor = TextToSemanticProcessor(
model=args.model,
output_format=args.format,
)

# Parse text to RDF
logger.info("Extracting entities and relationships...")
rdf_graph = processor.parse(text)

logger.info(
f"Extracted {len(rdf_graph.entities)} entities, "
f"{len(rdf_graph.relationships)} relationships, "
f"{len(rdf_graph.triples)} triples"
)

# Output results
if args.output_format == "turtle":
output = rdf_graph.to_turtle()
elif args.output_format == "json-ld":
output = json.dumps(rdf_graph.to_json_ld(), indent=2)
else: # json
output = json.dumps(rdf_graph.to_dict(), indent=2)

if args.output:
os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
with open(args.output, "w", encoding="utf-8") as f:
f.write(output)
logger.info(f"Output written to: {args.output}")
else:
print(output)

logger.info("Text-to-semantic conversion complete.")

except Exception as e:
logger.error(f"Job failed: {str(e)}")
sys.exit(1)


def main():
"""Main entry point for the intugle CLI."""
parser = argparse.ArgumentParser(
description="Intugle - GenAI-powered semantic layer toolkit"
)
subparsers = parser.add_subparsers(dest="command", help="Available commands")

# Streamlit command
subparsers.add_parser("streamlit", help="Launch the Streamlit web application")

# Text-to-semantic command
text_parser = subparsers.add_parser(
"text-to-semantic", help="Convert unstructured text to RDF/semantic triples"
)
text_parser.add_argument(
"--input", "-i", required=True,
help="Input text file path (use '-' for stdin)"
)
text_parser.add_argument(
"--output", "-o",
help="Output file path (prints to stdout if not specified)"
)
text_parser.add_argument(
"--model", "-m", default="gpt-4o-mini",
help="LLM model for extraction (default: gpt-4o-mini)"
)
text_parser.add_argument(
"--format", "-f", choices=["rdf", "rdf_star"], default="rdf_star",
help="RDF format: 'rdf' or 'rdf_star' (default: rdf_star)"
)
text_parser.add_argument(
"--output-format", choices=["json", "turtle", "json-ld"], default="json",
help="Output format: json, turtle, or json-ld (default: json)"
)

args = parser.parse_args()

if args.command == "streamlit":
run_streamlit_app()
elif args.command == "text-to-semantic":
run_text_to_semantic(args)
else:
parser.print_help()


if __name__ == "__main__":
-    run_streamlit_app()
+    main()
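
For reference, the same pipeline the CLI wraps can be driven directly from Python. A sketch, assuming the optional text-processor dependencies are installed (the sample sentence is illustrative):

    import json

    from intugle.text_processor import TextToSemanticProcessor

    # Mirrors `intugle text-to-semantic` with its default flags.
    processor = TextToSemanticProcessor(model="gpt-4o-mini", output_format="rdf_star")
    graph = processor.parse("Acme Corp acquired Widget Inc in 2021.")
    print(json.dumps(graph.to_dict(), indent=2))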

49 changes: 49 additions & 0 deletions src/intugle/semantic_model.py
@@ -195,6 +195,55 @@ def search(self, query: str):
log.error(f"Could not perform semantic search: {e}")
raise e

def overlay(self, rdf_graph: Any, match_threshold: float = 0.85) -> "SemanticModel":
"""
Overlay an RDF graph from unstructured text onto this semantic model.

Maps extracted entities and relationships from the RDF graph to existing
semantic nodes, enabling integration of text-derived knowledge.

Args:
rdf_graph: An RDFGraph instance from TextToSemanticProcessor.
match_threshold: Minimum similarity score for entity matching (0.0-1.0).

Returns:
Self for method chaining.
"""
from intugle.text_processor.mapper import SemanticMapper

console.print(
f"Overlaying RDF graph with {len(rdf_graph.entities)} entities...",
style="yellow",
)

mapper = SemanticMapper(match_threshold=match_threshold)
mapping_results = mapper.map_to_semantic_model(rdf_graph, self)

# Store mapping results for later use
if not hasattr(self, "_text_mappings"):
self._text_mappings = []
self._text_mappings.extend(mapping_results)

# Generate suggestions for new nodes
new_suggestions = mapper.suggest_new_nodes(mapping_results)
if new_suggestions:
console.print(
f"Found {len(new_suggestions)} unmapped entities that could be new concepts.",
style="yellow",
)

matched = sum(1 for r in mapping_results if not r.is_new)
console.print(
f"Overlay complete: {matched}/{len(mapping_results)} entities matched to existing nodes.",
style="bold green",
)

return self

def get_text_mappings(self) -> list:
"""Get all text-to-semantic mappings from overlay operations."""
return getattr(self, "_text_mappings", [])

def deploy(self, target: str, **kwargs):
"""
Deploys the semantic model to a specified target platform based on the persisted YAML files.
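End to end, the overlay is meant to compose with the processor roughly as follows. This is a sketch: the SemanticModel construction is elided, the sample text is a placeholder, and is_new comes from the mapping results used above:

    from intugle import SemanticModel
    from intugle.text_processor import TextToSemanticProcessor

    graph = TextToSemanticProcessor().parse(
        "Customers place orders; each order ships to one address."
    )

    model = SemanticModel(...)  # construction elided; built as elsewhere in the project
    model.overlay(graph, match_threshold=0.85)

    # Inspect which extracted entities matched existing nodes.
    for result in model.get_text_mappings():
        print(result.is_new)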
17 changes: 17 additions & 0 deletions src/intugle/text_processor/__init__.py
@@ -0,0 +1,17 @@
"""
Unstructured Text Processor Module.

Provides functionality to convert unstructured text into RDF triples
and map them to the existing Semantic Model.
"""

from intugle.text_processor.models import Entity, RDFGraph, RDFTriple, Relationship
from intugle.text_processor.processor import TextToSemanticProcessor

__all__ = [
"TextToSemanticProcessor",
"RDFTriple",
"RDFGraph",
"Entity",
"Relationship",
]
6 changes: 6 additions & 0 deletions src/intugle/text_processor/extractors/__init__.py
@@ -0,0 +1,6 @@
"""Extractors subpackage for NLP backends."""

from intugle.text_processor.extractors.base import BaseExtractor
from intugle.text_processor.extractors.llm_extractor import LLMExtractor

__all__ = ["BaseExtractor", "LLMExtractor"]
57 changes: 57 additions & 0 deletions src/intugle/text_processor/extractors/base.py
@@ -0,0 +1,57 @@
"""Base extractor interface for pluggable NLP backends."""

from abc import ABC, abstractmethod
from typing import List, Tuple

from intugle.text_processor.models import Entity, Relationship


class BaseExtractor(ABC):
"""
Abstract base class for text extractors.

Implementations should extract entities and relationships from text.
"""

@abstractmethod
def extract_entities(self, text: str) -> List[Entity]:
"""
Extract named entities from text.

Args:
text: Input text to process.

Returns:
List of extracted Entity objects.
"""
pass

@abstractmethod
def extract_relationships(
self, text: str, entities: List[Entity]
) -> List[Relationship]:
"""
Extract relationships between entities.

Args:
text: Input text to process.
entities: Previously extracted entities.

Returns:
List of extracted Relationship objects.
"""
pass

def extract(self, text: str) -> Tuple[List[Entity], List[Relationship]]:
"""
Full extraction pipeline: entities then relationships.

Args:
text: Input text to process.

Returns:
Tuple of (entities, relationships).
"""
entities = self.extract_entities(text)
relationships = self.extract_relationships(text, entities)
return entities, relationships
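
To illustrate the plug-in surface, a hypothetical toy backend against this interface might look like the following. It is not part of this PR, and it assumes Entity and Relationship accept a name field, which the real models may spell differently:

    from typing import List

    from intugle.text_processor.extractors.base import BaseExtractor
    from intugle.text_processor.models import Entity, Relationship


    class TitleCaseExtractor(BaseExtractor):
        """Toy backend: TitleCase tokens become entities; no relationships."""

        def extract_entities(self, text: str) -> List[Entity]:
            names = sorted({tok.strip(".,") for tok in text.split() if tok.istitle()})
            return [Entity(name=name) for name in names]  # field name assumed

        def extract_relationships(
            self, text: str, entities: List[Entity]
        ) -> List[Relationship]:
            return []  # a real backend would relate entity pairs here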